├── img └── list-2.0.png ├── .gitignore ├── .github └── workflows │ └── rust.yml ├── Cargo.toml ├── src ├── main.rs ├── medias │ ├── mod.rs │ ├── ops.rs │ └── naming.rs ├── utils │ ├── running.rs │ ├── mod.rs │ └── natural.rs ├── entries │ ├── input.rs │ ├── mod.rs │ ├── filter.rs │ └── entry.rs ├── commands.rs └── commands │ ├── list.rs │ ├── rename.rs │ ├── probe.rs │ ├── rebuild.rs │ ├── join.rs │ └── dupes.rs ├── LICENSE ├── CHANGELOG.md ├── Cargo.lock └── README.md /img/list-2.0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsalmei/refine/HEAD/img/list-2.0.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Generated by Cargo 2 | # will have compiled files and executables 3 | /target/ 4 | 5 | # These are backup files generated by rustfmt 6 | **/*.rs.bk 7 | -------------------------------------------------------------------------------- /.github/workflows/rust.yml: -------------------------------------------------------------------------------- 1 | name: Rust 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | 12 | jobs: 13 | main: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: actions/checkout@v4 17 | - name: Build 18 | run: cargo build --verbose 19 | - name: Run tests 20 | run: cargo test --verbose 21 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "refine" 3 | version = "3.0.0" 4 | edition = "2024" 5 | 6 | authors = ["Rogério Sampaio de Almeida "] 7 | description = "Refine your file collections using Rust!" 
8 | keywords = ["files", "deduplicate", "rename", "batch", "scan"] 9 | categories = ["command-line-utilities", "filesystem"] 10 | documentation = "https://docs.rs/refine/" 11 | repository = "https://github.com/rsalmei/refine" 12 | readme = "README.md" 13 | license = "MIT" 14 | 15 | [dependencies] 16 | human-repr = { version = "1", features = [] } 17 | clap = { version = "4", features = ["derive"] } 18 | anyhow = "1" 19 | regex = "1.11" 20 | ctrlc = "3" 21 | ureq = "3" 22 | yansi = { version = "1" } 23 | dirs = "6" 24 | mime_guess = "2.0" 25 | strsim = "0.11" 26 | rayon = "1.10" 27 | deunicode = "1.6" 28 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | mod commands; 2 | mod entries; 3 | mod medias; 4 | mod utils; 5 | 6 | use anyhow::Result; 7 | use clap::Parser; 8 | use commands::Command; 9 | use entries::Input; 10 | 11 | #[derive(Debug, Parser)] 12 | #[command(version, about, long_about = None, after_help = "For more information, see https://github.com/rsalmei/refine", 13 | override_usage = "refine [DIRS]... 
[FETCH] [OPTIONS]", 14 | )] 15 | pub struct Args { 16 | #[command(subcommand)] 17 | cmd: Command, 18 | #[command(flatten)] 19 | input: Input, 20 | } 21 | 22 | fn main() -> Result<()> { 23 | utils::install_ctrl_c_handler(); 24 | 25 | println!("Refine v{}", env!("CARGO_PKG_VERSION")); 26 | let args = Args::parse(); 27 | let effective = args.input.try_into()?; 28 | args.cmd.execute(effective) 29 | } 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Rogério Sampaio de Almeida 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /src/medias/mod.rs: -------------------------------------------------------------------------------- 1 | mod naming; 2 | mod ops; 3 | 4 | use crate::entries::Entry; 5 | pub use naming::*; 6 | pub use ops::*; 7 | 8 | pub trait SourceEntry { 9 | /// The original entry of the file. 10 | fn src_entry(&self) -> &Entry; 11 | } 12 | 13 | pub trait NewEntry { 14 | /// The new path the file will be renamed to. 15 | fn new_entry(&self) -> Entry; 16 | } 17 | 18 | pub trait NewName { 19 | fn new_name(&self) -> &str; 20 | } 21 | 22 | pub trait NewNameMut { 23 | fn new_name_mut(&mut self) -> &mut String; 24 | } 25 | 26 | impl NewEntry for M { 27 | fn new_entry(&self) -> Entry { 28 | self.src_entry().with_file_name(self.new_name()) 29 | } 30 | } 31 | 32 | #[macro_export] 33 | macro_rules! impl_source_entry { 34 | ($t:ty) => { 35 | impl $crate::medias::SourceEntry for $t { 36 | fn src_entry(&self) -> &$crate::entries::Entry { 37 | &self.entry 38 | } 39 | } 40 | }; 41 | } 42 | 43 | #[macro_export] 44 | macro_rules! impl_new_name { 45 | ($t:ty) => { 46 | impl $crate::medias::NewName for $t { 47 | fn new_name(&self) -> &str { 48 | &self.new_name 49 | } 50 | } 51 | }; 52 | } 53 | 54 | #[macro_export] 55 | macro_rules! impl_new_name_mut { 56 | ($t:ty) => { 57 | impl $crate::medias::NewNameMut for $t { 58 | fn new_name_mut(&mut self) -> &mut String { 59 | &mut self.new_name 60 | } 61 | } 62 | }; 63 | } 64 | -------------------------------------------------------------------------------- /src/utils/running.rs: -------------------------------------------------------------------------------- 1 | use anyhow::{Result, anyhow}; 2 | use std::fmt::{Display, Formatter}; 3 | use std::sync::atomic::AtomicBool; 4 | use std::sync::{LazyLock, atomic}; 5 | 6 | static RUNNING_FLAG: LazyLock = LazyLock::new(|| AtomicBool::new(true)); 7 | 8 | /// Check whether the program should continue running. 
9 | pub fn is_running() -> bool { 10 | RUNNING_FLAG.load(atomic::Ordering::Relaxed) 11 | } 12 | 13 | /// Check whether the user asked to abort, and if so, return an error which can be propagated. 14 | pub fn aborted() -> Result<()> { 15 | match is_running() { 16 | true => Ok(()), 17 | false => Err(anyhow!("user asked to abort")), 18 | } 19 | } 20 | 21 | /// Return an object that prints an abort marker if the program is aborted. 22 | pub fn display_abort(cond: bool) -> impl Display { 23 | DisplayAbort { cond } 24 | } 25 | 26 | #[derive(Debug, Copy, Clone)] 27 | pub struct DisplayAbort { 28 | cond: bool, 29 | } 30 | 31 | impl Display for DisplayAbort { 32 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { 33 | if self.cond && !is_running() { 34 | write!(f, " (aborted)")?; 35 | } 36 | Ok(()) 37 | } 38 | } 39 | 40 | /// Install a Ctrl-C handler. It must be called only once. 41 | pub fn install_ctrl_c_handler() { 42 | let handler = || { 43 | eprintln!(" aborting..."); 44 | RUNNING_FLAG.store(false, atomic::Ordering::Relaxed); 45 | }; 46 | if let Err(err) = ctrlc::set_handler(handler) { 47 | eprintln!("error: set Ctrl-C handler: {err:?}"); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/utils/mod.rs: -------------------------------------------------------------------------------- 1 | mod natural; 2 | mod running; 3 | 4 | use anyhow::{Result, anyhow}; 5 | pub use natural::*; 6 | pub use running::*; 7 | use std::collections::HashSet; 8 | use std::error::Error; 9 | use std::io::{Write, stdin, stdout}; 10 | use std::str::FromStr; 11 | use std::sync::{LazyLock, Mutex, mpsc}; 12 | use std::thread; 13 | use std::time::Duration; 14 | 15 | #[derive(Debug)] 16 | pub enum PromptError { 17 | No, 18 | Quit, 19 | } 20 | 21 | impl From for PromptError { 22 | fn from(_: anyhow::Error) -> Self { 23 | PromptError::Quit 24 | } 25 | } 26 | 27 | impl From for anyhow::Error { 28 | fn from(err: PromptError) -> Self { 29 | match 
err { 30 | PromptError::No => anyhow!("declined"), 31 | PromptError::Quit => anyhow!("cancelled"), 32 | } 33 | } 34 | } 35 | 36 | /// Prompt the user for confirmation. 37 | pub fn prompt_yes_no(msg: impl Into>) -> Result<(), PromptError> { 38 | let (tx, rx) = mpsc::channel(); 39 | let msg = msg.into(); // I need ownership of an immutable message here. 40 | let f = move |input: &mut String| { 41 | aborted()?; 42 | print!("{msg} [y|n|q]: "); 43 | stdout().flush()?; 44 | input.clear(); 45 | stdin().read_line(input)?; 46 | Ok::<_, anyhow::Error>(()) 47 | }; 48 | thread::spawn(move || { 49 | let mut input = String::new(); 50 | let res = loop { 51 | match (f(&mut input), input.trim()) { 52 | (Err(err), _) => break Err(err.into()), 53 | (Ok(()), "y" | "yes") => break Ok(()), 54 | (Ok(()), "n" | "no") => break Err(PromptError::No), 55 | (Ok(()), "q" | "quit") => break Err(PromptError::Quit), 56 | _ => {} 57 | } 58 | }; 59 | let _ = tx.send(res); 60 | }); 61 | 62 | loop { 63 | match rx.recv_timeout(Duration::from_millis(1000 / 2)) { 64 | Ok(res) => break res, 65 | Err(_) => aborted().map_err(|_| PromptError::Quit)?, 66 | } 67 | } 68 | } 69 | 70 | /// Intern a string, to prevent duplicates and redundant allocations. 71 | pub fn intern(text: &str) -> &'static str { 72 | static CACHE: LazyLock>> = LazyLock::new(Default::default); 73 | 74 | let mut cache = CACHE.lock().unwrap(); 75 | match cache.get(text) { 76 | Some(x) => x, 77 | None => { 78 | let interned = Box::leak(text.to_owned().into_boxed_str()); 79 | cache.insert(interned); 80 | interned 81 | } 82 | } 83 | } 84 | 85 | /// Parse a key-value pair from a string, for use in clap. 
86 | pub fn parse_key_value(s: &str) -> Result<(K, V)> 87 | where 88 | K: FromStr, 89 | V: FromStr, 90 | { 91 | let pos = s 92 | .find('=') 93 | .ok_or_else(|| anyhow!("missing =value in: {s:?}"))?; 94 | Ok((s[..pos].parse()?, s[pos + 1..].parse()?)) 95 | } 96 | -------------------------------------------------------------------------------- /src/entries/input.rs: -------------------------------------------------------------------------------- 1 | use crate::entries::{Entry, Fetcher, Filter}; 2 | use anyhow::{Result, anyhow}; 3 | use clap::Args; 4 | use std::path::PathBuf; 5 | 6 | #[derive(Debug, Args)] 7 | pub struct Input { 8 | /// Just show the entries that would be processed, without running any command. 9 | #[arg(long, global = true)] 10 | show: bool, 11 | /// Directories to scan. 12 | #[arg(global = true, help_heading = None)] 13 | dirs: Vec, 14 | /// The maximum recursion depth; use 0 for unlimited. 15 | #[arg(short = 'R', long, default_value_t = 0, value_name = "INT", global = true, help_heading = Some("Fetch"))] 16 | recursion: u32, 17 | #[command(flatten)] 18 | filter: Filter, 19 | } 20 | 21 | /// The input data structure that holds the effective paths to scan and their properties. 22 | #[derive(Debug)] 23 | pub struct EffectiveInput { 24 | pub show: bool, 25 | pub info: InputInfo, 26 | fetcher: Fetcher, 27 | } 28 | 29 | impl EffectiveInput { 30 | pub fn fetcher(self) -> Fetcher { 31 | self.fetcher 32 | } 33 | } 34 | 35 | #[derive(Debug)] 36 | pub struct InputInfo { 37 | /// The effective number of paths to scan, after deduplication and validation. 38 | pub num_valid: usize, 39 | /// Whether there were invalid/not found paths. 
40 | pub has_invalid: bool, 41 | } 42 | 43 | impl TryFrom for EffectiveInput { 44 | type Error = anyhow::Error; 45 | 46 | fn try_from(input: Input) -> Result { 47 | let (dirs, info) = validate(input.dirs)?; 48 | if dirs.is_empty() { 49 | return Err(anyhow!("no valid paths given")); 50 | } 51 | let filter = input.filter.try_into()?; 52 | let fetcher = Fetcher::new(dirs, input.recursion.into(), filter); 53 | let ei = EffectiveInput { 54 | show: input.show, 55 | info, 56 | fetcher, 57 | }; 58 | Ok(ei) 59 | } 60 | } 61 | 62 | fn validate(mut dirs: Vec) -> Result<(Vec, InputInfo)> { 63 | if dirs.is_empty() { 64 | dirs = vec![".".into()]; // use the current directory if no paths are given. 65 | } 66 | let n = dirs.len(); 67 | dirs.sort_unstable(); 68 | dirs.dedup(); 69 | if n != dirs.len() { 70 | eprintln!("warning: {} duplicated directories ignored", n - dirs.len()); 71 | } 72 | 73 | let n = dirs.len(); 74 | let dirs = dirs 75 | .into_iter() 76 | .map(Entry::try_from) 77 | .filter_map(|res| match res { 78 | Ok(entry) if entry.is_dir() => Some(entry), 79 | Ok(entry) => { 80 | eprintln!("warning: {entry} is not a directory, skipping"); 81 | None 82 | } 83 | Err((pb, err)) => { 84 | eprintln!("warning: invalid path {pb:?}: {err}"); 85 | None 86 | } 87 | }) 88 | .collect::>(); 89 | 90 | if dirs.is_empty() { 91 | return Err(anyhow!("no valid paths given")); 92 | } 93 | 94 | let info = InputInfo { 95 | num_valid: dirs.len(), 96 | has_invalid: n != dirs.len(), 97 | }; 98 | Ok((dirs, info)) 99 | } 100 | -------------------------------------------------------------------------------- /src/medias/ops.rs: -------------------------------------------------------------------------------- 1 | use super::{NewEntry, SourceEntry}; 2 | use std::io::Write; 3 | use std::path::Path; 4 | use std::{fs, io}; 5 | 6 | /// Implements file operations that consume the original media data on success. 
7 | pub struct FileOps; 8 | 9 | impl FileOps { 10 | /// Rename files and directories, or move them within the same file system. 11 | pub fn rename_move(medias: &mut Vec) { 12 | files_op(medias, silent, |p, q| fs::rename(p, q)) 13 | } 14 | /// Copy files to a new location, even if the file systems are different. 15 | pub fn copy(medias: &mut Vec) { 16 | files_op(medias, verbose, |p, q| copy_path(p, q, false, 0)) 17 | } 18 | /// Move files to a new location by copying and removing the original, even if the file systems are different. 19 | pub fn cross_move(medias: &mut Vec) { 20 | files_op(medias, verbose, |p, q| copy_path(p, q, true, 0)) 21 | } 22 | } 23 | 24 | fn files_op( 25 | paths: &mut Vec, 26 | notify: fn(&[u8]), 27 | op: fn(&Path, &Path) -> io::Result<()>, 28 | ) { 29 | paths.retain(|m| { 30 | let target = m.new_entry(); 31 | if target.exists() { 32 | notify(b"-\n"); 33 | eprintln!("error: file already exists: {} -> {target}", m.src_entry()); 34 | notify(b"\n"); 35 | return true; 36 | } 37 | match op(m.src_entry().as_ref(), target.as_ref()) { 38 | Ok(()) => false, 39 | Err(err) => { 40 | notify(b"x\n"); 41 | eprintln!("error: {err}: {} -> {target}", m.src_entry()); 42 | notify(b"\n"); 43 | true 44 | } 45 | } 46 | }); 47 | notify(b"\n"); 48 | } 49 | 50 | // `n` is just a counter for verbose output. 51 | fn copy_path(p: &Path, q: &Path, remove_dir: bool, n: usize) -> io::Result<()> { 52 | if p.is_dir() { 53 | fs::create_dir(q).and_then(|()| { 54 | verbose(b"d["); 55 | let files = fs::read_dir(p)? 56 | .flatten() 57 | .try_fold(Vec::new(), |mut acc, de| { 58 | let is_dir = de.path().is_dir(); // need to cache because is_dir goes to the fs again, and copy_path may have removed it. 
59 | copy_path(&de.path(), &q.join(de.file_name()), remove_dir, n + 1).map(|()| { 60 | if !is_dir { 61 | verbose(b"."); 62 | if remove_dir { 63 | acc.push(de.path()) 64 | } 65 | } 66 | acc 67 | }) 68 | }); 69 | verbose(b"]"); 70 | if remove_dir { 71 | files 72 | .and_then(|files| files.iter().try_for_each(fs::remove_file)) 73 | .and_then(|()| fs::remove_dir(p)) 74 | } else { 75 | files.map(|_| ()) 76 | } 77 | }) 78 | } else if n == 0 { 79 | fs::copy(p, q).and_then(|_| { 80 | verbose(b"."); 81 | if remove_dir { 82 | fs::remove_file(p)? 83 | } 84 | Ok(()) 85 | }) 86 | } else { 87 | fs::copy(p, q).map(|_| ()) // this is called recursively by the is_dir case above. 88 | } 89 | } 90 | 91 | fn silent(_: &[u8]) {} 92 | fn verbose(c: &[u8]) { 93 | io::stdout().write_all(c).unwrap(); 94 | io::stdout().flush().unwrap(); 95 | } 96 | -------------------------------------------------------------------------------- /src/commands.rs: -------------------------------------------------------------------------------- 1 | mod dupes; 2 | mod join; 3 | mod list; 4 | mod probe; 5 | mod rebuild; 6 | mod rename; 7 | 8 | use crate::entries::{EffectiveInput, Entry, InputInfo, TraversalMode}; 9 | use crate::utils::natural_cmp; 10 | use anyhow::Result; 11 | use clap::Subcommand; 12 | 13 | #[derive(Debug, Subcommand)] 14 | pub enum Command { 15 | /// Find possibly duplicated files by both size/sample and filename similarity. 16 | #[command(override_usage = "refine dupes [DIRS]... [FETCH] [OPTIONS]")] 17 | Dupes(dupes::Dupes), 18 | /// Join files into a single directory with advanced conflict resolution. 19 | #[command(override_usage = "refine join [DIRS]... [FETCH] [OPTIONS]")] 20 | Join(join::Join), 21 | /// List files from multiple disjoint directories sorted together. 22 | #[command(override_usage = "refine list [DIRS]... [FETCH] [OPTIONS]")] 23 | List(list::List), 24 | /// Rebuild entire media collections' filenames intelligently. 25 | #[command(override_usage = "refine rebuild [DIRS]... 
[FETCH] [OPTIONS]")] 26 | Rebuild(rebuild::Rebuild), 27 | /// Rename files and directories in batch using advanced regex rules. 28 | #[command(override_usage = "refine rename [DIRS]... [FETCH] [OPTIONS]")] 29 | Rename(rename::Rename), 30 | /// Probe collections' filenames against a remote server. 31 | #[command(override_usage = "refine probe [DIRS]... [FETCH] [OPTIONS]")] 32 | Probe(probe::Probe), 33 | } 34 | 35 | /// The common interface for commands that refine media files. 36 | pub trait Refine { 37 | type Media: TryFrom; 38 | 39 | /// The opening line to display when running the command. 40 | const OPENING_LINE: &'static str; 41 | /// The mode of traversal to use when fetching entries. 42 | const T_MODE: TraversalMode; 43 | 44 | /// Tweak the command options to fix small issues after the opening line, but before fetching 45 | /// the entries and converting them to the proper Media type. 46 | fn tweak(&mut self, _: &InputInfo) {} 47 | /// Actual command implementation, called with the fetched media files. 48 | fn refine(&self, medias: Vec) -> Result<()>; 49 | } 50 | 51 | // /// The common interface for commands that change the configuration of Refine commands. 52 | // pub trait Configure { 53 | // /// The opening line to display when running the command. 54 | // const OPENING_LINE: &'static str; 55 | // 56 | // /// Actual command implementation. 
57 | // fn config(&self) -> Result<()>; 58 | // } 59 | // fn configure(mut r: C, fetcher: Fetcher, info: InputSpec) -> Result<()> { 60 | // println!("=> {}\n", C::OPENING_LINE); 61 | // r.config() 62 | // } 63 | 64 | fn refine(mut opt: R, ei: EffectiveInput) -> Result<()> { 65 | println!("=> {}\n", R::OPENING_LINE); 66 | opt.tweak(&ei.info); 67 | opt.refine(gen_medias(ei.fetcher().fetch(R::T_MODE))) 68 | } 69 | 70 | fn show(_: R, ei: EffectiveInput) { 71 | println!("\nentries this command will process:\n"); 72 | let mut entries = ei.fetcher().fetch(R::T_MODE).collect::>(); 73 | entries.sort_unstable_by(|e, f| natural_cmp(e.to_str(), f.to_str())); 74 | entries.iter().for_each(|e| println!("{e}")); 75 | match entries.len() { 76 | 0 => println!("no entries found"), 77 | n => println!("\ntotal entries: {n}"), 78 | } 79 | } 80 | 81 | impl Command { 82 | pub fn execute(self, ei: EffectiveInput) -> Result<()> { 83 | macro_rules! call { 84 | ($opt:expr) => { 85 | match ei.show { 86 | false => refine($opt, ei), 87 | true => Ok(show($opt, ei)), 88 | } 89 | }; 90 | } 91 | match self { 92 | Command::Dupes(opt) => call!(opt), 93 | Command::Join(opt) => call!(opt), 94 | Command::List(opt) => call!(opt), 95 | Command::Rebuild(opt) => call!(opt), 96 | Command::Rename(opt) => call!(opt), 97 | Command::Probe(opt) => call!(opt), 98 | } 99 | } 100 | } 101 | 102 | fn gen_medias(entries: impl Iterator) -> Vec 103 | where 104 | T: TryFrom, 105 | { 106 | entries 107 | .map(|entry| T::try_from(entry)) 108 | .inspect(|res| { 109 | if let Err((entry, err)) = res { 110 | eprintln!("error: load media {entry}: {err}"); 111 | } 112 | }) 113 | .flatten() 114 | .collect() 115 | } 116 | -------------------------------------------------------------------------------- /src/entries/mod.rs: -------------------------------------------------------------------------------- 1 | mod entry; 2 | mod filter; 3 | mod input; 4 | 5 | use crate::utils; 6 | pub use entry::*; 7 | pub use filter::*; 8 | pub use 
input::*; 9 | use std::iter; 10 | use std::rc::Rc; 11 | 12 | /// The object that fetches and filters entries from multiple directories. 13 | #[derive(Debug)] 14 | pub struct Fetcher { 15 | dirs: Vec, 16 | recurse: Recurse, 17 | filter: FilterRules, 18 | } 19 | 20 | /// The mode of traversal to use when fetching entries. 21 | #[derive(Debug, Copy, Clone)] 22 | pub enum TraversalMode { 23 | /// Only files (dupes, probe, and rebuild). 24 | Files, 25 | /// Directories stop recursion because the dir itself is the output (join). 26 | DirsStop, 27 | /// Directories are chained with their content (rename). 28 | DirsAndContent, 29 | /// Contents are listed while recursing, and change to directories at the max depth (list). 30 | ContentOverDirs, 31 | } 32 | 33 | /// The friendly recursion mode for fetching entries. 34 | #[derive(Debug, Copy, Clone)] 35 | pub enum Recurse { 36 | Full, 37 | Shallow, 38 | UpTo(u32), 39 | } 40 | 41 | impl Fetcher { 42 | /// Fetches all entries from a single entry directory. 43 | pub fn single(entry: &Entry, recurse: Recurse) -> Self { 44 | Self::new(vec![entry.to_owned()], recurse, FilterRules::default()) 45 | } 46 | 47 | /// Fetches entries from the given entry directories. 48 | pub fn new(dirs: Vec, recurse: Recurse, filter: FilterRules) -> Self { 49 | Fetcher { 50 | dirs, 51 | recurse, 52 | filter, 53 | } 54 | } 55 | 56 | pub fn fetch(self, mode: TraversalMode) -> impl Iterator { 57 | let depth = self.recurse.into(); 58 | let fr = Rc::new(self.filter); 59 | self.dirs 60 | .into_iter() 61 | .flat_map(move |dir| entries(dir, depth, mode, Rc::clone(&fr))) 62 | } 63 | } 64 | 65 | fn entries( 66 | dir: Entry, 67 | depth: Depth, 68 | mode: TraversalMode, 69 | fr: Rc, 70 | ) -> Box> { 71 | if !utils::is_running() { 72 | return Box::new(iter::empty()); 73 | } 74 | 75 | // this does allow hidden directories, if the user directly asks for them. 
76 | match std::fs::read_dir(&dir) { 77 | Ok(rd) => Box::new( 78 | rd.inspect(|res| { 79 | if let Err(err) = res { 80 | eprintln!("error: dir entry: {err}"); 81 | } 82 | }) 83 | .flatten() 84 | .map(move |de| de.file_name().to_str().map(|s| dir.join(s)).ok_or(de)) 85 | .inspect(|res| { 86 | if let Err(de) = res { 87 | eprintln!("error: no UTF-8 name: {de:?}"); 88 | } 89 | }) 90 | .flatten() 91 | .flat_map(move |entry| { 92 | use TraversalMode::*; 93 | if !entry.is_dir() { 94 | // files that pass the filter are always included in any mode. 95 | return if fr.is_in(&entry) && !entry.file_name().starts_with(".") { 96 | Box::new(iter::once(entry)) as Box> 97 | } else { 98 | Box::new(iter::empty()) 99 | }; 100 | } 101 | // if the entry is a directory, it's much more complicated. 102 | match (fr.is_in(&entry), (mode, depth.deeper())) { 103 | // cases that the directory is yielded and not recursed into. 104 | (true, (DirsAndContent | ContentOverDirs, None) | (DirsStop, _)) => { 105 | Box::new(iter::once(entry)) 106 | } 107 | // the directory is yielded with its content and recursed into. 108 | (true, (DirsAndContent, Some(d))) => Box::new( 109 | iter::once(entry.clone()).chain(entries(entry, d, mode, Rc::clone(&fr))), 110 | ), 111 | // recurse into dirs if depth available, to find more matching entries deeper in the hierarchy. 
112 | (_, (_, Some(d))) if !entry.file_name().starts_with(".") => { 113 | entries(entry, d, mode, Rc::clone(&fr)) 114 | } 115 | _ => Box::new(iter::empty()), 116 | } 117 | }), 118 | ), 119 | Err(err) => { 120 | eprintln!("error: read dir {dir}: {err}"); 121 | Box::new(iter::empty()) 122 | } 123 | } 124 | } 125 | 126 | impl From for Recurse { 127 | fn from(d: u32) -> Self { 128 | match d { 129 | 0 => Recurse::Full, 130 | 1 => Recurse::Shallow, 131 | _ => Recurse::UpTo(d), 132 | } 133 | } 134 | } 135 | 136 | impl From for Depth { 137 | fn from(r: Recurse) -> Self { 138 | match r { 139 | Recurse::Full => Depth { max: 0, curr: 0 }, 140 | Recurse::Shallow => Depth { max: 1, curr: 0 }, 141 | Recurse::UpTo(d) => Depth { max: d, curr: 0 }, 142 | } 143 | } 144 | } 145 | 146 | /// Used to track the depth of recursion when fetching entries. 147 | #[derive(Debug, Copy, Clone)] 148 | struct Depth { 149 | curr: u32, 150 | max: u32, 151 | } 152 | 153 | impl Depth { 154 | fn deeper(self) -> Option { 155 | let Depth { curr, max } = self; 156 | let curr = curr + 1; 157 | (curr < max || max == 0).then_some(Depth { curr, max }) 158 | } 159 | } 160 | -------------------------------------------------------------------------------- /src/commands/list.rs: -------------------------------------------------------------------------------- 1 | use crate::commands::Refine; 2 | use crate::entries::{Entry, Fetcher, InputInfo, Recurse, TraversalMode}; 3 | use crate::utils::{display_abort, natural_cmp}; 4 | use anyhow::Result; 5 | use clap::{Args, ValueEnum}; 6 | use human_repr::HumanCount; 7 | use std::cmp::Ordering; 8 | use std::sync::OnceLock; 9 | use yansi::{Color, Paint}; 10 | 11 | #[derive(Debug, Args)] 12 | pub struct List { 13 | /// Sort by. 14 | #[arg(short = 'b', long, default_value_t = By::Size, value_name = "STR", value_enum)] 15 | by: By, 16 | /// Reverse the default order (size/count:desc, name/path:asc). 17 | #[arg(short = 'r', long)] 18 | rev: bool, 19 | /// Show full file paths. 
20 | #[arg(short = 'p', long)] 21 | paths: bool, 22 | /// Do not calculate directory sizes. 23 | #[arg(short = 'c', long)] 24 | no_calc_dirs: bool, 25 | } 26 | 27 | #[derive(Debug, Copy, Clone, PartialEq, ValueEnum)] 28 | pub enum By { 29 | #[value(alias = "s")] 30 | Size, 31 | #[value(alias = "c")] 32 | Count, 33 | #[value(alias = "n")] 34 | Name, 35 | #[value(alias = "p")] 36 | Path, 37 | } 38 | 39 | #[derive(Debug)] 40 | pub struct Media { 41 | entry: Entry, 42 | size_count: Option<(u64, u32)>, 43 | } 44 | 45 | const ORDERING: &[(By, bool)] = &[ 46 | (By::Size, true), 47 | (By::Count, true), 48 | (By::Name, false), 49 | (By::Path, false), 50 | ]; 51 | static CALC_DIR_SIZES: OnceLock = OnceLock::new(); 52 | 53 | impl Refine for List { 54 | type Media = Media; 55 | const OPENING_LINE: &'static str = "List files"; 56 | const T_MODE: TraversalMode = TraversalMode::ContentOverDirs; 57 | 58 | fn tweak(&mut self, info: &InputInfo) { 59 | self.rev ^= ORDERING.iter().find(|(b, _)| *b == self.by).unwrap().1; 60 | if self.by == By::Path && !self.paths { 61 | self.paths = true; 62 | eprintln!("Enabling file paths due to sorting by paths.\n"); 63 | } 64 | if info.num_valid > 1 && !self.paths { 65 | self.paths = true; 66 | eprintln!("Enabling file paths due to multiple input paths.\n"); 67 | } 68 | CALC_DIR_SIZES.set(!self.no_calc_dirs).unwrap(); 69 | } 70 | 71 | fn refine(&self, mut medias: Vec) -> Result<()> { 72 | // step: sort the files by size, count, name, or path. 73 | let compare: fn(&Media, &Media) -> _ = match self.by { 74 | By::Size => |m, n| { 75 | m.size_count 76 | .map(|(s, _)| s) 77 | .cmp(&n.size_count.map(|(s, _)| s)) 78 | }, 79 | By::Count => |m, n| { 80 | m.size_count 81 | .map(|(_, c)| c) 82 | .cmp(&n.size_count.map(|(_, c)| c)) 83 | }, 84 | By::Name => |m, n| natural_cmp(m.entry.file_name(), n.entry.file_name()), 85 | By::Path => |_, _| Ordering::Equal, // bypass to the secondary sort. 
86 | }; 87 | let compare: &dyn Fn(&_, &_) -> _ = match self.rev { 88 | false => &compare, 89 | true => &|m, n| compare(m, n).reverse(), 90 | }; 91 | medias.sort_unstable_by(|m, n| { 92 | compare(m, n).then_with(|| natural_cmp(m.entry.to_str(), n.entry.to_str())) // primary + secondary sort. 93 | }); 94 | 95 | // step: display the results. 96 | medias.iter().for_each(|m| { 97 | let (size, count) = match m.size_count { 98 | Some((s, c)) => (&*format!("{}", s.human_count_bytes()), &*format!("{c}")), 99 | None => ("?", "?"), 100 | }; 101 | match self.paths { 102 | true => print!("{size:>8} {}", m.entry.display_path()), 103 | false => print!("{size:>8} {}", m.entry.display_filename()), 104 | }; 105 | if m.entry.is_dir() && m.size_count.is_some() { 106 | print!(" {} files", count.paint(Color::Blue).linger()); 107 | } 108 | println!("{}", "".resetting()); 109 | }); 110 | 111 | // step: display a summary receipt. 112 | if !medias.is_empty() { 113 | println!(); 114 | } 115 | let (mut size, mut count) = (0, 0); 116 | medias 117 | .iter() 118 | .filter_map(|m| m.size_count) 119 | .for_each(|(s, c)| { 120 | size += s; 121 | count += c; 122 | }); 123 | println!("listed entries: {}{}", medias.len(), display_abort(true),); 124 | println!(" total: {} in {count} files", size.human_count("B"),); 125 | 126 | Ok(()) 127 | } 128 | } 129 | 130 | impl TryFrom for Media { 131 | type Error = (Entry, anyhow::Error); 132 | 133 | fn try_from(entry: Entry) -> Result { 134 | let size_count = match (entry.is_dir(), CALC_DIR_SIZES.get().unwrap()) { 135 | (true, false) => None, 136 | (true, true) => { 137 | let fetcher = Fetcher::single(&entry, Recurse::Full); 138 | let mut count = 0; 139 | let sum = fetcher 140 | .fetch(TraversalMode::Files) 141 | .map(|e| { 142 | count += 1; 143 | e.metadata().map_or(0, |md| md.len()) 144 | }) 145 | .sum::(); 146 | Some((sum, count)) 147 | } 148 | (false, _) => { 149 | let size = entry.metadata().map_or(0, |md| md.len()); 150 | Some((size, 1)) 151 | } 152 | }; 
use std::cmp::Ordering;
use std::iter::Peekable;
use std::str::Chars;

/// Compare two strings in a natural order, case-insensitive.
///
/// Runs of ASCII digits are compared by numeric value (so "file2" < "file10"),
/// runs of text are compared case-insensitively, and a digit run always sorts
/// before a text run at the same position. Ties between equal numbers are
/// broken by digit count, so "1" sorts before "01".
pub fn natural_cmp(a: impl AsRef<str>, b: impl AsRef<str>) -> Ordering {
    let mut left = a.as_ref().chars().peekable();
    let mut right = b.as_ref().chars().peekable();

    while let (Some(&lc), Some(&rc)) = (left.peek(), right.peek()) {
        let ord = if lc.is_ascii_digit() && rc.is_ascii_digit() {
            compare_num_chunks(&mut left, &mut right)
        } else if !lc.is_ascii_digit() && !rc.is_ascii_digit() {
            compare_text_chunks(&mut left, &mut right)
        } else if lc.is_ascii_digit() {
            Ordering::Less // numbers come before text.
        } else {
            Ordering::Greater // text comes after numbers.
        };
        if ord != Ordering::Equal {
            return ord;
        }
    }

    // whichever string still has characters left sorts after the exhausted one.
    left.peek().is_some().cmp(&right.peek().is_some())
}

/// Compare numeric chunks directly from the character iterators.
fn compare_num_chunks(a_chars: &mut Peekable<Chars>, b_chars: &mut Peekable<Chars>) -> Ordering {
    /// Consume a run of ASCII digits, returning its numeric value and digit count.
    fn parse_number(chars: &mut Peekable<Chars>) -> (u64, usize) {
        let mut value = 0u64;
        let mut length = 0;
        while let Some(c) = chars.next_if(char::is_ascii_digit) {
            // saturating so absurdly long digit runs can't overflow the value.
            value = value
                .saturating_mul(10)
                .saturating_add(u64::from(c as u8 - b'0'));
            length += 1;
        }
        (value, length)
    }

    let (num_a, len_a) = parse_number(a_chars);
    let (num_b, len_b) = parse_number(b_chars);

    // compare numeric values first; equal values are untied by their original
    // length, which orders leading zeros deterministically ("1" < "01" < "001").
    num_a.cmp(&num_b).then_with(|| len_a.cmp(&len_b))
}

/// Compare text chunks case-insensitively directly from the character iterators.
fn compare_text_chunks(a_chars: &mut Peekable<Chars>, b_chars: &mut Peekable<Chars>) -> Ordering {
    /// Discard every remaining non-digit character of the current chunk.
    fn skip_text(chars: &mut Peekable<Chars>) {
        while chars.next_if(|c| !c.is_ascii_digit()).is_some() {}
    }

    loop {
        let (a, b) = match (a_chars.peek(), b_chars.peek()) {
            // both sides still have text: keep comparing character by character.
            (Some(&a), Some(&b)) if !a.is_ascii_digit() && !b.is_ascii_digit() => (a, b),
            // a still has text while b hit a digit or the end: a's chunk is longer.
            (Some(&a), _) if !a.is_ascii_digit() => {
                skip_text(a_chars);
                return Ordering::Greater;
            }
            // b still has text while a hit a digit or the end: b's chunk is longer.
            (_, Some(&b)) if !b.is_ascii_digit() => {
                skip_text(b_chars);
                return Ordering::Less;
            }
            // both chunks are fully consumed (digit or end on both sides).
            _ => return Ordering::Equal,
        };
        a_chars.next();
        b_chars.next();

        // fast path for ASCII characters - direct case conversion is much faster.
        let ord = if a.is_ascii() && b.is_ascii() {
            a.to_ascii_lowercase().cmp(&b.to_ascii_lowercase())
        } else {
            a.to_lowercase().cmp(b.to_lowercase())
        };
        if ord != Ordering::Equal {
            // drain both chunks so the caller resumes at the next digit run.
            skip_text(a_chars);
            skip_text(b_chars);
            return ord;
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn numbers() {
        let mut values = vec!["2", "10", "1"];
        values.sort_unstable_by(|a, b| natural_cmp(a, b));
        assert_eq!(values, ["1", "2", "10"]);
    }

    #[test]
    fn text() {
        let mut values = vec!["b", "c", "a"];
        values.sort_unstable_by(|a, b| natural_cmp(a, b));
        assert_eq!(values, ["a", "b", "c"]);
    }

    #[test]
    fn mixed() {
        let mut values = vec!["file1", "file10", "file2"];
        values.sort_unstable_by(|a, b| natural_cmp(a, b));
        assert_eq!(values, ["file1", "file2", "file10"]);
    }

    #[test]
    fn complex() {
        let mut values = vec!["file1", "file1B", "file00", "file11", "file0002"];
        values.sort_unstable_by(|a, b| natural_cmp(a, b));
        assert_eq!(values, ["file00", "file1", "file1B", "file0002", "file11"]);
    }

    #[test]
    fn hierarchical() {
        let mut values = vec!["file-10", "file-1", "file-1-2", "file-2", "file-1-10"];
        values.sort_unstable_by(|a, b| natural_cmp(a, b));
        assert_eq!(
            values,
            ["file-1", "file-1-2", "file-1-10", "file-2", "file-10",]
        );
    }

    #[test]
    fn with_zeros() {
        let mut values = vec!["file01", "file1", "file10", "file001"];
        values.sort_unstable_by(|a, b| natural_cmp(a, b));
        assert_eq!(values, ["file1", "file01", "file001", "file10"]);
    }

    #[test]
    fn empty_strings() {
        let mut values = vec!["", "file1", ""];
        values.sort_unstable_by(|a, b| natural_cmp(a, b));
        assert_eq!(values, ["", "", "file1"]);
    }

    #[test]
    fn actual_strings() {
        let mut values = vec!["file2".to_string(), "file10".to_string()];
        values.sort_unstable_by(|a, b| natural_cmp(a, b));
        assert_eq!(values, ["file2", "file10"]);
    }
}
| let mut values = vec!["file2".to_string(), "file10".to_string()]; 173 | values.sort_unstable_by(|a, b| natural_cmp(a, b)); 174 | assert_eq!(values, ["file2", "file10"]); 175 | } 176 | } 177 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## 3.0.0 - Sep 05, 2025 4 | - dupes: totally rewritten search algorithm with similarity detection combining fuzzy string matching (Levenshtein, Sørensen-Dice) and rare-token scoring, normalization, parallel processing, and advanced text processing 5 | - dupes: sampling uses a three-point strategy (beginning, middle, end) 6 | - dupes: `--sample` is now measured in KB instead of B 7 | - list: auto enable "Show full file paths" option when multiple root dirs are given 8 | - list: use natural sorting for displaying entries 9 | - join: display how clashes were resolved, the target directory, and whether it will move or copy files there 10 | - rebuild: include support for comments in collections after the sequence number, so you can add notes to your files 11 | - rebuild: uses new pattern format `name~sequence[comment]`, with support for migrating old collections to the new one 12 | - rebuild: do not fix gaps anymore when partial mode is enabled, which was causing unexpected renames when some files were moved; now only full mode will fix gaps 13 | - rename: display better clashes and how they are (or not) resolved 14 | - global: new `path_in` and `path_ex` fetch options for including and excluding full paths (and letting `dir_in` and `dir_ex` options for current dirs) 15 | - global: use natural sorting in `--show` option for displaying entries 16 | - global: fix `-i` failing to look for '…$' regexes, which required to remove the file extension first 17 | - new "recipe type" options in naming rules which are advanced transformations with ready to use regexes, starting with `throw` 
prefixes to the end as suffixes 18 | - new support for adding on-demand separators `{S}` in naming rules regexes to match `-`, ` ` (space), `.`, and `,` (`_` is not included as it might be part of names) 19 | - new partial enclosing brackets support in naming rules' before and after options 20 | 21 | ## 2.0.0 - Mar 24, 2025: 22 | - global support for **colors**! 23 | - list: command is greatly improved with support for listing directories, including their number of files and full sizes 24 | - global: new precise recursion feature 25 | - global: new `--view` option 26 | - global: input paths can now be relative. 27 | 28 | ## 1.4.0 - Feb 28, 2025: 29 | - new `probe` command 30 | - rebuild: new `--case` option to keep original case 31 | - rename: included support for handling clashes by inserting sequences in the filenames 32 | 33 | ## 1.3.1 - Feb 04, 2025: 34 | - rebuild: fix full mode, which wouldn't reset sequences 35 | 36 | ## 1.3.0 - Jan 31, 2025: 37 | - list: smarter list command, which hides full paths by default (with a flag for showing them if needed) and uses by default descending order for size and ascending for name and path (with a flag to reverse it if needed) 38 | - join: change no_remove flag to parents (n -> p) and some clash options 39 | - rebuild: change simple_match flag to simple and fix full mode, which was not resetting sequences 40 | - global: general polishing 41 | 42 | ## 1.2.1 - Nov 19, 2024: 43 | - global: upgrade regex dependency, so deps badge won't show "maybe insecure" 44 | 45 | ## 1.2.0 - Nov 19, 2024: 46 | - rebuild: much improved partial mode which can alter groups of filenames while preserving sequences, and even detect and fix gaps in sequences caused by deleted files 47 | 48 | ## 1.1.0 - Oct 10, 2024: 49 | - join: support not empty target folders and resolve clashes accordingly 50 | - join: fix join by copy still moving files 51 | - global: include support for aliases in several enum CLI arguments 52 | 53 | ## 1.0.0 - Oct 09, 
2024: 54 | - rebuild: new partial mode, new replace feature, auto-enable partial mode in case not all directories are available 55 | - global: major overhaul 56 | 57 | ## 0.18.0 - Aug 27, 2024: 58 | - rebuild: new force implementation that is easier to use with improved memory usage 59 | 60 | ## 0.17.1 - Aug 15, 2024: 61 | - global: fix `--shallow` option 62 | 63 | ## 0.17.0 - Aug 05, 2024: 64 | - join: new clash resolve option 65 | - global: dedup input directories 66 | - global: support for selecting only files by filtering extensions 67 | 68 | ## 0.16.0 - Aug 01, 2024: 69 | - new `join` command 70 | - rename: include full directory support 71 | - global: scan with directory support 72 | - global: new magic filter options 73 | - global: new filter options 74 | 75 | ## 0.15.0 - Jul 18, 2024: 76 | - rename: nicer command output by parent directory 77 | - global: new threaded yes/no prompt that can be aborted with CTRL-C 78 | 79 | ## 0.14.0 - Jul 11, 2024: 80 | - rename: disallow by default changes in directories where clashes are detected, including new `--clashes` option to allow them 81 | 82 | ## 0.13.0 - Jul 10, 2024: 83 | - rename: new replace feature 84 | - dupes: remove case sensitivity option 85 | - global: make strip rules also remove `.` and `_` 86 | - global: `--include` and `--exclude` options do not check file extensions 87 | 88 | ## 0.12.0 - Jul 09, 2024: 89 | - global: new `--dir-in` and `--dir-out` options 90 | 91 | ## 0.11.0 - Jul 08, 2024: 92 | - new `rename` command 93 | - rebuild, rename: improve strip exact rules 94 | 95 | ## 0.10.0 - Jul 02, 2024: 96 | - global: new `--exclude` option 97 | 98 | ## 0.9.0 - Jul 01, 2024: 99 | - global: support for CTRL-C 100 | 101 | ## 0.8.0 - Jun 30, 2024: 102 | - new `list` command 103 | 104 | ## 0.7.1 - Jun 28, 2024: 105 | - rebuild: fix smart detect not grouping some files 106 | - global: `--include` is now case-insensitive 107 | - global: strip rules remove hyphens too 108 | 109 | ## 0.7.0 - Jun 27, 2024:
110 | - rebuild: new `--force`, new interactive mode, new `--yes`, auto fix rename errors, smaller memory consumption 111 | - dupes: improved performance 112 | - global: new `--include` option 113 | 114 | ## 0.6.0 - Jun 24, 2024: 115 | - new `rebuild` command 116 | - global: general polishing overall 117 | 118 | ## 0.5.0 - Jun 20, 2024: 119 | - dupes: ignores repetition systems 120 | - global: support for shallow scan, verbose mode 121 | 122 | ## 0.4.0 - Jun 17, 2024: 123 | - global: new subcommands structure 124 | - new `dupes` command, with support for matching case and changing sample size 125 | 126 | ## 0.3.0 - Nov 07, 2023: 127 | - include support for dedup by both size and name 128 | 129 | ## 0.2.2 - Jun 04, 2022: 130 | - use 2KB sample size 131 | 132 | ## 0.2.1 - Jun 04, 2022: 133 | - improve error handling 134 | 135 | ## 0.2.0 - Jun 01, 2022: 136 | - publish as `refine`, use split crate `human-repr` 137 | 138 | ## 0.1.1 - May 27, 2022: 139 | - samples the center of the files, which seems to fix false positives 140 | 141 | ## 0.1.0 - May 25, 2022: 142 | - first release, detects duplicated files, simple sampling strategy (1KB from the start of the files) 143 | -------------------------------------------------------------------------------- /src/entries/filter.rs: -------------------------------------------------------------------------------- 1 | use super::Entry; 2 | use anyhow::{Context, Result, anyhow}; 3 | use clap::Args; 4 | use clap::builder::NonEmptyStringValueParser; 5 | use regex::Regex; 6 | 7 | /// A set of rules that allow the user to specify which files and directories to include or exclude. 8 | #[derive(Debug, Args)] 9 | pub struct Filter { 10 | /// Include only files. 11 | #[arg(short = 'F', long, global = true, conflicts_with = "only_dirs", help_heading = Some("Fetch"))] 12 | only_files: bool, 13 | /// Include only directories. 
/// A set of rules that allow the user to specify which files and directories to include or exclude.
#[derive(Debug, Args)]
pub struct Filter {
    /// Include only files.
    #[arg(short = 'F', long, global = true, conflicts_with = "only_dirs", help_heading = Some("Fetch"))]
    only_files: bool,
    /// Include only directories.
    #[arg(short = 'D', long, global = true, conflicts_with = "only_files", help_heading = Some("Fetch"))]
    only_dirs: bool,
    /// Include everything that matches this (regardless of files or directories/paths).
    #[arg(short = 'i', long, global = true, help_heading = Some("Fetch"), value_name = "REGEX", allow_hyphen_values = true, value_parser = NonEmptyStringValueParser::new())]
    all_in: Option<String>,
    /// Include only these current directories.
    #[arg(short = 'I', long, global = true, help_heading = Some("Fetch"), value_name = "REGEX", allow_hyphen_values = true, value_parser = NonEmptyStringValueParser::new())]
    dir_in: Option<String>,
    /// Include only these paths.
    #[arg(long, global = true, help_heading = Some("Fetch"), value_name = "REGEX", allow_hyphen_values = true, value_parser = NonEmptyStringValueParser::new())]
    path_in: Option<String>,
    /// Include only these filenames.
    #[arg(long, global = true, help_heading = Some("Fetch"), value_name = "REGEX", allow_hyphen_values = true, value_parser = NonEmptyStringValueParser::new())]
    file_in: Option<String>,
    /// Include only these extensions.
    #[arg(long, global = true, help_heading = Some("Fetch"), value_name = "REGEX", allow_hyphen_values = true, value_parser = NonEmptyStringValueParser::new())]
    ext_in: Option<String>,
    /// Exclude everything that matches this (regardless of files or directories/paths).
    #[arg(short = 'x', long, global = true, help_heading = Some("Fetch"), value_name = "REGEX", allow_hyphen_values = true, value_parser = NonEmptyStringValueParser::new())]
    all_ex: Option<String>,
    /// Exclude these current directories.
    #[arg(short = 'X', long, global = true, help_heading = Some("Fetch"), value_name = "REGEX", allow_hyphen_values = true, value_parser = NonEmptyStringValueParser::new())]
    dir_ex: Option<String>,
    /// Exclude these paths.
    #[arg(long, global = true, help_heading = Some("Fetch"), value_name = "REGEX", allow_hyphen_values = true, value_parser = NonEmptyStringValueParser::new())]
    path_ex: Option<String>,
    /// Exclude these filenames.
    #[arg(long, global = true, help_heading = Some("Fetch"), value_name = "REGEX", allow_hyphen_values = true, value_parser = NonEmptyStringValueParser::new())]
    file_ex: Option<String>,
    /// Exclude these extensions.
    #[arg(long, global = true, help_heading = Some("Fetch"), value_name = "REGEX", allow_hyphen_values = true, value_parser = NonEmptyStringValueParser::new())]
    ext_ex: Option<String>,
}

/// The engine that applies the [Filter] rules to a collection of entries.
#[derive(Debug, Default)]
pub struct FilterRules {
    // mode flags: at most one of these is true (enforced by clap's conflicts_with).
    only_files: bool,
    only_dirs: bool,
    // compiled include/exclude regex pairs, one per rule dimension.
    all: Constraint,
    dir: Constraint,
    path: Constraint,
    file: Constraint,
    ext: Constraint,
}

impl FilterRules {
    /// Returns whether the given entry passes all filter rules.
    ///
    /// Hidden entries (those whose stem starts with a dot) are always rejected:
    /// `is_included` returns `None` for them, which defaults to `false` here.
    pub fn is_in(&self, entry: &Entry) -> bool {
        self.is_included(entry).unwrap_or_default()
    }

    /// Core filtering logic; `None` means "excluded by the hidden rule or no parent".
    fn is_included(&self, entry: &Entry) -> Option<bool> {
        let (stem, ext) = entry.filename_parts();
        (!stem.starts_with('.')).then_some(())?; // exclude hidden files and directories.

        let parent = entry.parent()?;
        // NOTE(review): assumes `parent.to_str()` ends with a path separator,
        // otherwise the parent and stem would be fused — confirm in Entry.
        let full = format!("{}{stem}", parent.to_str()); // generate the full path without extension.
        let ret = self.all.is_match(&full)
            && match entry.is_dir() {
                true => {
                    self.dir.is_match(entry.file_name()) // entry is a directory.
                        && self.path.is_match(entry.to_str()) // the str is the full path.
                        && !self.only_files
                }
                false => {
                    // files match on stem and extension, plus their parent's dir/path.
                    self.file.is_match(stem)
                        && self.ext.is_match(ext)
                        && self.dir.is_match(parent.file_name())
                        && self.path.is_match(parent.to_str())
                        && !self.only_dirs
                }
            };
        Some(ret)
    }
}

/// A pair of regexes that check strings for inclusion and exclusion.
#[derive(Debug, Default)]
pub struct Constraint {
    re_in: Option<Regex>,
    re_ex: Option<Regex>,
}

impl Constraint {
    /// A string matches when it is NOT excluded and (if an include regex is set) included.
    /// Absent regexes impose no restriction.
    fn is_match(&self, s: &str) -> bool {
        self.re_ex.as_ref().is_none_or(|re_ex| !re_ex.is_match(s))
            && self.re_in.as_ref().is_none_or(|re_in| re_in.is_match(s))
    }
}

/// An optional user pattern paired with its CLI parameter name (for error messages).
type Param<'a> = (Option<String>, &'a str);

impl TryFrom<[Param<'_>; 2]> for Constraint {
    type Error = anyhow::Error;

    /// Compiles an `[include, exclude]` pair of optional patterns into a Constraint.
    fn try_from([(re_in, p_in), (re_ex, p_ex)]: [Param; 2]) -> Result<Self> {
        Ok(Self {
            re_in: compile(re_in, p_in)?,
            re_ex: compile(re_ex, p_ex)?,
        })
    }
}

impl TryFrom<Filter> for FilterRules {
    type Error = anyhow::Error;

    /// Compiles all raw CLI patterns into the regex-backed rule engine.
    fn try_from(s: Filter) -> Result<Self> {
        Ok(FilterRules {
            only_files: s.only_files,
            only_dirs: s.only_dirs,
            all: [(s.all_in, "all-in"), (s.all_ex, "all-ex")].try_into()?,
            dir: [(s.dir_in, "dir-in"), (s.dir_ex, "dir-ex")].try_into()?,
            path: [(s.path_in, "path-in"), (s.path_ex, "path-ex")].try_into()?,
            file: [(s.file_in, "file-in"), (s.file_ex, "file-ex")].try_into()?,
            ext: [(s.ext_in, "ext-in"), (s.ext_ex, "ext-ex")].try_into()?,
        })
    }
}
// Compile an optional regular expression (case-insensitive).
// `param` is the CLI option name, used only to build a helpful error message.
fn compile(value: Option<String>, param: &str) -> Result<Option<Regex>> {
    let compiler = |r| {
        // prefix with the case-insensitive flag before compiling.
        Regex::new(&format!("(?i){r}"))
            .with_context(|| format!("compiling regex: {r:?}"))
            .map_err(|err| anyhow!("error: invalid --{param}: {err:?}"))
    };
    value.map(compiler).transpose()
}

use crate::commands::Refine;
use crate::entries::{Entry, TraversalMode};
use crate::medias::{FileOps, Naming};
use crate::utils;
use crate::{impl_new_name, impl_new_name_mut, impl_source_entry};
use anyhow::Result;
use clap::{Args, ValueEnum};
use std::cmp::Reverse;
use std::fmt::{Display, Write};

#[derive(Debug, Args)]
pub struct Rename {
    #[command(flatten)]
    naming: Naming,
    /// How to resolve clashes.
    #[arg(short = 'c', long, default_value_t = Clashes::Sequence, value_name = "STR", value_enum)]
    clashes: Clashes,
    /// Skip the confirmation prompt, useful for automation.
    #[arg(short = 'y', long)]
    yes: bool,
}

/// Strategies for resolving filename clashes within a directory.
#[derive(Debug, Clone, Copy, ValueEnum)]
pub enum Clashes {
    #[value(aliases = ["s", "seq"])]
    Sequence,
    #[value(aliases = ["i", "ig"])]
    Ignore,
    #[value(aliases = ["f", "ff"])]
    Forbid,
}

#[derive(Debug)]
pub struct Media {
    /// The original path to the file.
    entry: Entry,
    /// The new generated filename.
    new_name: String,
    /// A cached version of the file extension.
    ext: &'static str,
    /// Marks resolution of clashes.
    resolution: &'static str,
}
42 | resolution: &'static str, 43 | } 44 | 45 | impl Refine for Rename { 46 | type Media = Media; 47 | const OPENING_LINE: &'static str = "Rename files"; 48 | const T_MODE: TraversalMode = TraversalMode::DirsAndContent; 49 | 50 | fn refine(&self, mut medias: Vec) -> Result<()> { 51 | let total_files = medias.len(); 52 | 53 | // step: apply naming rules. 54 | let mut blocked = self.naming.compile()?.apply(&mut medias); 55 | 56 | // step: re-include extension in the names. 57 | medias 58 | .iter_mut() 59 | .filter(|m| !m.ext.is_empty()) 60 | .try_for_each(|m| write!(m.new_name, ".{}", m.ext))?; 61 | 62 | // step: clashes resolution. 63 | let mut clashes = 0; 64 | medias.sort_unstable_by(|m, n| { 65 | (m.entry.parent(), &m.new_name).cmp(&(n.entry.parent(), &n.new_name)) 66 | }); 67 | medias 68 | .chunk_by_mut(|m, n| m.entry.parent() == n.entry.parent()) // only by parent. 69 | .filter(|_| utils::is_running()) 70 | .filter(|g| { 71 | g.chunk_by(|m, n| m.new_name == n.new_name) 72 | .any(|g| g.len() > 1) // this should be way faster than using a hashmap as before. 
73 | }) 74 | .for_each(|g| { 75 | eprintln!("warning: names clash in: {}", g[0].entry.parent().unwrap()); 76 | g.chunk_by(|m, n| m.new_name == n.new_name) 77 | .filter(|g| g.len() > 1) 78 | .for_each(|g| { 79 | let k = &g[0].new_name; 80 | let list = g 81 | .iter() 82 | .map(|m| m.entry.file_name()) 83 | .filter(|f| f != k) 84 | .collect::>(); 85 | clashes += list.len(); 86 | use yansi::Paint; 87 | let msg = match g.len() != list.len() { 88 | true => " name already exists", 89 | false => " multiple names clash", 90 | }; 91 | eprintln!( 92 | " > {} --> {k}{}", 93 | list.join(", "), 94 | msg.paint(yansi::Color::BrightMagenta) 95 | ); 96 | }); 97 | match self.clashes { 98 | Clashes::Forbid => { 99 | let count = g.iter().filter(|m| m.is_changed()).count(); 100 | blocked += count; 101 | eprintln!(" ...blocked {count} changes in this folder"); 102 | g.iter_mut().for_each(|m| m.new_name.clear()); 103 | } 104 | Clashes::Ignore => g 105 | .chunk_by_mut(|m, n| m.new_name == n.new_name) 106 | .filter(|g| g.len() > 1) 107 | .for_each(|g| g.iter_mut().for_each(|m| m.new_name.clear())), 108 | Clashes::Sequence => { 109 | g.chunk_by_mut(|m, n| m.new_name == n.new_name) 110 | .filter(|g| g.len() > 1) 111 | .for_each(|g| { 112 | g.iter_mut().filter(|m| m.is_changed()).zip(1..).for_each( 113 | |(m, i)| { 114 | m.new_name.truncate(m.new_name.len() - m.ext.len() - 1); 115 | write!(m.new_name, "-{i}.{}", m.ext).unwrap(); 116 | m.resolution = " (added sequence number)"; 117 | }, 118 | ) 119 | }) 120 | } 121 | } 122 | }); 123 | 124 | utils::aborted()?; 125 | 126 | // step: settle changes. 127 | medias.retain(|m| !m.new_name.is_empty() && m.is_changed()); 128 | 129 | // step: display the results by parent directory. 130 | medias.sort_unstable_by(|m, n| { 131 | // requires a post-order like traversal to avoid move errors. 132 | // but since I couldn't find a way to do that, I just reverse the order. 133 | // that way, the deepest directories are processed first, before their parents. 
134 | (Reverse(m.entry.parent()), &m.entry).cmp(&(Reverse(n.entry.parent()), &n.entry)) 135 | }); 136 | medias 137 | .chunk_by(|m, n| m.entry.parent() == n.entry.parent()) 138 | .for_each(|g| { 139 | println!("{}", g[0].entry.parent().unwrap()); 140 | use yansi::Paint; 141 | g.iter().for_each(|m| { 142 | println!( 143 | " {} --> {}{}", 144 | m.entry.display_filename(), 145 | m.new_name, 146 | m.resolution.paint(yansi::Color::BrightBlue) 147 | ) 148 | }); 149 | }); 150 | 151 | // step: display a summary receipt. 152 | if !medias.is_empty() || blocked > 0 { 153 | println!(); 154 | } 155 | println!("total files: {total_files}"); 156 | println!(" changes: {}", medias.len()); 157 | println!(" clashes: {clashes} ({})", self.clashes); 158 | println!(" blocked: {blocked}"); 159 | if medias.is_empty() { 160 | return Ok(()); 161 | } 162 | 163 | // step: apply changes if the user agrees. 164 | if !self.yes { 165 | utils::prompt_yes_no("apply changes?")?; 166 | } 167 | FileOps::rename_move(&mut medias); 168 | 169 | match medias.is_empty() { 170 | true => println!("done"), 171 | false => println!("found {} errors", medias.len()), 172 | } 173 | Ok(()) 174 | } 175 | } 176 | 177 | impl Display for Clashes { 178 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 179 | match self { 180 | Clashes::Sequence => write!(f, "resolved by adding a sequence number"), 181 | Clashes::Ignore => write!(f, "ignored, folders processed as usual"), 182 | Clashes::Forbid => write!(f, "whole folders with clashes blocked"), 183 | } 184 | } 185 | } 186 | 187 | impl_source_entry!(Media); 188 | impl_new_name!(Media); 189 | impl_new_name_mut!(Media); 190 | 191 | impl Media { 192 | fn is_changed(&self) -> bool { 193 | self.new_name != self.entry.file_name() 194 | } 195 | } 196 | 197 | impl TryFrom for Media { 198 | type Error = (Entry, anyhow::Error); 199 | 200 | fn try_from(entry: Entry) -> Result { 201 | let (stem, ext) = entry.filename_parts(); 202 | Ok(Media { 203 | new_name: 
use crate::commands::Refine;
use crate::entries::{Entry, InputInfo, TraversalMode};
use crate::utils::{self, display_abort};
use Verdict::*;
use anyhow::{Context, Result, anyhow};
use clap::{Args, ValueEnum};
use regex::Regex;
use std::fmt::Display;
use std::io::{Write, stdout};
use std::time::Duration;
use ureq::Agent;
use ureq::http::StatusCode;

#[derive(Debug, Args)]
pub struct Probe {
    /// Pick a subset of the files to probe.
    #[arg(short = 'p', long, value_name = "REGEX")]
    pick: Option<String>,
    /// The URL to probe filenames against (use `$` as placeholder, e.g. https://example.com/$/).
    #[arg(short = 'u', long)]
    url: String,
    /// The HTTP connection and read timeouts in milliseconds.
    #[arg(short = 't', long, default_value_t = 2000, value_name = "INT")]
    timeout: u64,
    /// The initial time to wait between retries in milliseconds.
    #[arg(short = 'n', long, default_value_t = 1000, value_name = "INT")]
    min_wait: u64,
    /// The factor by which to increase the time to wait between retries.
    #[arg(short = 'b', long, default_value_t = 1.5, value_name = "FLOAT")]
    backoff: f64,
    /// The maximum time to wait between retries in milliseconds.
    #[arg(short = 'a', long, default_value_t = 5000, value_name = "INT")]
    max_wait: u64,
    /// The maximum number of retries; use 0 to disable and -1 to retry indefinitely.
    #[arg(short = 'r', long, default_value_t = -1, value_name = "INT")]
    retries: i32,
    /// Specify when to display errors.
    #[arg(short = 'e', long, default_value_t = Errors::Each10, value_name = "STR", value_enum)]
    errors: Errors,
    // /// The HTTP request method to use.
    // #[arg(short = 'm', long, default_value = "HEAD", value_name = "STR")]
    // method: Method,
    // /// The number of concurrent connections.
    // #[arg(short = 'c', long, default_value = "10", value_name = "INT")]
    // connections: u8,
    // /// The rate limit in requests per second.
    // #[arg(short = 'r', long, default_value = "10", value_name = "INT")]
    // rate: u16,
}

/// When to print full error messages while retrying a probe.
#[derive(Debug, Clone, Copy, PartialEq, ValueEnum)]
pub enum Errors {
    #[value(alias = "n")]
    Never,
    #[value(alias = "l")]
    Last,
    #[value(alias = "a")]
    Always,
    #[value(aliases = ["e", "10"])]
    Each10,
}

#[derive(Debug)]
pub struct Media {
    // the collection name extracted from the entry (sequence/comment stripped).
    name: String,
    // outcome of probing this name; starts as Pending.
    verdict: Verdict,
}

/// Result of probing one name against the URL.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
enum Verdict {
    Pending,
    Valid,
    Invalid,
    Failed,
}

impl Refine for Probe {
    type Media = Media;
    const OPENING_LINE: &'static str = "Probe collection names online";
    const T_MODE: TraversalMode = TraversalMode::Files;

    /// Adjusts incompatible option combinations before running.
    fn tweak(&mut self, _: &InputInfo) {
        // with infinite retries there is never a "last" attempt to report on.
        if self.retries < 0 && self.errors == Errors::Last {
            eprintln!("Displaying \"last\" error won't show anything with indefinite retries.\n");
            self.errors = Errors::Never;
        }
    }

    fn refine(&self, mut medias: Vec<Self::Media>) -> Result<()> {
        // make sure the URL contains a single `$` placeholder.
        if self.url.bytes().filter(|&b| b == b'$').count() != 1 {
            return Err(anyhow!("URL must contain a single `$` placeholder"))
                .with_context(|| format!("invalid URL: {:?}", self.url));
        }

        // make sure the URL is valid, but parsing it as a URI always succeeds.
        // it seems the only way to check it is by actually sending a request.
        ureq::head(&self.url)
            .config()
            .http_status_as_error(false)
            .build()
            .call()
            .with_context(|| format!("invalid URL: {:?}", self.url))?;

        // step: keep only unique file names (sequences were already removed).
        medias.sort_unstable_by(|m, n| m.name.cmp(&n.name));
        medias.dedup_by(|m, n| m.name == n.name);

        // step: pick a subset of the files to probe.
        match &self.pick {
            Some(s) => {
                let re = Regex::new(s).context("invalid regex")?;
                medias.retain(|m| re.is_match(&m.name));
                println!("probing names matching {s:?}: {}", medias.len());
            }
            None => println!("probing all names: {}", medias.len()),
        }

        let total_names = medias.len();

        // step: probe each file name.
        let client = Agent::config_builder()
            .timeout_global(Some(Duration::from_millis(self.timeout)))
            .http_status_as_error(false)
            .build()
            .into();
        for media in &mut medias {
            print!("  {}: ", media.name);
            stdout().flush()?;
            // an Err from probe_one means the run was aborted: stop probing,
            // leaving the remaining medias as Pending.
            media.verdict = match self.probe_one(&media.name, &client) {
                Ok(verdict) => verdict,
                Err(_) => break,
            };
        }

        // step: display the results.
        let valid = medias.iter().filter(|m| m.verdict == Valid).count();
        let failed = medias.iter().filter(|m| m.verdict == Failed).count();
        let pending = medias.iter().filter(|m| m.verdict == Pending).count();
        medias.retain(|m| m.verdict == Invalid);
        if !medias.is_empty() {
            println!("\ninvalid names:");
            medias.iter().for_each(|m| println!("  {}", m.name));
        }

        // step: display a summary receipt.
        println!("\ntotal names: {total_names}");
        println!("  valid  : {valid}");
        println!("  invalid: {}", medias.len());
        if failed > 0 {
            println!("  failed : {failed}");
        }
        if pending > 0 {
            println!("  pending: {pending}{}", display_abort(true));
        }

        Ok(())
    }
}

impl Probe {
    /// Probes a single name with retries and exponential backoff.
    ///
    /// OK/FORBIDDEN mean the name exists, NOT_FOUND means it doesn't; anything
    /// else is retried up to `self.retries` times (or forever when negative).
    /// `spaces` tracks the indentation state of the progress output: 4 after a
    /// full error line, 1 after a brief marker, 0 before any output.
    fn probe_one(&self, name: &str, client: &Agent) -> Result<Verdict> {
        let url = self.url.replace("$", name);
        let (mut wait, mut spaces, mut retry) = (self.min_wait, 0, 0);
        let verdict = loop {
            utils::aborted()?;
            // `full` is the complete error description, `brief` a one-char marker.
            let (full, brief): (&dyn Display, _) = match client.head(&url).call() {
                Ok(resp) => match resp.status() {
                    StatusCode::OK | StatusCode::FORBIDDEN => break Valid,
                    StatusCode::NOT_FOUND => break Invalid,
                    StatusCode::TOO_MANY_REQUESTS => (&"too many requests", "."),
                    _ => (&resp.status().to_string(), "x"),
                },
                Err(err) => (&format!("{err}"), "!"),
            };
            let show = match self.errors {
                Errors::Never => false,
                Errors::Last => retry == self.retries,
                Errors::Always => true,
                Errors::Each10 => (retry + 1) % 10 == 0,
            };
            if show {
                if spaces != 4 {
                    println!();
                    spaces = 4;
                }
                println!("    - {full}");
            } else {
                if spaces == 4 {
                    print!("    ");
                }
                print!("{brief}");
                stdout().flush()?;
                spaces = 1;
            }
            retry += 1;
            if self.retries >= 0 && retry > self.retries {
                break Failed;
            }
            std::thread::sleep(Duration::from_millis(wait));
            // exponential backoff, capped at max_wait.
            wait = ((wait as f64 * self.backoff) as u64).min(self.max_wait);
        };
        utils::aborted()?; // avoid printing a verdict in the wrong place if aborted.
        println!("{}{verdict:?}", " ".repeat(spaces));
        Ok(verdict)
    }
}
impl TryFrom<Entry> for Media {
    type Error = (Entry, anyhow::Error);

    /// Keeps only the collection name; sequence/comment parts are discarded.
    fn try_from(entry: Entry) -> Result<Self, Self::Error> {
        let (name, _, _, _, _) = entry.collection_parts();
        Ok(Media {
            name: name.to_owned(),
            verdict: Pending,
        })
    }
}

use crate::commands::Refine;
use crate::entries::{Entry, InputInfo, TraversalMode};
use crate::medias::{FileOps, Naming};
use crate::utils::{self, PromptError};
use crate::{impl_new_name, impl_new_name_mut, impl_source_entry};
use anyhow::Result;
use clap::Args;
use clap::builder::NonEmptyStringValueParser;
use regex::Regex;
use std::borrow::Cow;
use std::fs;
use std::sync::{LazyLock, OnceLock};
use std::time::SystemTime;

#[derive(Debug, Args)]
pub struct Rebuild {
    #[command(flatten)]
    naming: Naming,
    /// Disable smart matching, so "foo bar.mp4", "FooBar.mp4" and "foo__bar.mp4" are different.
    #[arg(short = 's', long)]
    simple: bool,
    /// Force to overwrite filenames (use the Global options to filter files).
    #[arg(short = 'f', long, value_name = "STR", conflicts_with_all = ["strip_before", "strip_after", "strip_exact", "replace", "throw", "simple", "partial"], value_parser = NonEmptyStringValueParser::new())]
    force: Option<String>,
    /// Assume not all directories are available, which retains current sequences (but fixes gaps).
    #[arg(short = 'p', long)]
    partial: bool,
    /// Keep the original case of filenames, otherwise they are lowercased.
    #[arg(short = 'c', long)]
    case: bool,
    /// Skip the confirmation prompt, useful for automation.
    #[arg(short = 'y', long)]
    yes: bool,
}

#[derive(Debug)]
pub struct Media {
    /// The original path to the file.
    entry: Entry,
    /// The new generated filename.
    new_name: String,
    /// The resulting smart match (if enabled and new_name has spaces or _).
    group_name: Option<String>,
    /// The sequence number, which will be kept in partial mode and disambiguate `created` in all modes.
    seq: Option<usize>,
    /// A comment for the file.
    comment: String,
    /// A cached version of the file extension.
    ext: &'static str,
    /// The creation time of the file.
    created: SystemTime,
}

// case normalizer chosen once in tweak(): lowercase by default, identity with --case.
static CASE_FN: OnceLock<fn(&str) -> String> = OnceLock::new();

impl Refine for Rebuild {
    type Media = Media;
    const OPENING_LINE: &'static str = "Rebuild collection filenames";
    const T_MODE: TraversalMode = TraversalMode::Files;

    /// Installs the case normalizer and auto-enables partial mode when some
    /// input directories are missing.
    fn tweak(&mut self, info: &InputInfo) {
        let f = match self.case {
            false => str::to_lowercase,
            true => str::to_owned,
        };
        CASE_FN.set(f).unwrap();

        if info.has_invalid && !self.partial && self.force.is_none() {
            self.partial = true;
            eprintln!("Enabling partial mode due to missing directories.\n");
        }
    }

    fn refine(&self, mut medias: Vec<Self::Media>) -> Result<()> {
        let total_files = medias.len();

        // detect if migration is needed.
        // old-style names look like "name-9"; new style is "name~9[comment]".
        static RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\w+)-(\d+)$").unwrap());
        if medias
            .iter()
            .any(|m| m.seq.is_none() && RE.is_match(m.entry.filename_parts().0))
        {
            eprintln!("warning: detected old-style filenames.");
            match utils::prompt_yes_no(r#"migrate to new style "name~9"?"#) {
                Ok(()) => {
                    medias.iter_mut().for_each(|m| {
                        if let Some(caps) = RE.captures(m.entry.filename_parts().0) {
                            m.new_name.truncate(caps[1].len()); // truncate to the actual name length.
                            m.seq = caps[2].parse().ok(); // find the actual sequence number.
                        }
                    });
                }
                Err(PromptError::No) => {
                    eprintln!("filenames might be inconsistent.");
                }
                Err(err) => {
                    return Err(err.into());
                }
            }
        }

        // step: apply naming rules.
        let blocked = self.naming.compile()?.apply(&mut medias);

        // step: reset names if forcing a new one.
        if let Some(force) = &self.force {
            medias.iter_mut().for_each(|m| {
                m.new_name.clone_from(force);
            });
        }

        // step: prepare smart matching groups.
        if !self.simple {
            static RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[\s_]+").unwrap());

            medias.iter_mut().for_each(|m| {
                if let Cow::Owned(x) = RE.replace_all(&m.new_name, "") {
                    m.group_name = Some(x);
                }
            });
        }

        // step: sort medias according to partial or full mode.
        let seq = match self.partial {
            true => |m: &Media| m.seq.unwrap_or(usize::MAX), // no sequence goes to the end in partial mode.
            false => |_: &Media| 0, // ignore sequences in full mode.
        };
        medias.sort_unstable_by(|m, n| {
            // unfortunately, some file systems have low-resolution creation time, HFS+ for example,
            // so m.seq is used to disambiguate `created`, which seems to repeat a lot sometimes.
            (m.group(), seq(m), m.created, m.seq).cmp(&(n.group(), seq(n), n.created, n.seq))
        });

        // step: generate new names.
        let name_idx = if self.simple {
            |_g: &[Media]| 0 // all the names are exactly the same within a group.
        } else if self.case {
            // smart matching which chooses the name with the most uppercase characters.
139 | |g: &[Media]| { 140 | g.iter() 141 | .enumerate() 142 | .max_by_key(|&(_, m)| m.new_name.chars().filter(|c| c.is_uppercase()).count()) 143 | .unwrap() 144 | .0 145 | } 146 | } else { 147 | // smart matching which chooses the longest name, i.e., the one with the most space and _ characters. 148 | |g: &[Media]| { 149 | g.iter() 150 | .enumerate() 151 | .max_by_key(|&(_, m)| m.new_name.len()) // find the longer one. 152 | .unwrap() 153 | .0 154 | } 155 | }; 156 | let seq_gen = match self.partial { 157 | true => |m: &Media, last_seq: usize| m.seq.unwrap_or_else(|| last_seq + 1), 158 | false => |_: &Media, last_seq: usize| last_seq + 1, 159 | }; 160 | let mut unique_names = 0; 161 | medias 162 | .chunk_by_mut(|m, n| m.group() == n.group()) 163 | .for_each(|g| { 164 | unique_names += 1; 165 | let base = std::mem::take(&mut g[name_idx(g)].new_name); // must be taken because `g` will be modified below. 166 | let mut seq = 0; // keep track of the last sequence number used. 167 | g.iter_mut().for_each(|m| { 168 | seq = seq_gen(m, seq); 169 | let dot = if m.ext.is_empty() { "" } else { "." }; 170 | m.new_name = format!("{base}~{seq}{}{dot}{}", m.comment, m.ext); 171 | }); 172 | }); 173 | 174 | utils::aborted()?; 175 | 176 | // step: settle changes, and display the results. 177 | medias.retain(|m| m.new_name != m.entry.file_name()); 178 | medias 179 | .iter() 180 | .for_each(|m| println!("{} --> {}", m.entry, m.new_name)); 181 | 182 | // step: display a summary receipt. 183 | if !medias.is_empty() || blocked > 0 { 184 | println!(); 185 | } 186 | println!("total files: {total_files} ({unique_names} unique names)"); 187 | println!(" changes: {}", medias.len()); 188 | println!(" blocked: {blocked}"); 189 | if medias.is_empty() { 190 | return Ok(()); 191 | } 192 | 193 | // step: apply changes if the user agrees. 
194 | if !self.yes { 195 | utils::prompt_yes_no("apply changes?")?; 196 | } 197 | FileOps::rename_move(&mut medias); 198 | if medias.is_empty() { 199 | println!("done"); 200 | return Ok(()); 201 | } 202 | 203 | // step: fix file already exists errors. 204 | println!("attempting to fix {} errors", medias.len()); 205 | medias.iter_mut().for_each(|m| { 206 | let temp = format!("__refine+{}__", m.new_name); 207 | let dest = m.entry.with_file_name(&temp); 208 | match fs::rename(&m.entry, &dest) { 209 | Ok(()) => m.entry = dest, 210 | Err(err) => eprintln!("error: {err}: {} --> {temp:?}", m.entry), 211 | } 212 | }); 213 | FileOps::rename_move(&mut medias); 214 | 215 | match medias.is_empty() { 216 | true => println!("done"), 217 | false => println!("still {} errors, giving up", medias.len()), 218 | } 219 | Ok(()) 220 | } 221 | } 222 | 223 | impl_source_entry!(Media); 224 | impl_new_name!(Media); 225 | impl_new_name_mut!(Media); 226 | 227 | impl Media { 228 | /// The group name will either be the smart match or the new name. 
229 | fn group(&self) -> &str { 230 | self.group_name.as_deref().unwrap_or(&self.new_name) 231 | } 232 | } 233 | 234 | impl TryFrom for Media { 235 | type Error = (Entry, anyhow::Error); 236 | 237 | fn try_from(entry: Entry) -> Result { 238 | let (name, _, seq, comment, ext) = entry.collection_parts(); 239 | let created = entry.metadata().map_or(None, |m| m.created().ok()); 240 | Ok(Media { 241 | new_name: CASE_FN.get().unwrap()(name.trim()), 242 | group_name: None, 243 | seq, 244 | comment: comment.to_string(), 245 | ext: utils::intern(ext), 246 | created: created.unwrap_or(SystemTime::now()), 247 | entry, 248 | }) 249 | } 250 | } 251 | -------------------------------------------------------------------------------- /src/commands/join.rs: -------------------------------------------------------------------------------- 1 | use crate::commands::Refine; 2 | use crate::entries::{Entry, Fetcher, ROOT, Recurse, TraversalMode}; 3 | use crate::impl_source_entry; 4 | use crate::medias::{FileOps, NewEntry, SourceEntry}; 5 | use crate::utils; 6 | use anyhow::{Context, Result, anyhow}; 7 | use clap::{Args, ValueEnum}; 8 | use std::collections::HashSet; 9 | use std::fmt::Display; 10 | use std::fs; 11 | use std::path::PathBuf; 12 | use std::sync::OnceLock; 13 | 14 | #[derive(Debug, Args)] 15 | pub struct Join { 16 | /// The target directory; will be created if it doesn't exist. 17 | #[arg(short = 't', long, default_value = ".", value_name = "PATH")] 18 | target: PathBuf, 19 | /// The type of join to perform. 20 | #[arg(short = 'b', long, default_value_t = By::Move, value_name = "STR", value_enum)] 21 | by: By, 22 | /// How to resolve clashes. 23 | #[arg(short = 'c', long, default_value_t = Clashes::NameSequence, value_name = "STR", value_enum)] 24 | clashes: Clashes, 25 | /// Force joining already in place files and directories, i.e. in subdirectories of the target. 
26 | #[arg(short = 'f', long)] 27 | force: bool, 28 | /// Do not remove empty parent directories after joining files. 29 | #[arg(short = 'p', long)] 30 | parents: bool, 31 | /// Skip the confirmation prompt, useful for automation. 32 | #[arg(short = 'y', long)] 33 | yes: bool, 34 | } 35 | 36 | #[derive(Debug, Clone, Copy, ValueEnum)] 37 | pub enum By { 38 | #[value(aliases = ["m", "mv"])] 39 | Move, 40 | #[value(aliases = ["c", "cp"])] 41 | Copy, 42 | } 43 | 44 | #[derive(Debug, Clone, Copy, ValueEnum)] 45 | pub enum Clashes { 46 | #[value(aliases = ["s", "sq", "seq", "ns"])] 47 | NameSequence, 48 | #[value(aliases = ["pn"])] 49 | ParentName, 50 | #[value(aliases = ["np"])] 51 | NameParent, 52 | #[value(aliases = ["i", "ig"])] 53 | Ignore, 54 | } 55 | 56 | #[derive(Debug)] 57 | pub struct Media { 58 | entry: Entry, 59 | new_name: Option, 60 | skip: Skip, 61 | } 62 | 63 | #[derive(Debug, Clone, Copy)] 64 | enum Skip { 65 | Yes, 66 | No, 67 | Target, 68 | } 69 | 70 | #[derive(Debug)] 71 | struct Shared { 72 | target: Entry, 73 | force: bool, 74 | } 75 | 76 | static SHARED: OnceLock = OnceLock::new(); 77 | 78 | impl Refine for Join { 79 | type Media = Media; 80 | const OPENING_LINE: &'static str = "Join files"; 81 | const T_MODE: TraversalMode = TraversalMode::DirsStop; 82 | 83 | fn refine(&self, mut medias: Vec) -> Result<()> { 84 | if self.target.is_file() { 85 | return Err(anyhow!("invalid target: must be a directory or not exist")); 86 | } // target is either a directory or doesn't exist. 87 | let target = Entry::try_new(&self.target, true)?.resolve()?; 88 | 89 | let shared = Shared { 90 | target: target.clone(), 91 | force: self.force, 92 | }; 93 | SHARED.set(shared).unwrap(); 94 | let total = medias.len(); 95 | 96 | // step: read the target directory, which might not be empty, to detect outer clashes (not in medias). 
97 | let mut target_names = Vec::new(); 98 | if target.exists() { 99 | // if target happens to be inside any input path and is not empty, this will dup the files. 100 | let fetcher = Fetcher::single(&target, Recurse::Shallow); 101 | let in_target = fetcher.fetch(Join::T_MODE).collect::>(); 102 | target_names.extend(in_target.iter().map(|e| e.file_name().to_string())); 103 | medias.extend(in_target.into_iter().map(|entry| Media { 104 | entry, 105 | new_name: None, 106 | skip: Skip::Target, 107 | })); 108 | } 109 | 110 | // step: detect clashes (files with the same name in different directories), and resolve them. 111 | medias.sort_unstable_by(|m, n| { 112 | // put files already in place first. 113 | (m.entry.file_name(), !m.is_in_place()).cmp(&(n.entry.file_name(), !n.is_in_place())) 114 | }); 115 | medias.dedup_by(|m, n| m.entry.to_str() == n.entry.to_str()); // remove target dup files. 116 | let mut clashes = 0; 117 | medias 118 | .chunk_by_mut(|m, n| m.entry.file_name() == n.entry.file_name()) 119 | .filter(|g| g.len() > 1) 120 | .for_each(|g| { 121 | clashes += g.len() - 1; // one is (or will be) in target, the others are clashes. 122 | let (stem, ext) = g[0].entry.filename_parts(); 123 | let (stem, ext) = (stem.to_owned(), ext.to_owned()); // g must not be borrowed. 124 | let dot = if ext.is_empty() { "" } else { "." 
}; 125 | match self.clashes { 126 | Clashes::NameSequence => { 127 | let mut seq = 2..; 128 | g.iter_mut().skip(1).for_each(|m| { 129 | let new_name = (&mut seq) 130 | .map(|i| format!("{stem}-{i}{dot}{ext}")) 131 | .find(|s| target_names.iter().all(|t| s != t)) 132 | .unwrap(); 133 | m.new_name = Some(new_name); 134 | }); 135 | } 136 | Clashes::ParentName | Clashes::NameParent => g.iter_mut().for_each(|m| { 137 | let par = m.entry.parent().unwrap_or(ROOT.clone()); 138 | let par = par.file_name(); 139 | if let Clashes::ParentName = self.clashes { 140 | m.new_name = Some(format!("{par}-{stem}{dot}{ext}")); 141 | } else { 142 | m.new_name = Some(format!("{stem}-{par}{dot}{ext}")); 143 | } 144 | }), 145 | Clashes::Ignore => g.iter_mut().for_each(|m| m.skip = Skip::Yes), 146 | } 147 | }); 148 | 149 | // step: settle results by removing the files that are in place or skipped. 150 | medias.sort_unstable_by(|m, n| m.entry.cmp(&n.entry)); 151 | let mut in_place = 0; 152 | medias.retain(|m| match (m.skip, m.is_in_place()) { 153 | (Skip::No, false) => true, 154 | (Skip::No, true) => { 155 | in_place += 1; 156 | println!("already in place: {}", m.entry); 157 | false 158 | } 159 | (Skip::Yes, _) => { 160 | println!("clash skipped: {}", m.entry); 161 | false 162 | } 163 | (Skip::Target, _) => false, 164 | }); 165 | 166 | // step: display the results. 167 | medias.iter().for_each(|m| match &m.new_name { 168 | Some(name) => println!("{} -> {name}", m.entry), 169 | None => println!("{}", m.entry), 170 | }); 171 | 172 | // step: display summary receipt. 173 | if !medias.is_empty() || in_place > 0 || clashes > 0 { 174 | println!(); 175 | } 176 | println!("total entries: {total}"); 177 | let resolved: &dyn Display = if clashes > 0 { &self.clashes } else { &"" }; 178 | println!(" clashes: {clashes}{resolved}"); 179 | println!(" in place: {in_place}"); 180 | println!("\njoin [by {:?}] to: {target}", self.by); 181 | 182 | // step: ask for confirmation. 
183 | if medias.is_empty() { 184 | println!("nothing to do"); 185 | return Ok(()); 186 | } 187 | if !self.yes { 188 | utils::prompt_yes_no("apply changes?")?; 189 | } 190 | 191 | // step: grab the files' parent directories before the consuming operations. 192 | let dirs = match self.parents { 193 | true => HashSet::new(), 194 | false => medias 195 | .iter() 196 | .map(|m| m.entry.parent().unwrap()) 197 | .collect::>(), 198 | }; 199 | 200 | // step: apply changes if the user agrees. 201 | fs::create_dir_all(&target).with_context(|| format!("creating {target:?}"))?; 202 | match self.by { 203 | By::Move => FileOps::rename_move(&mut medias), 204 | By::Copy => FileOps::copy(&mut medias), 205 | }; 206 | 207 | // step: recover from CrossDevice errors. 208 | if !medias.is_empty() 209 | && let By::Move = self.by 210 | { 211 | println!("attempting to fix {} errors", medias.len()); 212 | FileOps::cross_move(&mut medias); 213 | } 214 | 215 | // step: remove the empty parent directories. 216 | if !self.parents { 217 | let mut dirs = dirs.into_iter().collect::>(); 218 | dirs.sort_unstable_by(|m, n| m.cmp(n).reverse()); 219 | dirs.into_iter().for_each(|dir| { 220 | if let Ok(rd) = fs::read_dir(&dir) { 221 | const DS_STORE: &str = ".DS_Store"; 222 | if rd // .DS_Store might exist on macOS but should be removed if it is the only file in there. 223 | .map(|r| r.is_ok_and(|d| d.file_name() == DS_STORE).then_some(())) 224 | .collect::>>() 225 | .is_some_and(|v| !v.is_empty()) // an empty iterator is collected into Some([]). 
226 | { 227 | let dstore = dir.join(DS_STORE); 228 | if let Err(err) = fs::remove_file(&dstore) { 229 | eprintln!("error: {err}: {dstore:?}"); 230 | } 231 | } 232 | } 233 | if let Ok(()) = fs::remove_dir(&dir) { 234 | println!(" removed empty dir: {dir}") 235 | } 236 | }); 237 | } 238 | 239 | match (medias.is_empty(), self.by) { 240 | (true, _) => println!("done"), 241 | (false, By::Move) => println!("still {} errors, giving up", medias.len()), 242 | (false, By::Copy) => println!("found {} errors", medias.len()), 243 | } 244 | Ok(()) 245 | } 246 | } 247 | 248 | impl Display for Clashes { 249 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 250 | match self { 251 | Clashes::NameSequence => write!(f, " (resolved by name-sequence)"), 252 | Clashes::ParentName => write!(f, " (resolved by parent-name)"), 253 | Clashes::NameParent => write!(f, " (resolved by name-parent)"), 254 | Clashes::Ignore => write!(f, " (ignored)"), 255 | } 256 | } 257 | } 258 | 259 | impl Media { 260 | fn is_in_place(&self) -> bool { 261 | let shared = SHARED.get().unwrap(); 262 | 263 | let target = &shared.target; 264 | if shared.force { 265 | return self.entry.parent().unwrap() == *target; 266 | } 267 | 268 | match self.entry.is_dir() { 269 | true => self.entry.starts_with(target), 270 | false => self.entry.parent().unwrap().starts_with(target), 271 | } 272 | } 273 | } 274 | 275 | impl_source_entry!(Media); 276 | 277 | impl NewEntry for Media { 278 | fn new_entry(&self) -> Entry { 279 | let name = self.new_name.as_ref().map(|s| s.as_ref()); 280 | let path = &SHARED.get().unwrap().target; 281 | path.join(name.unwrap_or_else(|| self.src_entry().file_name())) 282 | } 283 | } 284 | 285 | impl TryFrom for Media { 286 | type Error = (Entry, anyhow::Error); 287 | 288 | fn try_from(entry: Entry) -> Result { 289 | Ok(Media { 290 | new_name: None, 291 | skip: Skip::No, 292 | entry, 293 | }) 294 | } 295 | } 296 | 
-------------------------------------------------------------------------------- /src/medias/naming.rs: -------------------------------------------------------------------------------- 1 | use super::{NewNameMut, SourceEntry}; 2 | use crate::utils; 3 | use anyhow::{Context, Result}; 4 | use clap::Args; 5 | use clap::builder::NonEmptyStringValueParser; 6 | use regex::Regex; 7 | use std::borrow::Cow; 8 | use std::sync::LazyLock; 9 | 10 | /// A set of rules that allows the user to customize filenames. 11 | #[derive(Debug, Args)] 12 | pub struct Naming { 13 | /// Strip from the start till occurrence; includes separators nearby, use {S} if needed. 14 | #[arg(short = 'b', long, value_name = "STR|REGEX", allow_hyphen_values = true, value_parser = NonEmptyStringValueParser::new())] 15 | strip_before: Vec, 16 | /// Strip from occurrence till the end; includes separators nearby, use {S} if needed. 17 | #[arg(short = 'a', long, value_name = "STR|REGEX", allow_hyphen_values = true, value_parser = NonEmptyStringValueParser::new())] 18 | strip_after: Vec, 19 | /// Strip exact occurrences; includes separators nearby, use {S} if needed. 20 | #[arg(short = 'e', long, value_name = "STR|REGEX", allow_hyphen_values = true, value_parser = NonEmptyStringValueParser::new())] 21 | strip_exact: Vec, 22 | /// Replace occurrences in the filename; separators are not touched, use {S} if needed. 23 | #[arg(short = 'r', long, value_name = "STR|REGEX=STR|$N", allow_hyphen_values = true, value_parser = utils::parse_key_value::)] 24 | replace: Vec<(String, String)>, 25 | /// recipe: Throw some prefix to the end; use {S} if needed. 26 | #[arg(short = 'w', long, value_name = "STR|REGEX=STR", allow_hyphen_values = true, value_parser = utils::parse_key_value::)] 27 | throw: Vec<(String, String)>, 28 | } 29 | 30 | impl Naming { 31 | /// Compile this set of rules. 
32 | pub fn compile(&self) -> Result { 33 | NamingRules::compile( 34 | [&self.strip_before, &self.strip_after, &self.strip_exact], 35 | &self.replace, 36 | &self.throw, 37 | ) 38 | } 39 | } 40 | 41 | #[derive(Debug)] 42 | pub struct NamingRules(Vec<(Regex, String)>); 43 | 44 | impl NamingRules { 45 | fn compile( 46 | strip_rules: [&[impl AsRef]; 3], 47 | replace_rules: &[(impl AsRef, impl AsRef)], 48 | throw_rules: &[(impl AsRef, impl AsRef)], 49 | ) -> Result { 50 | const O: &str = r"[(\[{]"; // enclosing opening. 51 | const C: &str = r"[)\]}]"; // enclosing closing. 52 | const SEP: &str = r"[-\s.,]"; 53 | let before = |rule| format!("^.*{rule}{C}*{SEP}*"); 54 | let after = |rule| format!("{SEP}*{O}*{rule}.*$"); 55 | let exact = |rule| { 56 | static RE: LazyLock = LazyLock::new(|| Regex::new(r"\w$").unwrap()); 57 | let b = if RE.is_match(rule) { r"\b" } else { r"\B" }; 58 | format!( 59 | r"^{O}*{rule}{C}*{SEP}+|{SEP}+{O}*{rule}{C}*$|{SEP}+{O}*{rule}{C}*{b}|{O}*{rule}{C}*" 60 | ) 61 | }; 62 | let replace_key = |rule: &str| rule.to_owned(); 63 | let throw_key = |rule| format!(r"^{rule}{SEP}+(.+)$"); 64 | let throw_value = |val| format!(r"$1 - {val}"); 65 | 66 | let rules = strip_rules 67 | .into_iter() 68 | .map(|g| { 69 | g.iter() 70 | .map(|r| (r.as_ref(), String::new())) 71 | .collect::>() 72 | }) 73 | .chain([replace_rules 74 | .iter() 75 | .map(|(k, v)| (k.as_ref(), v.as_ref().to_owned())) 76 | .collect()]) 77 | .chain([throw_rules 78 | .iter() 79 | .map(|(k, v)| (k.as_ref(), throw_value(v.as_ref()))) 80 | .collect()]) 81 | .zip([before, after, exact, replace_key, throw_key]) 82 | .flat_map(|(g, f)| g.into_iter().map(move |(k, v)| (k, v, f))) 83 | .map(|(rule, to, f)| { 84 | Regex::new(&format!("(?i){}", f(rule).replace("{S}", SEP))) // support {S} for separators. 
85 | .with_context(|| format!("compiling regex: {rule:?}")) 86 | .map(|re| (re, to)) 87 | }) 88 | .collect::>()?; 89 | Ok(NamingRules(rules)) 90 | } 91 | 92 | /// Apply these rules to a list of media, consuming the entries that got their names cleared. 93 | /// 94 | /// The [NewNameMut] is used as the starting point, and is mutated in place. 95 | /// It returns the number of entries that were cleared by the rules. 96 | pub fn apply(&self, medias: &mut Vec) -> usize { 97 | // this is just so that warnings are printed in a consistent order. 98 | medias.sort_unstable_by(|m, n| m.src_entry().cmp(n.src_entry())); 99 | 100 | // apply all rules in order. 101 | let total = medias.len(); 102 | medias.retain_mut(|m| { 103 | let mut name = std::mem::take(m.new_name_mut()); 104 | self.0.iter().for_each(|(re, to)| { 105 | if let Cow::Owned(x) = re.replace_all(&name, to) { 106 | name = x; 107 | } 108 | }); 109 | 110 | if name.is_empty() { 111 | eprintln!("blocked: rules cleared name: {}", m.src_entry()); 112 | return false; 113 | } 114 | *m.new_name_mut() = name; 115 | true 116 | }); 117 | total - medias.len() 118 | } 119 | } 120 | 121 | #[cfg(test)] 122 | mod tests { 123 | use super::*; 124 | use crate::entries::{Entry, ROOT}; 125 | 126 | const NO_STRIP: [&[&str]; 3] = [&[], &[], &[]]; 127 | const NO_REPLACE: &[(&str, &str)] = &[]; 128 | const NO_THROW: &[(&str, &str)] = &[]; 129 | 130 | /// A dummy type that expects it is always changed. 
131 | #[derive(Debug, PartialEq)] 132 | struct Media(String); 133 | impl NewNameMut for Media { 134 | fn new_name_mut(&mut self) -> &mut String { 135 | &mut self.0 136 | } 137 | } 138 | impl SourceEntry for Media { 139 | fn src_entry(&self) -> &Entry { 140 | &ROOT 141 | } 142 | } 143 | 144 | #[test] 145 | fn strip_rules() { 146 | #[track_caller] 147 | fn case(rule: &[&str], idx: usize, stem: &str, new_name: &str) { 148 | let mut strip_rules = [[].as_ref(); 3]; 149 | strip_rules[idx] = rule; 150 | let mut medias = vec![Media(stem.to_owned())]; 151 | let rules = NamingRules::compile(strip_rules, NO_REPLACE, NO_THROW).unwrap(); 152 | let warnings = rules.apply(&mut medias); 153 | assert_eq!(warnings, 0); 154 | assert_eq!(medias[0].0, new_name); 155 | } 156 | 157 | case(&["Before"], 0, "beforefoo", "foo"); 158 | case(&["Before"], 0, "Before__foo", "__foo"); 159 | case(&["Before"], 0, "before foo", "foo"); 160 | case(&["before"], 0, "Before - foo", "foo"); 161 | case(&["before"], 0, "before.foo", "foo"); 162 | case(&["before"], 0, "Before\t. foo", "foo"); 163 | 164 | case(&["After"], 1, "fooafter", "foo"); 165 | case(&["After"], 1, "foo__After", "foo__"); 166 | case(&["After"], 1, "foo after", "foo"); 167 | case(&["after"], 1, "foo - After", "foo"); 168 | case(&["after"], 1, "foo.after", "foo"); 169 | case(&["after"], 1, "foo\t. After", "foo"); 170 | 171 | // exact: {BOUND}+{rule}$ 172 | case(&["Exact"], 2, "foo__Exact", "foo__"); 173 | case(&["Exact"], 2, "foo exact", "foo"); 174 | case(&["exact"], 2, "foo - Exact", "foo"); 175 | case(&["exact"], 2, "foo.exact", "foo"); 176 | case(&["exact"], 2, "foo\t. Exact", "foo"); 177 | 178 | // exact: ^{rule}{BOUND}+ 179 | case(&["Exact"], 2, "Exact__foo", "__foo"); 180 | case(&["Exact"], 2, "exact foo", "foo"); 181 | case(&["exact"], 2, "Exact - foo", "foo"); 182 | case(&["exact"], 2, "exact.foo", "foo"); 183 | case(&["exact"], 2, "Exact\t. 
foo", "foo"); 184 | 185 | // exact: {BOUND}+{rule} 186 | case(&["Exact"], 2, "foo__Exactbar", "foo__bar"); 187 | case(&["Exact"], 2, "foo exact bar", "foo bar"); 188 | case(&["exact"], 2, "foo.exact.bar", "foo.bar"); 189 | case(&["exact"], 2, "foo\t. Exact - bar", "foo - bar"); 190 | 191 | // exact: new boundaries 192 | case(&["exact"], 2, "foo - Exactbar", "foo - bar"); 193 | case(&["Exact"], 2, "foo__Exact bar", "foo__ bar"); 194 | case(&["Exact"], 2, "fooExact bar", "foo bar"); 195 | case(&["(exact)"], 2, "foo - (Exact)bar", "foo - bar"); 196 | case(&["(Exact)"], 2, "foo__(Exact) bar", "foo__ bar"); 197 | case(&["Exact"], 2, "foo(Exact) bar", "foo bar"); 198 | 199 | // exact: {rule} 200 | case(&["Exact"], 2, "fexactoo", "foo"); 201 | case(&["Exact"], 2, "fexactoExacto", "foo"); 202 | case(&["exact"], 2, "Exactfoo bar", "foo bar"); 203 | } 204 | 205 | #[test] 206 | fn replace_rules() { 207 | #[track_caller] 208 | fn case(replace_rules: &[(&str, &str)], stem: &str, new_name: &str) { 209 | let mut medias = vec![Media(stem.to_owned())]; 210 | let rules = NamingRules::compile(NO_STRIP, replace_rules, NO_THROW).unwrap(); 211 | let warnings = rules.apply(&mut medias); 212 | assert_eq!(warnings, 0); 213 | assert_eq!(medias[0].0, new_name); 214 | } 215 | 216 | case(&[("-+", "-")], "foo---bar", "foo-bar"); 217 | case(&[(r"(\w+) +(\w+)", "$2 $1")], "foo bar", "bar foo"); 218 | case(&[(r"(.+)(S0\dE0\d)", "$2.$1")], "fooS03E05", "S03E05.foo"); 219 | } 220 | 221 | #[test] 222 | fn throw_rules() { 223 | #[track_caller] 224 | fn case(throw_rules: &[(&str, &str)], stem: &str, new_name: &str) { 225 | let mut medias = vec![Media(stem.to_owned())]; 226 | let rules = NamingRules::compile(NO_STRIP, NO_REPLACE, throw_rules).unwrap(); 227 | let warnings = rules.apply(&mut medias); 228 | assert_eq!(warnings, 0); 229 | assert_eq!(medias[0].0, new_name); 230 | } 231 | 232 | case( 233 | &[("God.?of.?War", "God of War")], 234 | "other things", 235 | "other things", 236 | ); 237 | case( 238 
| &[("God.?of.?War", "God of War")], 239 | "God of War media", 240 | "media - God of War", 241 | ); 242 | case( 243 | &[("God.?of.?War", "God of War")], 244 | "godofwar - media", 245 | "media - God of War", 246 | ); 247 | 248 | case( 249 | &[("God{S}of{S}War", "God of War")], 250 | "other things", 251 | "other things", 252 | ); 253 | case( 254 | &[("God{S}of{S}War", "God of War")], 255 | "God of War media", 256 | "media - God of War", 257 | ); 258 | case( 259 | &[("God{S}*of{S}*War", "God of War")], 260 | "godofwar media", 261 | "media - God of War", 262 | ); 263 | case( 264 | &[("God{S}of{S}War", "God of War")], 265 | "God-of-War media", 266 | "media - God of War", 267 | ); 268 | case( 269 | &[("God{S}*of{S}*War", "God of War")], 270 | "godofwar - media", 271 | "media - God of War", 272 | ); 273 | } 274 | 275 | #[test] 276 | fn cleared() { 277 | let mut medias = vec![ 278 | Media("file".to_owned()), 279 | Media("batch".to_owned()), 280 | Media("collection".to_owned()), 281 | Media("refine".to_owned()), 282 | Media("foobar".to_owned()), 283 | ]; 284 | let rules = 285 | NamingRules::compile([&["e"], &["b"], &["c.*i"]], &[("on", "")], NO_THROW).unwrap(); 286 | let warnings = rules.apply(&mut medias); 287 | assert_eq!(warnings, 4); 288 | assert_eq!(medias, vec![Media("foo".to_owned())]); 289 | } 290 | } 291 | -------------------------------------------------------------------------------- /src/entries/entry.rs: -------------------------------------------------------------------------------- 1 | use anyhow::{Result, anyhow}; 2 | use regex::Regex; 3 | use std::cmp::Ordering; 4 | use std::convert::Into; 5 | use std::env; 6 | use std::fmt::{self, Display}; 7 | use std::fs::Metadata; 8 | use std::hash::{Hash, Hasher}; 9 | use std::ops::Deref; 10 | use std::path::{Component, Path, PathBuf}; 11 | use std::sync::LazyLock; 12 | use yansi::{Paint, Style}; 13 | 14 | /// A file or directory entry that is guaranteed to have a valid UTF-8 representation. 
15 | #[derive(Debug, Clone, Eq)] // Hash, PartialEq, Ord, and PartialOrd are below. 16 | pub struct Entry { 17 | path: PathBuf, 18 | is_dir: bool, 19 | } 20 | 21 | /// Create a new entry from a path, checking that it has a valid UTF-8 representation. 22 | /// 23 | /// The path must exist. 24 | impl TryFrom for Entry { 25 | type Error = (PathBuf, anyhow::Error); 26 | 27 | fn try_from(path: PathBuf) -> Result { 28 | let path_err = |err: anyhow::Error| (path.clone(), err); 29 | let is_dir = path 30 | .metadata() 31 | .map_err(Into::into) 32 | .map_err(path_err)? 33 | .is_dir(); // verify that the path exists and is a directory. 34 | if is_dir { 35 | path.file_name() 36 | .unwrap_or_default() // the root dir has no name. 37 | .to_str() 38 | .ok_or_else(|| anyhow!("no UTF-8 dir name: {path:?}")) 39 | .map_err(path_err)?; 40 | } else { 41 | path.file_stem() 42 | .ok_or_else(|| anyhow!("no file stem: {path:?}")) 43 | .map_err(path_err)? 44 | .to_str() 45 | .ok_or_else(|| anyhow!("no UTF-8 file stem: {path:?}")) 46 | .map_err(path_err)?; 47 | path.extension() 48 | .unwrap_or_default() 49 | .to_str() 50 | .ok_or_else(|| anyhow!("no UTF-8 file extension: {path:?}")) 51 | .map_err(path_err)?; 52 | } 53 | // I could just check that the entire path is valid UTF-8, but I want to give better error messages. 54 | if let Some(pp) = path.parent() { 55 | // the root dir has no parent. 56 | pp.to_str() 57 | .ok_or_else(|| anyhow!("no UTF-8 parent: {pp:?}")) 58 | .map_err(path_err)?; 59 | } 60 | Ok(Entry { path, is_dir }) 61 | } 62 | } 63 | 64 | pub static ROOT: LazyLock = LazyLock::new(|| Entry::try_new("/", true).unwrap()); 65 | 66 | impl Entry { 67 | /// Create a new entry that, in case the path does not exist, will assume the given directory flag. 68 | /// If it does exist, check that it has the correct directory flag or panic. 
    pub fn try_new(path: impl Into<PathBuf>, is_dir: bool) -> Result<Self> {
        let path = path.into();
        // reject non-UTF-8 paths up front so all later `.to_str().unwrap()` calls are safe.
        if path.to_str().is_none() {
            return Err(anyhow!("invalid UTF-8 path: {path:?}"));
        }

        // panic if the entry exists and the directory flag doesn't match.
        // it should never happen in normal program logic, so if it does it's a bug.
        match path.try_exists() {
            Ok(true) => assert_eq!(path.is_dir(), is_dir, "is_dir error in {path:?}: {is_dir}"),
            Ok(false) => {} // the path was verified to not exist, cool.
            // NOTE(review): this warning goes to stdout; other warnings in the project use eprintln! — confirm intent.
            Err(err) => println!("warning: couldn't verify {path:?}: {err}"),
        }

        Ok(Entry { path, is_dir })
    }

    /// Create a new entry with the given name adjoined without checking UTF-8 again.
    pub fn join(&self, name: impl AsRef<str>) -> Entry {
        let path = self.path.join(name.as_ref());
        let is_dir = path.is_dir(); // touches the filesystem to refresh the cached flag.
        Entry { path, is_dir }
    }

    /// Create a new entry with the given name without checking UTF-8 again.
    pub fn with_file_name(&self, name: impl AsRef<str>) -> Entry {
        let path = self.path.with_file_name(name.as_ref());
        let is_dir = path.is_dir(); // touches the filesystem to refresh the cached flag.
        Entry { path, is_dir }
    }

    /// Get the stem and extension from files, or name from directories.
    /// Directories never report an extension, even if their name contains dots.
    pub fn filename_parts(&self) -> (&str, &str) {
        match self.is_dir {
            true => (self.file_name(), ""),
            false => (
                // safe unwraps: UTF-8 was validated in try_new, and files always have a stem.
                self.path.file_stem().unwrap().to_str().unwrap(),
                self.path.extension().unwrap_or_default().to_str().unwrap(),
            ),
        }
    }

    /// Get the canonical name, source alias, sequence, comment, and extension from collections.
    pub fn collection_parts(&self) -> (&str, Option<&str>, Option<usize>, &str, &str) {
        // regex: name~24 or name+alias~24.
        static RE: LazyLock<Regex> =
            LazyLock::new(|| Regex::new(r"^(\w+)(?:\+(\w+))?~(\d+)(.*)$").unwrap());

        let (stem, ext) = self.filename_parts();
        let Some(caps) = RE.captures(stem) else {
            // not a collection name: the whole stem is the canonical name.
            return (stem, None, None, "", ext);
        };
        let canonical = caps.get(1).unwrap().as_str(); // regex guarantees name is present.
        let alias = caps.get(2).map(|m| m.as_str());
        let seq = caps.get(3).and_then(|m| m.as_str().parse().ok()); // None also on numeric overflow.
        let comment = caps.get(4).map_or("", |m| m.as_str());
        (canonical, alias, seq, comment, ext)
    }

    /// Return a cached directory flag, which does not touch the filesystem again.
    pub fn is_dir(&self) -> bool {
        self.is_dir
    }

    /// Get the filename from entries directly as a &str.
    /// Returns "" for paths with no file name component (e.g. "/" or "..").
    pub fn file_name(&self) -> &str {
        self.path
            .file_name()
            .map(|n| n.to_str().unwrap()) // safe: UTF-8 validated in try_new.
            .unwrap_or_default()
    }

    /// Get the full path as a &str; safe because UTF-8 was validated on construction.
    pub fn to_str(&self) -> &str {
        self.path.to_str().unwrap()
    }

    /// Get the parent directory as an entry, without checking UTF-8 again.
    pub fn parent(&self) -> Option<Entry> {
        self.path.parent().map(|p| Entry {
            path: p.to_owned(),
            is_dir: true, // a parent is by definition a directory.
        })
    }

    /// Fetch filesystem metadata for this entry (follows symlinks).
    pub fn metadata(&self) -> Result<Metadata> {
        self.path.metadata().map_err(Into::into)
    }

    /// Display the full path, styled (yellow dirs / cyan files).
    pub fn display_path(&self) -> impl Display {
        DisplayPath(self)
    }

    /// Display only the file name, styled (yellow dirs / cyan files).
    pub fn display_filename(&self) -> impl Display {
        DisplayFilename(self)
    }

    /// Resolve `~`, `.`, `..`, and relative prefixes into an absolute-ish path,
    /// normalizing away embedded `..` components without touching the filesystem.
    pub fn resolve(&self) -> Result<Entry> {
        let mut it = self.path.components();
        // NOTE(review): `unwrap` assumes the path is non-empty — confirm callers never build an empty Entry.
        let mut res = match it.next().unwrap() {
            Component::Normal(x) if x == "~" => {
                dirs::home_dir().ok_or_else(|| anyhow!("no home dir"))?
            }
            Component::Normal(x) => {
                // bare relative name: anchor it at the current directory.
                let mut dir = env::current_dir()?;
                dir.push(x);
                dir
            }
            Component::CurDir => env::current_dir()?,
            Component::ParentDir => {
                let mut dir = env::current_dir()?;
                dir.pop();
                dir
            }
            x => PathBuf::from(x.as_os_str()), // absolute root or windows prefix.
        };
        for comp in it {
            match comp {
                Component::RootDir => res.push(comp), // windows might have returned Prefix above, so RootDir comes here.
                Component::Normal(_) => res.push(comp),
                Component::ParentDir => {
                    // `..` past the root is an error, not silently ignored.
                    if !res.pop() {
                        return Err(anyhow!("invalid path: {self}"));
                    }
                }
                _ => unreachable!(), // CurDir is only ever the first component after components() normalization.
            }
        }
        Entry::try_new(res, self.is_dir) // the paths prepended above are NOT guaranteed to be valid UTF-8.
    }
}

/// A [Display] implementation for [Entry] that print its full path.
#[derive(Debug)]
pub struct DisplayPath<'a>(&'a Entry);

/// A [Display] implementation for [Entry] that print only its file name.
#[derive(Debug)]
pub struct DisplayFilename<'a>(&'a Entry);

// (parent style, name style) pairs: directories are yellow, files are cyan,
// with the final name/symbol rendered bold in both cases.
const DIR_STYLE: (Style, Style) = {
    let parent_dir = Style::new().yellow();
    (parent_dir, parent_dir.bold())
};
const FILE_STYLE: (Style, Style) = {
    let parent_file = Style::new().cyan();
    (parent_file, parent_file.bold())
};

impl Display for DisplayPath<'_> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let entry = self.0;
        // split into parent prefix, file name, and trailing "/" marker for dirs.
        let (parent, name, symbol) = display_parts(entry);
        let (p_style, n_style) = if entry.is_dir { DIR_STYLE } else { FILE_STYLE };
        write!(
            f,
            "{}{}{}",
            parent.paint(p_style),
            name.paint(n_style),
            symbol.paint(n_style)
        )
    }
}

impl Display for DisplayFilename<'_> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let entry = self.0;
        // same parts as DisplayPath, but the parent prefix is discarded.
        let (_, name, symbol) = display_parts(entry);
        let (_, style) = if entry.is_dir { DIR_STYLE } else { FILE_STYLE };
        write!(f, "{}{}", name.paint(style), symbol.paint(style))
    }
}

/// Get the parent directory, name, and directory symbol for an entry.
/// They are used by [DisplayPath] and [DisplayFilename] implementations, which style them.
fn display_parts(entry: &Entry) -> (&str, &str, &str) {
    let full = entry.to_str();
    let (parent, name) = match entry.path.file_name().map(|s| s.to_str().unwrap()) {
        Some(name) => {
            // rfind: the file name is always the last occurrence in the full path.
            let pos = full.rfind(name).unwrap();
            (&full[..pos], name)
        }
        None => ("", full), // no file name component (e.g. "/" or ".."): show the whole path as the name.
    };
    // append "/" to directories unless the displayed name already ends with one.
    let dir_id = match entry.is_dir && !name.ends_with('/') {
        true => "/",
        false => "",
    };
    (parent, name, dir_id)
}

impl Display for Entry {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // an Entry displays as its full styled path by default.
        self.display_path().fmt(f)
    }
}

impl Deref for Entry {
    type Target = Path;

    fn deref(&self) -> &Self::Target {
        &self.path
    }
}

impl AsRef<Path> for Entry {
    fn as_ref(&self) -> &Path {
        &self.path
    }
}

// hashing/ordering/equality all delegate to the path only; is_dir is a derived cache.
impl Hash for Entry {
    fn hash<H: Hasher>(&self, state: &mut H) {
        self.path.hash(state)
    }
}

impl Ord for Entry {
    fn cmp(&self, other: &Self) -> Ordering {
        self.path.cmp(&other.path)
    }
}

impl PartialOrd for Entry {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl PartialEq for Entry {
    fn eq(&self, other: &Self) -> bool {
        self.path == other.path
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn filename_parts() {
        // helper: build an Entry directly (bypassing try_new checks) and assert its parts.
        #[track_caller]
        fn case(p: impl Into<PathBuf>, is_dir: bool, out: (&str, &str)) {
            let entry = Entry {
                path: p.into(),
                is_dir,
            };
            assert_eq!(out, entry.filename_parts())
        }

        case("foo", false, ("foo", ""));
        case("foo.bar", false, ("foo", "bar"));
        case("foo.bar.baz", false, ("foo.bar", "baz"));

        // hidden files: the leading dot belongs to the stem, not the extension.
        case(".foo", false, (".foo", ""));
        case(".foo.bar", false, (".foo", "bar"));
        case(".foo.bar.baz", false, (".foo.bar", "baz"));

        // directories: the whole name is the stem, extension is always empty.
        case("foo", true, ("foo", ""));
        case("foo.bar", true, ("foo.bar", ""));
        case("foo.bar.baz", true, ("foo.bar.baz", ""));

        case(".foo", true, (".foo", ""));
        case(".foo.bar", true, (".foo.bar", ""));
        case(".foo.bar.baz", true, (".foo.bar.baz", ""));
    }

    #[test]
    fn collection_parts() {
        // helper: appends ".ext" so every case exercises the extension plumbing too.
        #[track_caller]
        fn case(base: &str, out: (&str, Option<&str>, Option<usize>, &str)) {
            let (name, alias, seq, comment) = out;
            let entry = Entry::try_new(format!("{base}.ext"), false).unwrap();
            let out = (name, alias, seq, comment, "ext");
            assert_eq!(out, entry.collection_parts());
        }

        // stem only: anything not matching `name(+alias)?~seq` returns the stem verbatim.
        case("foo", ("foo", None, None, ""));
        case("foo bar", ("foo bar", None, None, ""));
        case("foo bar - baz", ("foo bar - baz", None, None, ""));
        case("foo - 2025 - 24", ("foo - 2025 - 24", None, None, ""));
        case("_foo_-24", ("_foo_-24", None, None, ""));
        case("foo ~ 24", ("foo ~ 24", None, None, ""));
        case("foo~ 24", ("foo~ 24", None, None, ""));
        case("foo+bar", ("foo+bar", None, None, ""));
        case("foo+bar,baz", ("foo+bar,baz", None, None, ""));
        case("foo+bar ~ 24", ("foo+bar ~ 24", None, None, ""));
        case("foo ~24", ("foo ~24", None, None, ""));
        case("foo bar~24", ("foo bar~24", None, None, ""));
        case("foo bar ~24", ("foo bar ~24", None, None, ""));
        case("_foo_ ~24", ("_foo_ ~24", None, None, ""));
        case("foo - 33~24", ("foo - 33~24", None, None, ""));
        case("foo+ ~24", ("foo+ ~24", None, None, ""));
        case("foo+ asd~24", ("foo+ asd~24", None, None, ""));
        case("foo+asd ~24", ("foo+asd ~24", None, None, ""));
        case("foo+~24", ("foo+~24", None, None, ""));
        case(",~24", (",~24", None, None, ""));
        case("foo+,~24", ("foo+,~24", None, None, ""));
        case("foo+bar,~24", ("foo+bar,~24", None, None, ""));
        case("foo+bar,~24 cool", ("foo+bar,~24 cool", None, None, ""));

        // name and seq.
        case("foo~24", ("foo", None, Some(24), ""));
        case("foo_~24", ("foo_", None, Some(24), ""));
        case("__foo~24", ("__foo", None, Some(24), ""));
        case("_foo__~24", ("_foo__", None, Some(24), ""));

        // name, aliases and seq.
        case("foo+bar~24", ("foo", Some("bar"), Some(24), ""));
        case(
            "foo_bar__+_baz__~24",
            ("foo_bar__", Some("_baz__"), Some(24), ""),
        );

        // name, seq, and comment (comment keeps its leading separators/spaces).
        case("foo~24cool", ("foo", None, Some(24), "cool"));
        case("foo~24 cool", ("foo", None, Some(24), " cool"));
        case("foo_~24-nice!", ("foo_", None, Some(24), "-nice!"));
        case("__foo~24 ?why?", ("__foo", None, Some(24), " ?why?"));
        case("_foo__~24 - cut", ("_foo__", None, Some(24), " - cut"));

        // name, aliases, seq, and comment.
        case(
            "foo+bar~24 seen 3 times",
            ("foo", Some("bar"), Some(24), " seen 3 times"),
        );
        case(
            "_foo+__bar_~24 with comment!",
            ("_foo", Some("__bar_"), Some(24), " with comment!"),
        );
    }

    #[test]
    fn fn_display_parts() {
        // helper: build an Entry directly and assert (parent, name, dir symbol).
        #[track_caller]
        fn case(p: impl Into<PathBuf>, is_dir: bool, out: (&str, &str, &str)) {
            let entry = Entry {
                path: p.into(),
                is_dir,
            };
            assert_eq!(out, display_parts(&entry));
        }

        // Directory cases (fixed)
        case(".", true, ("", ".", "/"));
        case("..", true, ("", "..", "/"));
        case("/", true, ("", "/", ""));
        case("./", true, ("", "./", ""));
        case("../", true, ("", "../", ""));
        case("dir", true, ("", "dir", "/"));
        case("dir/", true, ("", "dir", "/"));
        case("dir/.", true, ("", "dir", "/"));
        case("./dir", true, ("./", "dir", "/"));
        case("./dir/", true, ("./", "dir", "/"));
        case("./dir/.", true, ("./", "dir", "/"));

        // File cases
        case("file.txt", false, ("", "file.txt", ""));
        case("./file.txt", false, ("./", "file.txt", ""));
        case("dir/file.txt", false, ("dir/", "file.txt", ""));
        case("./dir/file.txt", false, ("./dir/", "file.txt", ""));
        case(".hidden", false, ("", ".hidden", ""));
        case("./dir/.hidden", false, ("./dir/", ".hidden", ""));
    }
}
--------------------------------------------------------------------------------
/src/commands/dupes.rs:
--------------------------------------------------------------------------------
use crate::commands::Refine;
use crate::entries::{Entry, InputInfo, TraversalMode};
use crate::utils::{self, display_abort};
use anyhow::Result;
use clap::{Args, ValueEnum};
use deunicode::deunicode;
use human_repr::HumanCount;
use mime_guess::MimeGuess;
use rayon::prelude::*;
use regex::Regex;
use std::boxed::Box; // NOTE(review): redundant, Box is in the prelude — harmless.
use std::cmp::{Ordering, Reverse};
use std::collections::{HashMap, HashSet};
use std::fs::File;
use std::io::{self, Read, Seek, SeekFrom};
use std::sync::atomic::{AtomicUsize, Ordering as AtomicOrdering};
use std::sync::{Arc, LazyLock, Mutex};
use std::time::{Duration, Instant};

// TODO find some way to mark files/groups as "not a dupe".
// TODO allow the user to specify custom stopwords, e.g., via a config file or command line argument.
// TODO allow media type to be used as a fetch option (include in Entry perhaps) for all commands.

/// CLI arguments for the `dupes` command: duplicate detection by content and/or name.
#[derive(Debug, Args)]
pub struct Dupes {
    /// Identical (size and sample), or similar (rare tokens and fuzzy matching).
    #[arg(short = 'm', long, default_value_t = SearchMode::All, value_name = "STR", value_enum)]
    mode: SearchMode,
    /// Sample size in kbytes (0 to disable).
    #[arg(short = 's', long, default_value_t = 4, value_name = "INT")]
    sample: usize,
    /// The threshold for similarity checks (0.0 to 1.0).
    #[arg(short = 't', long, default_value_t = 0.7, value_name = "FLOAT")]
    threshold: f64,
    /// Show the cleaned filenames for similarity checks.
    #[arg(short = 'v', long)]
    verbose: bool,
}

/// Which detection passes to run; single-letter aliases for the CLI.
#[derive(Debug, Copy, Clone, ValueEnum)]
enum SearchMode {
    #[value(alias = "i")]
    Identical,
    #[value(alias = "s")]
    Similar,
    #[value(alias = "a")]
    All,
}

#[derive(Debug)]
pub struct Media {
    entry: Entry,
    size: u64,
    cleaned_name: String, // cleaned name for similarity checks.
    kind: &'static str,   // guessed from both the MIME type and the file extension.
    sample: Option<Option<Box<[u8]>>>, // only populated if needed, and double to remember when already tried.
}

impl Refine for Dupes {
    type Media = Media;
    const OPENING_LINE: &'static str = "Detect duplicate files";
    const T_MODE: TraversalMode = TraversalMode::Files;

    fn tweak(&mut self, _: &InputInfo) {
        // clamp out-of-range thresholds instead of failing; warn the user.
        // NOTE(review): a NaN threshold fails both comparisons and slips through unclamped — confirm acceptable.
        if self.threshold < 0.0 || self.threshold > 1.0 {
            self.threshold = self.threshold.clamp(0.0, 1.0);
            eprintln!(
                "warning: invalid similarity threshold, using {:.1}",
                self.threshold
            );
        }
    }

    fn refine(&self, mut medias: Vec<Media>) -> Result<()> {
        let (mut by_size, mut by_name) = (0, 0);

        // step: detect duplicates by content.
        if let SearchMode::Identical | SearchMode::All = self.mode {
            println!("by identical size and {}KB sample:", self.sample);
            by_size = self.find_identical(&mut medias, |size, g| {
                println!("\n{} x{}", size.human_count_bytes(), g.len());
                g.iter().for_each(|&m| println!("{}", m.entry));
            });
            if by_size == 0 {
                println!("\nnone found!");
            }
            println!();
        }

        // step: detect duplicates by name.
        if let SearchMode::Similar | SearchMode::All = self.mode {
            println!("by name similarity:");
            by_name = self.find_similar(&medias, |sim, g| {
                println!("\n{sim:.1}% similar x{}", g.len());
                // pick the printer once per group, depending on verbosity.
                let show = if self.verbose {
                    |m: &Media, s| println!("{s:>7}: {} [{}]", m.entry, m.cleaned_name)
                } else {
                    |m: &Media, s| println!("{s:>7}: {}", m.entry)
                };
                for m in g {
                    let s = m.size.human_count_bytes().to_string(); // TODO: wait for human_repr to support size.
                    show(m, s);
                }
            });
            if by_name == 0 {
                println!("\nnone found!");
            }
            println!();
        }

        // step: display a summary receipt.
        let total = medias.len();
        println!("total files: {total}");
        if let SearchMode::Identical | SearchMode::All = self.mode {
            println!(" by size: {by_size} dupes{}", display_abort(by_name == 0));
        }
        if let SearchMode::Similar | SearchMode::All = self.mode {
            println!(" by name: {by_name} dupes{}", display_abort(true));
        }
        Ok(())
    }
}

impl Dupes {
    /// Find identical files based on size and sample checks.
    /// Returns the number of duplicate groups found; `show` is called once per group.
    fn find_identical<FS>(&self, medias: &mut [Media], show: FS) -> usize
    where
        FS: Fn(u64, Vec<&Media>),
    {
        // group key: same size (descending) and same media kind.
        let group = |m: &Media| (Reverse(m.size), m.kind);
        medias.sort_by_cached_key(group);
        medias
            .chunk_by_mut(|m, m2| group(m) == group(m2))
            .filter(|_| utils::is_running()) // stop early on Ctrl-C.
            .filter(|g| g.len() > 1)
            .flat_map(|g| {
                g.iter_mut().for_each(|m| {
                    m.cache_sample(self.sample * 1024); // warm up samples for groups with at least 2 files.
                });
                // subdivide each size group by the actual sampled bytes.
                let mut split = HashMap::with_capacity(g.len());
                g.iter()
                    .map(|m| (m, m.sample.as_ref().unwrap())) // sample is always populated by cache_sample.
                    .for_each(|(m, sample)| split.entry(sample).or_insert_with(Vec::new).push(m));
                split.into_values().filter(|v| v.len() > 1)
            })
            .map(|mut g| {
                g.sort_unstable_by(|m, n| m.entry.cmp(&n.entry));
                show(g[0].size, g); // side effect on purpose: count() below drives the iterator.
            })
            .count()
    }

    /// Find similar files based on name similarity.
    /// Returns the number of similar groups found; `show` is called once per group.
    fn find_similar<FS>(&self, medias: &[Media], show: FS) -> usize
    where
        FS: Fn(f64, Vec<&Media>),
    {
        // build token frequency map for rare token scoring.
        let token_freq = medias
            .iter()
            .flat_map(|m| m.cleaned_name.split_ascii_whitespace())
            .fold(HashMap::new(), |mut acc, token| {
                *acc.entry(token).or_insert(0) += 1;
                acc
            });

        // pre-calculate token sets for each media.
        let media_token_sets = medias
            .iter()
            .map(|m| {
                m.cleaned_name
                    .split_ascii_whitespace()
                    .collect::<HashSet<_>>()
            })
            .collect::<Vec<_>>();

        // build inverted index for tokens: token -> indexes of medias containing it.
        let mut token_blocks = HashMap::new();
        medias.iter().enumerate().for_each(|(i, media)| {
            media
                .cleaned_name
                .split_ascii_whitespace()
                .for_each(|token| token_blocks.entry(token).or_insert_with(Vec::new).push(i));
        });

        // setup union-find over media indexes, with per-group similarity accumulators.
        let mut parent = (0..medias.len()).collect::<Vec<_>>();
        let mut group_sim = HashMap::new(); // root -> (sum, count)
        // find with path compression.
        fn find(parent: &mut [usize], x: usize) -> usize {
            if parent[x] != x {
                parent[x] = find(parent, parent[x]);
            }
            parent[x]
        }
        // union two sets, folding their accumulated similarity sums together.
        fn union(
            parent: &mut [usize],
            group_sim: &mut HashMap<usize, (f64, usize)>,
            x: usize,
            y: usize,
            sim: f64,
        ) {
            let xr = find(parent, x);
            let yr = find(parent, y);
            if xr != yr {
                // merge groups and update sum/count.
                let (sum1, count1) = group_sim.remove(&xr).unwrap_or((0.0, 0));
                let (sum2, count2) = group_sim.remove(&yr).unwrap_or((0.0, 0));
                parent[yr] = xr; // attach yr's tree under xr.
                group_sim.insert(xr, (sum1 + sum2 + sim, count1 + count2 + 1));
            } else {
                // update sum/count for the group.
                let entry = group_sim.entry(xr).or_insert((0.0, 0));
                entry.0 += sim;
                entry.1 += 1;
            }
        }

        // prepare to compare pairs of media: count unique candidate pairs for the progress display.
        let total_pairs = {
            let mut seen_pairs = HashSet::new();
            token_blocks
                .values()
                .flat_map(|g| {
                    (0..g.len()).flat_map(move |i| (i + 1..g.len()).map(move |j| (g[i], g[j])))
                })
                .filter(|_| utils::is_running())
                .filter(|&(a, b)| seen_pairs.insert((a.min(b), a.max(b)))) // dedupe unordered pairs.
                .count()
        };

        // compare each unique pair only once and in parallel.
        const SPINNER: &str = "⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏";
        let spinner_len = SPINNER.chars().count();
        let counter = Arc::new(AtomicUsize::new(0));
        let spin_counter = Arc::new(AtomicUsize::new(0)); // a separate counter for the spinner animation.
        let seen_pairs = Arc::new(Mutex::new(HashSet::new()));
        let progress_state = Arc::new(Mutex::new((Instant::now(), -1))); // contains (last_update, last_percent).
        let similar = token_blocks
            .values()
            .par_bridge() // parallelize over token buckets; pairs are generated lazily per bucket.
            .flat_map_iter(|g| {
                (0..g.len()).flat_map(move |i| (i + 1..g.len()).map(move |j| (g[i], g[j])))
            })
            .filter(|_| utils::is_running())
            .filter(|&(a, b)| seen_pairs.lock().unwrap().insert((a.min(b), a.max(b)))) // the mutex is not expected to be poisoned.
            .inspect(|_| {
                // progress reporting only: throttled spinner + percentage on stderr.
                let count = counter.fetch_add(1, AtomicOrdering::Relaxed);
                let mut state = progress_state.lock().unwrap(); // the mutex is not expected to be poisoned.
                let (last_update, last_percent) = *state;
                let percent = (count as f64 / total_pairs as f64 * 100.0) as i32;

                // update if time has passed or a % threshold is crossed, and if progress has advanced.
                if (last_update.elapsed() > Duration::from_millis(100)
                    || percent / 5 > last_percent / 5)
                    && percent >= last_percent
                {
                    let spin_idx = spin_counter.fetch_add(1, AtomicOrdering::Relaxed);
                    let spin = SPINNER.chars().nth(spin_idx % spinner_len).unwrap(); // spinner_len is non-zero.
                    eprint!("\r{spin} {percent:.0}%");
                    *state = (Instant::now(), percent);
                }
            })
            .filter(|&(a, b)| medias[a].kind == medias[b].kind) // never compare across media kinds.
            .filter(|&(a, b)| {
                // ensure there's at least one shared non-numeric token.
                media_token_sets[a]
                    .intersection(&media_token_sets[b])
                    .any(|token| token.chars().any(|c| !c.is_ascii_digit()))
            })
            .filter_map(|(a, b)| {
                let clean1 = &medias[a].cleaned_name;
                let clean2 = &medias[b].cleaned_name;
                let sim = {
                    let lev = strsim::normalized_levenshtein(clean1, clean2);
                    let dice = strsim::sorensen_dice(clean1, clean2);
                    let rare_token_boost = rare_token_similarity(clean1, clean2, &token_freq);
                    // combine all three metrics: 40% string similarity, 60% rare token similarity.
                    (lev.max(dice) * 0.4) + (rare_token_boost * 0.6)
                };
                (sim >= self.threshold).then_some((a, b, sim))
            })
            .collect::<Vec<_>>();
        eprint!("\r \r"); // clear spinner/percent.

        // sequentially union similar pairs (union-find mutation is not thread-safe).
        similar.into_iter().for_each(|(a, b, sim)| {
            union(&mut parent, &mut group_sim, a, b, sim);
        });

        // collect groups by root.
        let mut groups = HashMap::new();
        (0..medias.len()).for_each(|i| {
            let root = find(&mut parent, i);
            groups.entry(root).or_insert(vec![]).push(i);
        });

        // collect groups with more than one member, and filter out sequential ones.
        let mut group_infos = groups
            .values()
            .filter(|g| g.len() > 1)
            .map(|g| {
                // collect group medias.
                let group_medias = g.iter().map(|&idx| &medias[idx]).collect::<Vec<_>>();
                let root = find(&mut parent, g[0]);
                // safe unwrap: group_sim always has an entry for each root.
                let (sum, count) = group_sim.get(&root).copied().unwrap_or((0.0, 1));
                let avg_sim = if count > 0 { sum / count as f64 } else { 1.0 };
                (avg_sim, group_medias)
            })
            .filter(|(_, g)| {
                // check for TV series, episode sequences, etc., and hide them.
                !is_likely_sequential(g)
            })
            .collect::<Vec<_>>();

        // sort groups by average similarity in descending order.
        group_infos.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(Ordering::Equal));

        // display each group.
        group_infos
            .into_iter()
            .map(|(avg_sim, mut g)| {
                g.sort_unstable_by(|m, n| m.entry.cmp(&n.entry));
                show(avg_sim * 100.0, g); // side effect on purpose: count() below drives the iterator.
            })
            .count()
    }
}

/// Check if a group of files looks like episodes from a TV series or a sequence.
/// If it is, it is not considered a group of duplicates.
fn is_likely_sequential(group: &[&Media]) -> bool {
    // simple pattern to extract all numbers from filenames.
    static NUMBERS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\d+").unwrap());

    if group.len() < 2 {
        return false; // not a series if less than 2 files.
    }

    // extract number sequences from each filename.
    let number_sequences = group
        .iter()
        .map(|m| {
            NUMBERS
                .find_iter(&m.cleaned_name)
                .map(|m| m.as_str().parse::<i64>().unwrap_or(-1)) // parse numbers, fallback to -1.
                .collect::<Vec<_>>()
        })
        .collect::<Vec<_>>();

    // filter out files that do not contain any numbers.
    let sequences_with_numbers = number_sequences
        .iter()
        .filter(|s| !s.is_empty())
        .collect::<Vec<_>>();

    // allow a small number of files without numbers (e.g., a base file and its numbered extras).
    let files_without_numbers = group.len() - sequences_with_numbers.len();
    if files_without_numbers > 1 && files_without_numbers as f64 / group.len() as f64 > 0.1 {
        return false;
    }

    // find the most common length of number sequences.
    let mut lengths = HashMap::new();
    for seq in &sequences_with_numbers {
        *lengths.entry(seq.len()).or_insert(0) += 1;
    }
    let common_len = lengths.into_iter().max_by_key(|&(_, count)| count);

    // if no common length, or common length is zero, it's not a clear sequence.
    let (len, _) = match common_len {
        // use the most common length, even if it appears only once.
        Some((len, count)) if len > 0 && count >= 1 => (len, count),
        _ => return false,
    };

    // filter for sequences with a length close to the common length.
    let sequences_with_common_len = sequences_with_numbers
        .iter()
        .filter(|s| s.len().abs_diff(len) <= 1)
        .collect::<Vec<_>>();

    // not a series if not enough files match the common length.
    if sequences_with_common_len.len() < 2 {
        return false;
    }

    // count how many number positions are constant vs. varying.
    let mut varying_indices = HashSet::new();
    for i in 0..len {
        let mut values = HashSet::new();
        for seq in &sequences_with_common_len {
            // check if the index is valid for the current sequence.
            if let Some(&val) = seq.get(i) {
                values.insert(val);
            }
        }
        if values.len() > 1 {
            varying_indices.insert(i);
        }
    }

    // it's a series if at least one number varies.
    !varying_indices.is_empty()
}

/// Calculates similarity between two strings based on rare tokens.
/// Rarer tokens (lower corpus frequency) weigh more; returns a value in [0, 1].
fn rare_token_similarity(a: &str, b: &str, token_freq: &HashMap<&str, usize>) -> f64 {
    let a_tokens = a.split_ascii_whitespace().collect::<HashSet<_>>();
    let b_tokens = b.split_ascii_whitespace().collect::<HashSet<_>>();

    // calculate the weighted score for a set of tokens.
    let score = |tokens: &HashSet<&str>| -> f64 {
        tokens
            .iter()
            .map(|token| {
                let freq = token_freq.get(token).copied().unwrap_or(1);
                1.0 / (freq as f64).ln_1p() // the score is the inverse of the log of frequency.
            })
            .sum()
    };

    let a_score = score(&a_tokens);
    let b_score = score(&b_tokens);

    // an empty token set means nothing to compare.
    if a_score == 0.0 || b_score == 0.0 {
        return 0.0;
    }

    let intersection = a_tokens.intersection(&b_tokens).copied().collect();
    let intersection_score = score(&intersection);

    // calculate base similarity.
    let base_sim = if a_tokens.is_subset(&b_tokens) || b_tokens.is_subset(&a_tokens) {
        // for subsets, similarity is the ratio of the intersection to the smaller set's score.
        intersection_score / a_score.min(b_score)
    } else {
        // for others, use a weighted jaccard index.
        let union_score = a_score + b_score - intersection_score;
        if union_score == 0.0 {
            return if intersection_score > 0.0 { 1.0 } else { 0.0 };
        }
        intersection_score / union_score
    };

    // penalize based on the difference in token count.
    let len_a = a_tokens.len() as f64;
    let len_b = b_tokens.len() as f64;
    let length_ratio = len_a.min(len_b) / len_a.max(len_b);

    // use a stricter penalty for few shared tokens, and a more lenient one for more shared tokens.
    let shared_tokens = a_tokens.intersection(&b_tokens).count();
    let exponent = if shared_tokens <= 1 { 0.6 } else { 1.0 / 3.0 };
    let penalty = length_ratio.powf(exponent);

    base_sim * penalty
}

impl Media {
    /// Read up to `size` bytes (split over start/middle/end chunks) and cache them.
    /// Idempotent: does nothing if a sample (or a failed attempt) is already recorded.
    fn cache_sample(&mut self, size: usize) {
        if self.sample.is_none() {
            let grab_sample = || {
                let mut file = File::open(&self.entry)?;
                let file_len = self.size;

                if file_len <= size as u64 {
                    // read the whole file if it's smaller than the sample size.
                    let mut buf = Vec::with_capacity(file_len as usize);
                    file.read_to_end(&mut buf)?;
                    return Ok::<_, io::Error>(buf);
                }

                // allocate buffer for all chunks.
                let mut buf = vec![0; size];
                let chunk_size = size / 3; // may not be divisible by 3, but that's okay.

                // read from the start.
                file.read_exact(&mut buf[..chunk_size])?;

                // read from the middle.
                let mid_pos = file_len / 2 - chunk_size as u64 / 2;
                file.seek(SeekFrom::Start(mid_pos))?;
                file.read_exact(&mut buf[chunk_size..chunk_size * 2])?;

                // read from the end; this last chunk must compensate for the remainder of division.
                let end_pos = file_len - (size - chunk_size * 2) as u64;
                file.seek(SeekFrom::Start(end_pos))?;
                file.read_exact(&mut buf[chunk_size * 2..])?;

                Ok(buf)
            };

            self.sample = match grab_sample() {
                Ok(buf) => Some(Some(buf.into_boxed_slice())),
                Err(err) => {
                    // best-effort: record the failed attempt so it won't be retried.
                    eprintln!("error: load sample: {err:?}.");
                    Some(None)
                }
            };
        }
    }
}

/// Cleans the filename by normalizing it, removing diacritics, and filtering out common words.
fn clean_words(name: &str) -> String {
    static WORDS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[\p{L}0-9]+").unwrap()); // accented letters, digits, no underscores.
    // multi-word release tags (e.g. "web-dl", "blu ray", "x264") joined by optional separators.
    static TAGS_MULTI: LazyLock<Regex> = LazyLock::new(|| {
        const SEP: &str = r"[ .-]?";
        const TAGS: &[&[&str]] = &[
            &["web", "dl"],
            &["blu", "ray"],
            &["(web|dvd|bd|br|hd)", "rip"],
            &["hd", "tv"],
            &["5\\.1"],
            &["6", "ch"],
            &["ac", "3"],
            &["[hx]", "26[45]"],
        ];
        Regex::new(
            &TAGS
                .iter()
                .map(|t| t.join(SEP))
                .collect::<Vec<_>>()
                .join("|"),
        )
        .unwrap()
    });
    static STOPWORDS: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
        #[rustfmt::skip]
        const SET: &[&str] = &[
            // non-content words, common release types, resolutions, codecs.
            "the", "a", "an", "of", "and", "in", "on", "at", "to", "by", "as",
            "e", "o", "os", "um", "uma", "uns", "umas", "ao", "aos", "à", "às", "da", "de", "do", "em", "das", "dos",
            "cam", "ts", "tc", "r5", "dvdscr", "dvdscreener",
            "repack", "limited", "internal", "remux", "fullhd", "hd", "1400mb",
            "ac", "dts", "aac", "ddp", "mp3", "1080p", "720p", "2160p", "4k", "mp4",
            "hevc", "psa", "xvid", "xvidhd", "10bit", "8bit",
        ];
        SET.iter().copied().collect()
    });

    // transliterate to ascii, removing accents and special characters.
    let base = deunicode(name).to_ascii_lowercase();

    // strip multi-word release tags first, then keep only non-stopword tokens.
    let cleaned = TAGS_MULTI.replace_all(&base, "");
    let cleaned = WORDS
        .find_iter(&cleaned)
        .map(|m| m.as_str())
        .filter(|word| !STOPWORDS.contains(word))
        .map(|word| word.to_owned())
        .collect::<Vec<_>>();

    // if everything was filtered out, fall back to the normalized base name.
    match cleaned.is_empty() {
        true => base,
        false => cleaned.join(" "),
    }
}

/// Classify a file into a coarse media kind from its extension,
/// correcting common MIME misclassifications under "application/*".
fn classify_media_kind(ext: &str) -> &'static str {
    let ext = ext.to_ascii_lowercase();
    let ext = ext.as_str();
    // guess the mime type from the extension.
    let mime = MimeGuess::from_ext(ext).first_raw().unwrap_or_default();
    let top = mime.split('/').next().unwrap_or_default();

    match top {
        "video" | "audio" | "image" | "text" => top,
        "application" => match ext {
            // video extensions that are misclassified as application.
            "mkv" | "webm" | "rmvb" | "m2ts" | "mts" | "f4v" | "vob" | "ogv" => "video",
            // document.
            "pdf" | "doc" | "docx" | "xls" | "xlsx" | "ppt" | "pptx" | "odt" | "ods" | "odp"
            | "rtf" => "document",
            // archive.
            "zip" | "rar" | "7z" | "tar" | "gz" | "bz2" | "xz" | "lz" | "lzma" | "iso" | "cab"
            | "arj" | "z" => "archive",
            // subtitle.
            "srt" | "ass" | "ssa" | "sub" | "vtt" | "idx" | "sup" => "subtitle",
            // text (some application/* are actually text).
            "csv" | "json" | "xml" | "yaml" | "yml" | "ini" | "conf" => "text",
            _ => "application",
        },
        _ => "unknown",
    }
}

impl TryFrom<Entry> for Media {
    type Error = (Entry, anyhow::Error);

    fn try_from(entry: Entry) -> Result<Self, Self::Error> {
        let (stem, ext) = entry.filename_parts();
        Ok(Media {
            size: entry.metadata().map_or(0, |m| m.len()), // unreadable metadata degrades to size 0.
            cleaned_name: clean_words(stem),
            kind: classify_media_kind(ext),
            entry,
            sample: None, // lazily populated by cache_sample when needed.
        })
    }
}
--------------------------------------------------------------------------------
/Cargo.lock:
--------------------------------------------------------------------------------
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 4

[[package]]
name = "adler2"
version = "2.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa"

[[package]]
name = "aho-corasick"
version = "1.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
dependencies = [
 "memchr",
]

[[package]]
name = "anstream"
version = "0.6.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3ae563653d1938f79b1ab1b5e668c87c76a9930414574a6583a7b7e11a8e6192"
dependencies = [
 "anstyle",
 "anstyle-parse",
 "anstyle-query",
 "anstyle-wincon",
 "colorchoice",
 "is_terminal_polyfill",
 "utf8parse",
]

[[package]]
name = "anstyle"
version = "1.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "862ed96ca487e809f1c8e5a8447f6ee2cf102f846893800b20cebdf541fc6bbd"

[[package]]
name = "anstyle-parse"
version =
"0.2.7" 44 | source = "registry+https://github.com/rust-lang/crates.io-index" 45 | checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" 46 | dependencies = [ 47 | "utf8parse", 48 | ] 49 | 50 | [[package]] 51 | name = "anstyle-query" 52 | version = "1.1.4" 53 | source = "registry+https://github.com/rust-lang/crates.io-index" 54 | checksum = "9e231f6134f61b71076a3eab506c379d4f36122f2af15a9ff04415ea4c3339e2" 55 | dependencies = [ 56 | "windows-sys 0.60.2", 57 | ] 58 | 59 | [[package]] 60 | name = "anstyle-wincon" 61 | version = "3.0.10" 62 | source = "registry+https://github.com/rust-lang/crates.io-index" 63 | checksum = "3e0633414522a32ffaac8ac6cc8f748e090c5717661fddeea04219e2344f5f2a" 64 | dependencies = [ 65 | "anstyle", 66 | "once_cell_polyfill", 67 | "windows-sys 0.60.2", 68 | ] 69 | 70 | [[package]] 71 | name = "anyhow" 72 | version = "1.0.99" 73 | source = "registry+https://github.com/rust-lang/crates.io-index" 74 | checksum = "b0674a1ddeecb70197781e945de4b3b8ffb61fa939a5597bcf48503737663100" 75 | 76 | [[package]] 77 | name = "base64" 78 | version = "0.22.1" 79 | source = "registry+https://github.com/rust-lang/crates.io-index" 80 | checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" 81 | 82 | [[package]] 83 | name = "bitflags" 84 | version = "2.9.4" 85 | source = "registry+https://github.com/rust-lang/crates.io-index" 86 | checksum = "2261d10cca569e4643e526d8dc2e62e433cc8aba21ab764233731f8d369bf394" 87 | 88 | [[package]] 89 | name = "bytes" 90 | version = "1.10.1" 91 | source = "registry+https://github.com/rust-lang/crates.io-index" 92 | checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" 93 | 94 | [[package]] 95 | name = "cc" 96 | version = "1.2.36" 97 | source = "registry+https://github.com/rust-lang/crates.io-index" 98 | checksum = "5252b3d2648e5eedbc1a6f501e3c795e07025c1e93bbf8bbdd6eef7f447a6d54" 99 | dependencies = [ 100 | "find-msvc-tools", 101 | "shlex", 102 | ] 103 | 104 | 
[[package]] 105 | name = "cfg-if" 106 | version = "1.0.3" 107 | source = "registry+https://github.com/rust-lang/crates.io-index" 108 | checksum = "2fd1289c04a9ea8cb22300a459a72a385d7c73d3259e2ed7dcb2af674838cfa9" 109 | 110 | [[package]] 111 | name = "cfg_aliases" 112 | version = "0.2.1" 113 | source = "registry+https://github.com/rust-lang/crates.io-index" 114 | checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" 115 | 116 | [[package]] 117 | name = "clap" 118 | version = "4.5.47" 119 | source = "registry+https://github.com/rust-lang/crates.io-index" 120 | checksum = "7eac00902d9d136acd712710d71823fb8ac8004ca445a89e73a41d45aa712931" 121 | dependencies = [ 122 | "clap_builder", 123 | "clap_derive", 124 | ] 125 | 126 | [[package]] 127 | name = "clap_builder" 128 | version = "4.5.47" 129 | source = "registry+https://github.com/rust-lang/crates.io-index" 130 | checksum = "2ad9bbf750e73b5884fb8a211a9424a1906c1e156724260fdae972f31d70e1d6" 131 | dependencies = [ 132 | "anstream", 133 | "anstyle", 134 | "clap_lex", 135 | "strsim", 136 | ] 137 | 138 | [[package]] 139 | name = "clap_derive" 140 | version = "4.5.47" 141 | source = "registry+https://github.com/rust-lang/crates.io-index" 142 | checksum = "bbfd7eae0b0f1a6e63d4b13c9c478de77c2eb546fba158ad50b4203dc24b9f9c" 143 | dependencies = [ 144 | "heck", 145 | "proc-macro2", 146 | "quote", 147 | "syn", 148 | ] 149 | 150 | [[package]] 151 | name = "clap_lex" 152 | version = "0.7.5" 153 | source = "registry+https://github.com/rust-lang/crates.io-index" 154 | checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675" 155 | 156 | [[package]] 157 | name = "colorchoice" 158 | version = "1.0.4" 159 | source = "registry+https://github.com/rust-lang/crates.io-index" 160 | checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" 161 | 162 | [[package]] 163 | name = "crc32fast" 164 | version = "1.5.0" 165 | source = "registry+https://github.com/rust-lang/crates.io-index" 
166 | checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" 167 | dependencies = [ 168 | "cfg-if", 169 | ] 170 | 171 | [[package]] 172 | name = "crossbeam-deque" 173 | version = "0.8.6" 174 | source = "registry+https://github.com/rust-lang/crates.io-index" 175 | checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" 176 | dependencies = [ 177 | "crossbeam-epoch", 178 | "crossbeam-utils", 179 | ] 180 | 181 | [[package]] 182 | name = "crossbeam-epoch" 183 | version = "0.9.18" 184 | source = "registry+https://github.com/rust-lang/crates.io-index" 185 | checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" 186 | dependencies = [ 187 | "crossbeam-utils", 188 | ] 189 | 190 | [[package]] 191 | name = "crossbeam-utils" 192 | version = "0.8.21" 193 | source = "registry+https://github.com/rust-lang/crates.io-index" 194 | checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" 195 | 196 | [[package]] 197 | name = "ctrlc" 198 | version = "3.4.7" 199 | source = "registry+https://github.com/rust-lang/crates.io-index" 200 | checksum = "46f93780a459b7d656ef7f071fe699c4d3d2cb201c4b24d085b6ddc505276e73" 201 | dependencies = [ 202 | "nix", 203 | "windows-sys 0.59.0", 204 | ] 205 | 206 | [[package]] 207 | name = "deunicode" 208 | version = "1.6.2" 209 | source = "registry+https://github.com/rust-lang/crates.io-index" 210 | checksum = "abd57806937c9cc163efc8ea3910e00a62e2aeb0b8119f1793a978088f8f6b04" 211 | 212 | [[package]] 213 | name = "dirs" 214 | version = "6.0.0" 215 | source = "registry+https://github.com/rust-lang/crates.io-index" 216 | checksum = "c3e8aa94d75141228480295a7d0e7feb620b1a5ad9f12bc40be62411e38cce4e" 217 | dependencies = [ 218 | "dirs-sys", 219 | ] 220 | 221 | [[package]] 222 | name = "dirs-sys" 223 | version = "0.5.0" 224 | source = "registry+https://github.com/rust-lang/crates.io-index" 225 | checksum = "e01a3366d27ee9890022452ee61b2b63a67e6f13f58900b651ff5665f0bb1fab" 226 
| dependencies = [ 227 | "libc", 228 | "option-ext", 229 | "redox_users", 230 | "windows-sys 0.61.0", 231 | ] 232 | 233 | [[package]] 234 | name = "either" 235 | version = "1.15.0" 236 | source = "registry+https://github.com/rust-lang/crates.io-index" 237 | checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" 238 | 239 | [[package]] 240 | name = "find-msvc-tools" 241 | version = "0.1.1" 242 | source = "registry+https://github.com/rust-lang/crates.io-index" 243 | checksum = "7fd99930f64d146689264c637b5af2f0233a933bef0d8570e2526bf9e083192d" 244 | 245 | [[package]] 246 | name = "flate2" 247 | version = "1.1.2" 248 | source = "registry+https://github.com/rust-lang/crates.io-index" 249 | checksum = "4a3d7db9596fecd151c5f638c0ee5d5bd487b6e0ea232e5dc96d5250f6f94b1d" 250 | dependencies = [ 251 | "crc32fast", 252 | "miniz_oxide", 253 | ] 254 | 255 | [[package]] 256 | name = "fnv" 257 | version = "1.0.7" 258 | source = "registry+https://github.com/rust-lang/crates.io-index" 259 | checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" 260 | 261 | [[package]] 262 | name = "getrandom" 263 | version = "0.2.16" 264 | source = "registry+https://github.com/rust-lang/crates.io-index" 265 | checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592" 266 | dependencies = [ 267 | "cfg-if", 268 | "libc", 269 | "wasi", 270 | ] 271 | 272 | [[package]] 273 | name = "heck" 274 | version = "0.5.0" 275 | source = "registry+https://github.com/rust-lang/crates.io-index" 276 | checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" 277 | 278 | [[package]] 279 | name = "http" 280 | version = "1.3.1" 281 | source = "registry+https://github.com/rust-lang/crates.io-index" 282 | checksum = "f4a85d31aea989eead29a3aaf9e1115a180df8282431156e533de47660892565" 283 | dependencies = [ 284 | "bytes", 285 | "fnv", 286 | "itoa", 287 | ] 288 | 289 | [[package]] 290 | name = "httparse" 291 | version = "1.10.1" 292 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 293 | checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" 294 | 295 | [[package]] 296 | name = "human-repr" 297 | version = "1.1.0" 298 | source = "registry+https://github.com/rust-lang/crates.io-index" 299 | checksum = "f58b778a5761513caf593693f8951c97a5b610841e754788400f32102eefdff1" 300 | 301 | [[package]] 302 | name = "is_terminal_polyfill" 303 | version = "1.70.1" 304 | source = "registry+https://github.com/rust-lang/crates.io-index" 305 | checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" 306 | 307 | [[package]] 308 | name = "itoa" 309 | version = "1.0.15" 310 | source = "registry+https://github.com/rust-lang/crates.io-index" 311 | checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" 312 | 313 | [[package]] 314 | name = "libc" 315 | version = "0.2.175" 316 | source = "registry+https://github.com/rust-lang/crates.io-index" 317 | checksum = "6a82ae493e598baaea5209805c49bbf2ea7de956d50d7da0da1164f9c6d28543" 318 | 319 | [[package]] 320 | name = "libredox" 321 | version = "0.1.9" 322 | source = "registry+https://github.com/rust-lang/crates.io-index" 323 | checksum = "391290121bad3d37fbddad76d8f5d1c1c314cfc646d143d7e07a3086ddff0ce3" 324 | dependencies = [ 325 | "bitflags", 326 | "libc", 327 | ] 328 | 329 | [[package]] 330 | name = "log" 331 | version = "0.4.28" 332 | source = "registry+https://github.com/rust-lang/crates.io-index" 333 | checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432" 334 | 335 | [[package]] 336 | name = "memchr" 337 | version = "2.7.5" 338 | source = "registry+https://github.com/rust-lang/crates.io-index" 339 | checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" 340 | 341 | [[package]] 342 | name = "mime" 343 | version = "0.3.17" 344 | source = "registry+https://github.com/rust-lang/crates.io-index" 345 | checksum = 
"6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" 346 | 347 | [[package]] 348 | name = "mime_guess" 349 | version = "2.0.5" 350 | source = "registry+https://github.com/rust-lang/crates.io-index" 351 | checksum = "f7c44f8e672c00fe5308fa235f821cb4198414e1c77935c1ab6948d3fd78550e" 352 | dependencies = [ 353 | "mime", 354 | "unicase", 355 | ] 356 | 357 | [[package]] 358 | name = "miniz_oxide" 359 | version = "0.8.9" 360 | source = "registry+https://github.com/rust-lang/crates.io-index" 361 | checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" 362 | dependencies = [ 363 | "adler2", 364 | ] 365 | 366 | [[package]] 367 | name = "nix" 368 | version = "0.30.1" 369 | source = "registry+https://github.com/rust-lang/crates.io-index" 370 | checksum = "74523f3a35e05aba87a1d978330aef40f67b0304ac79c1c00b294c9830543db6" 371 | dependencies = [ 372 | "bitflags", 373 | "cfg-if", 374 | "cfg_aliases", 375 | "libc", 376 | ] 377 | 378 | [[package]] 379 | name = "once_cell" 380 | version = "1.21.3" 381 | source = "registry+https://github.com/rust-lang/crates.io-index" 382 | checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" 383 | 384 | [[package]] 385 | name = "once_cell_polyfill" 386 | version = "1.70.1" 387 | source = "registry+https://github.com/rust-lang/crates.io-index" 388 | checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" 389 | 390 | [[package]] 391 | name = "option-ext" 392 | version = "0.2.0" 393 | source = "registry+https://github.com/rust-lang/crates.io-index" 394 | checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" 395 | 396 | [[package]] 397 | name = "percent-encoding" 398 | version = "2.3.2" 399 | source = "registry+https://github.com/rust-lang/crates.io-index" 400 | checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" 401 | 402 | [[package]] 403 | name = "proc-macro2" 404 | version = "1.0.101" 405 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 406 | checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de" 407 | dependencies = [ 408 | "unicode-ident", 409 | ] 410 | 411 | [[package]] 412 | name = "quote" 413 | version = "1.0.40" 414 | source = "registry+https://github.com/rust-lang/crates.io-index" 415 | checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" 416 | dependencies = [ 417 | "proc-macro2", 418 | ] 419 | 420 | [[package]] 421 | name = "rayon" 422 | version = "1.11.0" 423 | source = "registry+https://github.com/rust-lang/crates.io-index" 424 | checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f" 425 | dependencies = [ 426 | "either", 427 | "rayon-core", 428 | ] 429 | 430 | [[package]] 431 | name = "rayon-core" 432 | version = "1.13.0" 433 | source = "registry+https://github.com/rust-lang/crates.io-index" 434 | checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" 435 | dependencies = [ 436 | "crossbeam-deque", 437 | "crossbeam-utils", 438 | ] 439 | 440 | [[package]] 441 | name = "redox_users" 442 | version = "0.5.2" 443 | source = "registry+https://github.com/rust-lang/crates.io-index" 444 | checksum = "a4e608c6638b9c18977b00b475ac1f28d14e84b27d8d42f70e0bf1e3dec127ac" 445 | dependencies = [ 446 | "getrandom", 447 | "libredox", 448 | "thiserror", 449 | ] 450 | 451 | [[package]] 452 | name = "refine" 453 | version = "3.0.0" 454 | dependencies = [ 455 | "anyhow", 456 | "clap", 457 | "ctrlc", 458 | "deunicode", 459 | "dirs", 460 | "human-repr", 461 | "mime_guess", 462 | "rayon", 463 | "regex", 464 | "strsim", 465 | "ureq", 466 | "yansi", 467 | ] 468 | 469 | [[package]] 470 | name = "regex" 471 | version = "1.11.2" 472 | source = "registry+https://github.com/rust-lang/crates.io-index" 473 | checksum = "23d7fd106d8c02486a8d64e778353d1cffe08ce79ac2e82f540c86d0facf6912" 474 | dependencies = [ 475 | "aho-corasick", 476 | "memchr", 477 | "regex-automata", 478 
| "regex-syntax", 479 | ] 480 | 481 | [[package]] 482 | name = "regex-automata" 483 | version = "0.4.10" 484 | source = "registry+https://github.com/rust-lang/crates.io-index" 485 | checksum = "6b9458fa0bfeeac22b5ca447c63aaf45f28439a709ccd244698632f9aa6394d6" 486 | dependencies = [ 487 | "aho-corasick", 488 | "memchr", 489 | "regex-syntax", 490 | ] 491 | 492 | [[package]] 493 | name = "regex-syntax" 494 | version = "0.8.6" 495 | source = "registry+https://github.com/rust-lang/crates.io-index" 496 | checksum = "caf4aa5b0f434c91fe5c7f1ecb6a5ece2130b02ad2a590589dda5146df959001" 497 | 498 | [[package]] 499 | name = "ring" 500 | version = "0.17.14" 501 | source = "registry+https://github.com/rust-lang/crates.io-index" 502 | checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" 503 | dependencies = [ 504 | "cc", 505 | "cfg-if", 506 | "getrandom", 507 | "libc", 508 | "untrusted", 509 | "windows-sys 0.52.0", 510 | ] 511 | 512 | [[package]] 513 | name = "rustls" 514 | version = "0.23.31" 515 | source = "registry+https://github.com/rust-lang/crates.io-index" 516 | checksum = "c0ebcbd2f03de0fc1122ad9bb24b127a5a6cd51d72604a3f3c50ac459762b6cc" 517 | dependencies = [ 518 | "log", 519 | "once_cell", 520 | "ring", 521 | "rustls-pki-types", 522 | "rustls-webpki", 523 | "subtle", 524 | "zeroize", 525 | ] 526 | 527 | [[package]] 528 | name = "rustls-pemfile" 529 | version = "2.2.0" 530 | source = "registry+https://github.com/rust-lang/crates.io-index" 531 | checksum = "dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50" 532 | dependencies = [ 533 | "rustls-pki-types", 534 | ] 535 | 536 | [[package]] 537 | name = "rustls-pki-types" 538 | version = "1.12.0" 539 | source = "registry+https://github.com/rust-lang/crates.io-index" 540 | checksum = "229a4a4c221013e7e1f1a043678c5cc39fe5171437c88fb47151a21e6f5b5c79" 541 | dependencies = [ 542 | "zeroize", 543 | ] 544 | 545 | [[package]] 546 | name = "rustls-webpki" 547 | version = "0.103.4" 548 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 549 | checksum = "0a17884ae0c1b773f1ccd2bd4a8c72f16da897310a98b0e84bf349ad5ead92fc" 550 | dependencies = [ 551 | "ring", 552 | "rustls-pki-types", 553 | "untrusted", 554 | ] 555 | 556 | [[package]] 557 | name = "shlex" 558 | version = "1.3.0" 559 | source = "registry+https://github.com/rust-lang/crates.io-index" 560 | checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" 561 | 562 | [[package]] 563 | name = "strsim" 564 | version = "0.11.1" 565 | source = "registry+https://github.com/rust-lang/crates.io-index" 566 | checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" 567 | 568 | [[package]] 569 | name = "subtle" 570 | version = "2.6.1" 571 | source = "registry+https://github.com/rust-lang/crates.io-index" 572 | checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" 573 | 574 | [[package]] 575 | name = "syn" 576 | version = "2.0.106" 577 | source = "registry+https://github.com/rust-lang/crates.io-index" 578 | checksum = "ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6" 579 | dependencies = [ 580 | "proc-macro2", 581 | "quote", 582 | "unicode-ident", 583 | ] 584 | 585 | [[package]] 586 | name = "thiserror" 587 | version = "2.0.16" 588 | source = "registry+https://github.com/rust-lang/crates.io-index" 589 | checksum = "3467d614147380f2e4e374161426ff399c91084acd2363eaf549172b3d5e60c0" 590 | dependencies = [ 591 | "thiserror-impl", 592 | ] 593 | 594 | [[package]] 595 | name = "thiserror-impl" 596 | version = "2.0.16" 597 | source = "registry+https://github.com/rust-lang/crates.io-index" 598 | checksum = "6c5e1be1c48b9172ee610da68fd9cd2770e7a4056cb3fc98710ee6906f0c7960" 599 | dependencies = [ 600 | "proc-macro2", 601 | "quote", 602 | "syn", 603 | ] 604 | 605 | [[package]] 606 | name = "unicase" 607 | version = "2.8.1" 608 | source = "registry+https://github.com/rust-lang/crates.io-index" 609 | checksum = 
"75b844d17643ee918803943289730bec8aac480150456169e647ed0b576ba539" 610 | 611 | [[package]] 612 | name = "unicode-ident" 613 | version = "1.0.18" 614 | source = "registry+https://github.com/rust-lang/crates.io-index" 615 | checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" 616 | 617 | [[package]] 618 | name = "untrusted" 619 | version = "0.9.0" 620 | source = "registry+https://github.com/rust-lang/crates.io-index" 621 | checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" 622 | 623 | [[package]] 624 | name = "ureq" 625 | version = "3.1.0" 626 | source = "registry+https://github.com/rust-lang/crates.io-index" 627 | checksum = "00432f493971db5d8e47a65aeb3b02f8226b9b11f1450ff86bb772776ebadd70" 628 | dependencies = [ 629 | "base64", 630 | "flate2", 631 | "log", 632 | "percent-encoding", 633 | "rustls", 634 | "rustls-pemfile", 635 | "rustls-pki-types", 636 | "ureq-proto", 637 | "utf-8", 638 | "webpki-roots", 639 | ] 640 | 641 | [[package]] 642 | name = "ureq-proto" 643 | version = "0.5.1" 644 | source = "registry+https://github.com/rust-lang/crates.io-index" 645 | checksum = "bbe120bb823a0061680e66e9075942fcdba06d46551548c2c259766b9558bc9a" 646 | dependencies = [ 647 | "base64", 648 | "http", 649 | "httparse", 650 | "log", 651 | ] 652 | 653 | [[package]] 654 | name = "utf-8" 655 | version = "0.7.6" 656 | source = "registry+https://github.com/rust-lang/crates.io-index" 657 | checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" 658 | 659 | [[package]] 660 | name = "utf8parse" 661 | version = "0.2.2" 662 | source = "registry+https://github.com/rust-lang/crates.io-index" 663 | checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" 664 | 665 | [[package]] 666 | name = "wasi" 667 | version = "0.11.1+wasi-snapshot-preview1" 668 | source = "registry+https://github.com/rust-lang/crates.io-index" 669 | checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" 670 
| 671 | [[package]] 672 | name = "webpki-roots" 673 | version = "1.0.2" 674 | source = "registry+https://github.com/rust-lang/crates.io-index" 675 | checksum = "7e8983c3ab33d6fb807cfcdad2491c4ea8cbc8ed839181c7dfd9c67c83e261b2" 676 | dependencies = [ 677 | "rustls-pki-types", 678 | ] 679 | 680 | [[package]] 681 | name = "windows-link" 682 | version = "0.1.3" 683 | source = "registry+https://github.com/rust-lang/crates.io-index" 684 | checksum = "5e6ad25900d524eaabdbbb96d20b4311e1e7ae1699af4fb28c17ae66c80d798a" 685 | 686 | [[package]] 687 | name = "windows-link" 688 | version = "0.2.0" 689 | source = "registry+https://github.com/rust-lang/crates.io-index" 690 | checksum = "45e46c0661abb7180e7b9c281db115305d49ca1709ab8242adf09666d2173c65" 691 | 692 | [[package]] 693 | name = "windows-sys" 694 | version = "0.52.0" 695 | source = "registry+https://github.com/rust-lang/crates.io-index" 696 | checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" 697 | dependencies = [ 698 | "windows-targets 0.52.6", 699 | ] 700 | 701 | [[package]] 702 | name = "windows-sys" 703 | version = "0.59.0" 704 | source = "registry+https://github.com/rust-lang/crates.io-index" 705 | checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" 706 | dependencies = [ 707 | "windows-targets 0.52.6", 708 | ] 709 | 710 | [[package]] 711 | name = "windows-sys" 712 | version = "0.60.2" 713 | source = "registry+https://github.com/rust-lang/crates.io-index" 714 | checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" 715 | dependencies = [ 716 | "windows-targets 0.53.3", 717 | ] 718 | 719 | [[package]] 720 | name = "windows-sys" 721 | version = "0.61.0" 722 | source = "registry+https://github.com/rust-lang/crates.io-index" 723 | checksum = "e201184e40b2ede64bc2ea34968b28e33622acdbbf37104f0e4a33f7abe657aa" 724 | dependencies = [ 725 | "windows-link 0.2.0", 726 | ] 727 | 728 | [[package]] 729 | name = "windows-targets" 730 | version = "0.52.6" 
731 | source = "registry+https://github.com/rust-lang/crates.io-index" 732 | checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" 733 | dependencies = [ 734 | "windows_aarch64_gnullvm 0.52.6", 735 | "windows_aarch64_msvc 0.52.6", 736 | "windows_i686_gnu 0.52.6", 737 | "windows_i686_gnullvm 0.52.6", 738 | "windows_i686_msvc 0.52.6", 739 | "windows_x86_64_gnu 0.52.6", 740 | "windows_x86_64_gnullvm 0.52.6", 741 | "windows_x86_64_msvc 0.52.6", 742 | ] 743 | 744 | [[package]] 745 | name = "windows-targets" 746 | version = "0.53.3" 747 | source = "registry+https://github.com/rust-lang/crates.io-index" 748 | checksum = "d5fe6031c4041849d7c496a8ded650796e7b6ecc19df1a431c1a363342e5dc91" 749 | dependencies = [ 750 | "windows-link 0.1.3", 751 | "windows_aarch64_gnullvm 0.53.0", 752 | "windows_aarch64_msvc 0.53.0", 753 | "windows_i686_gnu 0.53.0", 754 | "windows_i686_gnullvm 0.53.0", 755 | "windows_i686_msvc 0.53.0", 756 | "windows_x86_64_gnu 0.53.0", 757 | "windows_x86_64_gnullvm 0.53.0", 758 | "windows_x86_64_msvc 0.53.0", 759 | ] 760 | 761 | [[package]] 762 | name = "windows_aarch64_gnullvm" 763 | version = "0.52.6" 764 | source = "registry+https://github.com/rust-lang/crates.io-index" 765 | checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" 766 | 767 | [[package]] 768 | name = "windows_aarch64_gnullvm" 769 | version = "0.53.0" 770 | source = "registry+https://github.com/rust-lang/crates.io-index" 771 | checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764" 772 | 773 | [[package]] 774 | name = "windows_aarch64_msvc" 775 | version = "0.52.6" 776 | source = "registry+https://github.com/rust-lang/crates.io-index" 777 | checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" 778 | 779 | [[package]] 780 | name = "windows_aarch64_msvc" 781 | version = "0.53.0" 782 | source = "registry+https://github.com/rust-lang/crates.io-index" 783 | checksum = 
"c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c" 784 | 785 | [[package]] 786 | name = "windows_i686_gnu" 787 | version = "0.52.6" 788 | source = "registry+https://github.com/rust-lang/crates.io-index" 789 | checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" 790 | 791 | [[package]] 792 | name = "windows_i686_gnu" 793 | version = "0.53.0" 794 | source = "registry+https://github.com/rust-lang/crates.io-index" 795 | checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3" 796 | 797 | [[package]] 798 | name = "windows_i686_gnullvm" 799 | version = "0.52.6" 800 | source = "registry+https://github.com/rust-lang/crates.io-index" 801 | checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" 802 | 803 | [[package]] 804 | name = "windows_i686_gnullvm" 805 | version = "0.53.0" 806 | source = "registry+https://github.com/rust-lang/crates.io-index" 807 | checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11" 808 | 809 | [[package]] 810 | name = "windows_i686_msvc" 811 | version = "0.52.6" 812 | source = "registry+https://github.com/rust-lang/crates.io-index" 813 | checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" 814 | 815 | [[package]] 816 | name = "windows_i686_msvc" 817 | version = "0.53.0" 818 | source = "registry+https://github.com/rust-lang/crates.io-index" 819 | checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d" 820 | 821 | [[package]] 822 | name = "windows_x86_64_gnu" 823 | version = "0.52.6" 824 | source = "registry+https://github.com/rust-lang/crates.io-index" 825 | checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" 826 | 827 | [[package]] 828 | name = "windows_x86_64_gnu" 829 | version = "0.53.0" 830 | source = "registry+https://github.com/rust-lang/crates.io-index" 831 | checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba" 832 | 833 | [[package]] 834 | 
name = "windows_x86_64_gnullvm" 835 | version = "0.52.6" 836 | source = "registry+https://github.com/rust-lang/crates.io-index" 837 | checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" 838 | 839 | [[package]] 840 | name = "windows_x86_64_gnullvm" 841 | version = "0.53.0" 842 | source = "registry+https://github.com/rust-lang/crates.io-index" 843 | checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57" 844 | 845 | [[package]] 846 | name = "windows_x86_64_msvc" 847 | version = "0.52.6" 848 | source = "registry+https://github.com/rust-lang/crates.io-index" 849 | checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" 850 | 851 | [[package]] 852 | name = "windows_x86_64_msvc" 853 | version = "0.53.0" 854 | source = "registry+https://github.com/rust-lang/crates.io-index" 855 | checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" 856 | 857 | [[package]] 858 | name = "yansi" 859 | version = "1.0.1" 860 | source = "registry+https://github.com/rust-lang/crates.io-index" 861 | checksum = "cfe53a6657fd280eaa890a3bc59152892ffa3e30101319d168b781ed6529b049" 862 | 863 | [[package]] 864 | name = "zeroize" 865 | version = "1.8.1" 866 | source = "registry+https://github.com/rust-lang/crates.io-index" 867 | checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" 868 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # refine 2 | 3 | [![Crates.io](https://img.shields.io/crates/v/refine.svg)](https://crates.io/crates/refine) 4 | [![dependency status](https://deps.rs/repo/github/rsalmei/refine/status.svg)](https://deps.rs/repo/github/rsalmei/refine) 5 | ![Crates.io](https://img.shields.io/crates/d/refine) 6 | ![GitHub Sponsors](https://img.shields.io/github/sponsors/rsalmei) 7 | 8 | ### Refine your file collection using Rust! 
9 | 10 | ## What it does 11 | 12 | This tool will revolutionize the way you manage your media collections! It can simultaneously scan multiple root directories and analyze all the files and directories found as a whole, performing some advanced operations on them. And it is very easy to use, with a simple and intuitive command line interface that will let you quickly get the results you want. 13 | 14 | It will help you reasonably find duplicated files both identically and by fuzzy filename similarity, seamlessly join them into a single directory with advanced name conflict resolution, quickly list files from multiple directories sorted together by various criteria, effortlessly rename files and directories using advanced regular expression rules, intelligently rebuild entire media collection names by identifying groups and sequences, and even reliably probe collection filenames against a remote server! It is a one-stop solution for all your media needs, allowing you to organize them in a way that makes sense to you. 15 | 16 | > Use it to _refine_ your photos, music, movies, porn, docs, or any other collections, cleaning up and organizing them in a way that makes sense to you. 17 | 18 | Note: every command is dry-run by default, so don't worry about experimenting! This is a key feature of this tool: the commands are interactive unless you say otherwise, allowing you to preview what would be done and then confirm or abort the changes. You can thus try out different commands and options without the risk of changing or losing anything! 19 | 20 | > I've made this tool to be the fastest and easiest way to organize media collections. It helps me a lot, and I hope it can help you too. 21 | 22 | And yes, it is blazingly fast, like all Rust 🦀 software should be! 23 | 24 | Enjoy!
25 | 26 | ![refine 2.0 list](https://raw.githubusercontent.com/rsalmei/refine/main/img/list-2.0.png) 27 | 28 | ## How to use it 29 | 30 | Install `refine` with: 31 | 32 | ``` 33 | cargo install refine 34 | ``` 35 | 36 | And that's it, you're ready to go! You can now call it anywhere. 37 | 38 | ## What's new in 3.0 39 | 40 | I'm thrilled to announce a release packed with deep, carefully engineered improvements! This update is the culmination of a significant technical effort, bringing both new capabilities and cool new refinements. The straight jump from 2.0 to 3.0 reflects the magnitude of these changes, about 100 commits worth of work over several months, with a strong focus on enhancing the core algorithms and bringing new features to life. 41 | 42 | The flagship feature is the completely revamped `dupes` command, now equipped with a sophisticated detection algorithm that combines fuzzy string matching and a novel rare-token scoring system! This innovative approach vastly improves duplicate detection accuracy, even in tricky and non-exact cases. This means you’ll find more duplicates, not just exact matches, making cleanup much more effective! 
43 | 44 | This new algorithm employs a multi-faceted approach: 45 | 46 | - Sophisticated Name Similarity Detection: 47 | - Fuzzy String Matching: Combines normalized Levenshtein distance with Sørensen-Dice coefficient for detecting both minor typos and structural similarities 48 | - Rare Token Scoring: Builds corpus-wide token frequency maps and weights matches by token rarity using inverse logarithmic frequency 49 | - Union-Find Clustering: Employs union-find data structures for efficient grouping with average similarity tracking across group merges 50 | - Intelligent Filtering: Includes semantic filters to exclude TV series and sequential files using number pattern analysis 51 | 52 | - Advanced Text Processing: 53 | - Unicode Normalization: Filenames are preprocessed with transliteration and accent removal to ensure consistent matching 54 | - Multi-language Stopword Filtering: Comprehensive stopword lists for English and Portuguese built-in 55 | - Media-specific Tag Recognition: Removal of common media tags (`web-dl`, `blu-ray`, codecs, resolutions, etc.) 
56 | - MIME-aware Classification: Features media type detection combining MIME guessing with extension overrides, so movies are not tagged as similar to their own subtitles 57 | 58 | - Other Optimizations: 59 | - Parallel Processing: Parallel similarity computation with progress tracking, leveraging multicore CPUs 60 | - Inverted Token Indexing: Efficient candidate pair generation using token-based blocking 61 | - Advanced Content Sampling: Implements a new three-point sampling strategy (beginning, middle, end) still with configurable sample size, achieving high accuracy for large media files while avoiding full file reads 62 | 63 | Also in this release there are several other improvements, such as natural sorting for displaying entries in `list` and the global `--show`, better clash resolution reporting in `join` and `rename`, support for comments in collections in `rebuild`, new fetch options `path_in` and `path_ex` for including and excluding paths, new "recipe type" options in naming rules for advanced transformations, support for on-demand separators in naming rules regexes, etc. There's a lot to explore! 64 | 65 | --- 66 | 67 |
(previous versions) 68 | 69 | ### New in 2.0 70 | 71 | Yay! This is a major release, with a lot of new features! 72 |
The most exciting one is global support for COLORS, making files and directories much easier to read and distinguish! 73 | 74 | Also, the `list` command is greatly improved, with support for listing directory entries, complete with their number of files and full sizes! This was only possible with the new precise recursion feature, allowing you to choose how deep you want to go within directories. 75 |
You can also now sort the output by number of files, in addition to size (full recursive size), name, or path. 76 | 77 | Another great new feature is the global `--view` option, which allows you to bypass any command and quickly view the filtered files and directories that will be processed by it! Countless times I wanted to preview my filter results, forcing me to replace the command with `list`, remove all other arguments, execute it, study the output, and painstakingly reconstruct the original command—a hugely frustrating process. And now we can do it in any command without changing anything, just by adding `--view`! 78 | 79 | Everything is again more polished and optimized. Even the usage and help are much more user-friendly! 80 | 81 | And last but not least, the input paths can now be relative, which will make all output also be relative and thus easier to read. 82 | 83 | ### New in 1.4 84 | 85 | This version introduces the `probe` command, which allows you to probe filenames against a remote server! This can be used to validate the filenames of your media collections by checking whether a URL points to a valid file or page on a remote server. 86 | 87 | Also, the `rebuild` command has a new `--case` option, which allows you to keep the original case of the filenames, and the `rename` command has improved support for handling clashes, allowing you to insert sequence numbers in the filenames when you really want to let them be the same. 88 | 89 | ### New in 1.3 90 | 91 | This version is mostly about polishing, with some improvements and bug fixes. 92 | 93 | We have a smarter list command, which hides full paths by default and uses descending order for size and ascending for name and path; join: change no_remove flag to parents (n -> p) and some clash options; rebuild: change simple_match flag to simple and fix full mode, which was not resetting sequences; general polishing. 
94 | 95 | ### New in 1.2 96 | 97 | Here is a much improved partial mode in Rebuild command, which can alter groups of filenames while preserving sequences, and even detect and fix gaps in sequences caused by deleted files. 98 | 99 | ### New in 1.1 100 | 101 | Revamped join command! 102 | It now supports non-empty target folders, and will resolve clashes accordingly. 103 | 104 | Also, several enum CLI arguments now support aliases, and I've fixed join command still moving files even when copy was requested. 105 | 106 | ### New in 1.0 107 | 108 | Yes, it is time. After a complete overhaul of the code, it's time to release 1.0! 109 |
It's an accomplishment I'm proud of, which took over 70 commits and a month's work, resulting in most of the code being rewritten. 110 | It is more mature, stable, and well-structured now. 111 | 112 | The major motivation for this version is the rebuild Partial mode! We can now rebuild collections even when some directories are not available! This means that files not affected by the specified naming rules will stay the same, keeping their sequence numbers, while new files are appended after the highest sequence found. It is handy for collections on external drives or cloud storage which are not always connected, allowing you to, even on the go, rebuild new files without messing up previous ones. 113 | 114 | And this also includes: 115 | 116 | - rebuild: new `--replace` option to replace all occurrences of some string or regex in the filenames with another one. 117 | - new internal CLI options handling, which enables commands to modify them prior to their execution. 118 | - the new rebuild partial mode is auto-enabled in case not all directories are currently available. 
119 | 120 | ### New in 0.18 121 | 122 | - rebuild: new force implementation that is easier to use 123 | - it conflicts with any other options so must be used alone 124 | - now it just overwrites filenames without exceptions → best used with `-i` or on already organized collections 125 | - improved memory usage 126 | 127 | ### New in 0.17 128 | 129 | - join: new clash resolve option 130 | - by default, no changes are allowed in directories where clashes are detected 131 | - all directories with clashes are listed, showing exactly which files are in them 132 | 133 | ### New in 0.16 134 | 135 | - complete overhaul of the scan system, allowing directories to be extracted alongside files 136 | - new `join` command, already with directory support 137 | - new magic `-i` and `-x` options that filter both files and directories 138 | - new filter options for files, directories, and extensions 139 | - rename: include full directory support 140 | 141 | ### New in 0.15 142 | 143 | - nicer rename command output by parent directory 144 | - new threaded yes/no prompt that can be aborted with CTRL-C 145 | 146 | ### New in 0.14 147 | 148 | - rename: disallow by default changes in directories where clashes are detected 149 | - new `--clashes` option to allow them 150 | 151 | ### New in 0.13 152 | 153 | - rename: new replace feature, finally! 154 | - global: make strip rules also remove `.` and `_`, in addition to `-` and spaces 155 | - global: include and exclude options do not check extensions 156 | - dupes: remove case option, so everything is case-insensitive now 157 | 158 | ### New in 0.12 159 | 160 | - global: new `--dir-in` and `--dir-out` options. 
161 | 162 | ### New in 0.11 163 | 164 | - new `rename` command 165 | - rebuild, rename: improve strip exact, not removing more spaces than needed 166 | 167 | ### New in 0.10 168 | 169 | - global: new `--exclude` option to exclude files 170 | 171 | ### New in 0.9 172 | 173 | - new support for Ctrl-C, to abort all operations and gracefully exit the program at any time. 174 | - all commands will stop collecting files when Ctrl-C is pressed 175 | - both `dupes` and `list` command will show partial results 176 | - the `rebuild` command will just exit, as it needs all the files to run 177 | 178 | ### New in 0.8 179 | 180 | - new "list" command 181 | 182 | ### New in 0.7 183 | 184 | - global: new `--include` option to filter input files 185 | - rebuild: new `--force` option to easily rename new files 186 | - rebuild: new interactive mode by default, making `--dry_run` obsolete (removed), with new `--yes` option to bypass it (good for automation) 187 | - rebuild: auto fix renaming errors 188 | - dupes: faster performance by ignoring groups with one file (thus avoiding loading samples) 189 | - rebuild: smaller memory consumption by caching file extensions 190 | 191 |
192 | 193 | ## Commands 194 | 195 | All commands will: 196 | 197 | 1. scan all the given directories recursively (excluding hidden `.folders`) 198 | - can optionally filter files and directories based on several options 199 | 2. load the metadata for each file like size and creation date, as required by some commands 200 | 3. execute the command and show the results 201 | 4. ask the user to perform the changes, if applicable 202 | 203 | ### The global refine command (help) 204 | 205 | > `$ refine --help` 206 | > 207 | > ``` 208 | > Refine your file collections using Rust! 209 | > 210 | > Usage: refine [DIRS]... [FETCH] [OPTIONS] 211 | > 212 | > Commands: 213 | > dupes Find possibly duplicated files by both size and filename 214 | > join Join files into a single directory with advanced conflict resolution 215 | > list List files from multiple disjoint directories sorted together 216 | > rebuild Rebuild entire media collections' filenames intelligently 217 | > rename Rename files and directories in batch using advanced regex rules 218 | > probe Probe collections' filenames against a remote server 219 | > help Print this message or the help of the given subcommand(s) 220 | > 221 | > Arguments: 222 | > [DIRS]... 
Directories to scan 223 | > 224 | > Options: 225 | > -h, --help Print help 226 | > -V, --version Print version 227 | > 228 | > Fetch: 229 | > -R, --recurse The maximum recursion depth; use 0 for unlimited [default: 0] 230 | > -F, --only-files Include only files 231 | > -D, --only-dirs Include only directories 232 | > -i, --include Include only these files and directories 233 | > -x, --exclude Exclude these files and directories 234 | > -I, --dir-in Include only these directories 235 | > -X, --dir-ex Exclude these directories 236 | > --file-in Include only these files 237 | > --file-ex Exclude these files 238 | > --ext-in Include only these extensions 239 | > --ext-ex Exclude these extensions 240 | > --view Bypass the command execution and preview the filter results to be processed 241 | > 242 | > For more information, see https://github.com/rsalmei/refine 243 | > ``` 244 | 245 | ### The `dupes` command 246 | 247 | The `dupes` command will analyze and report the possibly duplicated files, either by size or name. It will even load a sample from each file, to guarantee they are indeed duplicated. It is a small sample by default but can help reduce false positives a lot, and you can increase it if you want. 248 | 249 | 1. group all the files by size 250 | 2. for each group with the exact same value, load a sample of its files 251 | 3. compare the samples with each other and find possible duplicates 252 | 4. group all the files by words in their names 253 | - the word extractor ignores sequence numbers like file-1, file copy, file-3 copy 2, etc. 254 | 5. run 2. and 3. again, and print the results 255 | 256 | > `$ refine dupes --help` 257 | > 258 | > ``` 259 | > Find reasonably duplicated files by both size and filename 260 | > 261 | > Usage: refine dupes [DIRS]... [FETCH] [OPTIONS] 262 | > 263 | > Arguments: 264 | > [DIRS]... 
Directories to scan 265 | > 266 | > Options: 267 | > -s, --sample Sample size in bytes (0 to disable) [default: 2048] 268 | > -h, --help Print help 269 | > ``` 270 | 271 | > There's also the "Fetch" options, which are the same as for the global refine command. 272 | 273 | Example: 274 | 275 | ``` 276 | $ refine dupes ~/Downloads /Volumes/External --sample 20480 277 | ``` 278 | 279 | ### The `join` command 280 | 281 | The `join` command will let you grab all files and directories in the given directories and join them into a single one. You can filter files however you like, and choose whether they will be joined by moving or copying. It will even remove the empty parent directories after a move joining! 282 | 283 | > Note: any deletions are only performed after files and directories have been successfully moved/copied. So, in case any errors occur, the files and directories partially moved/copied will be found in the target directory, so you should manually delete them before trying again. 284 | 285 | 1. detect clashes, i.e. files with the same name in different directories, and apply the given clash strategy 286 | 2. detect already in-place files 287 | 3. print the resulting changes to the filenames and directories, and ask for confirmation 288 | 4. if the user confirms, apply the changes 289 | 5. remove any empty parent directories when moving files 290 | 291 | > `$ refine join --help` 292 | > 293 | > ``` 294 | > Join files into a single directory with advanced conflict resolution 295 | > 296 | > Usage: refine join [OPTIONS] [DIRS]... 297 | > 298 | > Options: 299 | > -t, --target The target directory; will be created if it doesn't exist [default: .] 300 | > -b, --by The type of join to perform [default: move] [possible values: move, copy] 301 | > -c, --clashes How to resolve clashes [default: name-sequence] [possible values: name-sequence, parent-name, name-parent, ignore] 302 | > -f, --force Force joining already in place files and directories, i.e. 
in subdirectories of the target 303 | > -p, --parents Do not remove empty parent directories after joining files 304 | > -y, --yes Skip the confirmation prompt, useful for automation 305 | > -h, --help Print help 306 | > ``` 307 | 308 | > There's also the "Fetch" options, which are the same as for the global refine command. 309 | 310 | Example: 311 | 312 | ``` 313 | $ refine join ~/media/ /Volumes/External/ -i 'proj-01' -X 'ongoing' -t /Volumes/External/proj-01 314 | ``` 315 | 316 | ### The `list` command 317 | 318 | The `list` command will gather all the files in the given directories, sort them by name, size, count, or path, and display them in a friendly format. 319 | 320 | 1. sort all files by either name, size, count, or path 321 | - ascending by default for name and path, descending for size, or optionally reverse 322 | 2. print the results 323 | 324 | > `$ refine list --help` 325 | > 326 | > ``` 327 | > List files from multiple directories sorted together 328 | > 329 | > Usage: refine list [DIRS]... [FETCH] [OPTIONS] 330 | > 331 | > Arguments: 332 | > [DIRS]... Directories to scan 333 | > 334 | > Options: 335 | > -b, --by Sort by [default: size] [possible values: size, count, name, path] 336 | > -r, --rev Reverse the default order (size/count:desc, name/path:asc) 337 | > -p, --paths Show full file paths 338 | > -c, --no-calc-dirs Do not calculate directory sizes 339 | > -h, --help Print help 340 | > ``` 341 | 342 | > There's also the "Fetch" options, which are the same as for the global refine command. 343 | 344 | Example: 345 | 346 | ``` 347 | $ refine list ~/Downloads /Volumes/External --by size --rev 348 | ``` 349 | 350 | ### The `rebuild` command 351 | 352 | I’m really proud of the `rebuild` command. It smartly rebuilds all the filenames of entire media collections, e.g., music by album/singer and videos by streamers and even photos from your camera.
Sequence numbers are removed, filenames are stripped according to your needs, similar names are intelligently matched, groups are sorted deterministically by creation date, sequence numbers are regenerated, and files are finally renamed! 353 | 354 | It's awesome to quickly find your collections neatly sorted automatically; it's like magic, getting all files cleaned up, sorted, and sequenced with a single command. And upon running it again, the tool will seem to recognize the new files that have been added, as it will regenerate everything but only display entries that need to be changed, as the rest are already correct! And in case you delete files, all the subsequent ones will be renamed accordingly! Quite impressive, don't you think? 355 | 356 | And don't worry as this tool is interactive, so you can review all changes before applying them. 357 | 358 | 1. apply naming rules to strip or replace parts of the filenames 359 | 2. extract and strip sequence numbers from names 360 | 3. if force mode is enabled, set all names to the forced value 361 | 4. if smart match is enabled, remove spaces and underscores from names 362 | 5. group the files by their resulting names 363 | 6. sort the groups according to the files' created dates 364 | 7. regenerate sequence numbers for each group; if partial mode is enabled, continue from the highest sequence found in the group 365 | > Note that these groups can contain files from different directories, and it will just work 366 | 8. print the resulting changes to the filenames, and ask for confirmation 367 | 9. if the user confirms, apply the changes 368 | 369 | > `$ refine rebuild --help` 370 | > 371 | > ``` 372 | > Rebuild entire media collections intelligently 373 | > 374 | > Usage: refine rebuild [DIRS]... [FETCH] [OPTIONS] 375 | > 376 | > Arguments: 377 | > [DIRS]... 
Directories to scan 378 | > 379 | > Options: 380 | > -b, --strip-before Strip from the start of the filename; separators nearby are automatically removed 381 | > -a, --strip-after Strip to the end of the filename; separators nearby are automatically removed 382 | > -e, --strip-exact Strip all occurrences in the filename; separators nearby are automatically removed 383 | > -r, --replace Replace all occurrences in the filename with another; separators are not touched 384 | > -s, --simple Disable smart matching, so "foo bar.mp4", "FooBar.mp4" and "foo__bar.mp4" are different 385 | > -f, --force Force to overwrite filenames (use the Global options to filter files) 386 | > -p, --partial Assume not all directories are available, which retains current sequences (but fixes gaps) 387 | > -c, --case Keep the original case of filenames, otherwise they are lowercased 388 | > -y, --yes Skip the confirmation prompt, useful for automation 389 | > -h, --help Print help 390 | > ``` 391 | 392 | > There's also the "Fetch" options, which are the same as for the global refine command. 393 | 394 | Example: 395 | 396 | ``` 397 | $ refine rebuild ~/media /Volumes/External -a 720p -a Bluray -b xpto -e old 398 | ``` 399 | 400 | ### The `rename` command 401 | 402 | The `rename` command will let you batch rename files like no other tool, seriously! You can quickly strip common prefixes, suffixes, and exact parts of the filenames, as well as apply any regex replacements you want. By default, in case a filename ends up clashing with other files in the same directory, that whole directory will be disallowed to make any changes. The list of clashes will be nicely formatted and printed, so you can manually check them. And you can optionally allow changes to other files in the same directory, removing only the clashes if you find it safe. 403 | 404 | 1. apply naming rules to strip or replace parts of the filenames 405 | 2. 
handle the clashes according to the given strategy, which can: 406 | - forbid any changes in the directory where clashes are detected 407 | - ignore the clashes, allowing other changes in the same directory 408 | - apply sequence numbers to the clashes, allowing all changes 409 | 3. print the resulting changes to the filenames and directories, and ask for confirmation 410 | 4. if the user confirms, apply the changes 411 | 412 | > `$ refine rename --help` 413 | > 414 | > ``` 415 | > Rename files and directories using advanced regular expression rules 416 | > 417 | > Usage: refine rename [DIRS]... [FETCH] [OPTIONS] 418 | > 419 | > Arguments: 420 | > [DIRS]... Directories to scan 421 | > 422 | > Options: 423 | > -b, --strip-before Strip from the start of the filename; separators nearby are automatically removed 424 | > -a, --strip-after Strip to the end of the filename; separators nearby are automatically removed 425 | > -e, --strip-exact Strip all occurrences in the filename; separators nearby are automatically removed 426 | > -r, --replace Replace all occurrences in the filename with another; separators are not touched 427 | > -c, --clashes How to resolve clashes [default: forbid] [possible values: forbid, ignore, name-sequence] 428 | > -y, --yes Skip the confirmation prompt, useful for automation 429 | > -h, --help Print help 430 | > ``` 431 | 432 | > There's also the "Fetch" options, which are the same as for the global refine command. 433 | 434 | Example: 435 | 436 | ``` 437 | $ refine rename ~/media /Volumes/External -b "^\d+_" -r '([^\.]*?)\.=$1 ' 438 | ``` 439 | 440 | ### The `probe` command 441 | 442 | The `probe` command allows you to probe filenames against a remote server, which can be very useful to validate the filenames of your media collections. It works by checking whether a URL points to a valid file or page on a remote server. 443 | 444 | The URL can be any valid HTTP(S) URL, and must have a placeholder for the filename. 
The command generates URLs by replacing the placeholder with the names of the files, and sends a HEAD request to each one, allowing you to use some advanced options to control the behavior, such as the timeout, number of retries, wait times, exponential backoff, and when to display errors. The request is expected to return: 445 | - a 200 OK or 403 Forbidden response to be considered valid; 446 | - a 404 Not Found to be considered invalid; 447 | - any other response is retried, with exponential backoff, until the maximum number of retries is reached, then it is considered failed. 448 | 449 | It does not support any kind of parallel connections or API rate limiting by design in order to not disturb the server too much. It thus only works in a sequential manner and may take a while to complete. It also does not support any kind of authentication, redirects, or custom headers, so it may not work for some servers. 450 | 451 | 1. extract the names from files (without sequence numbers and extension), and deduplicate them 452 | 2. pick the desired subset of them (by a regex) 453 | 3. prepare the URL for each name and probe it with a HEAD request 454 | 4. split the results into Valid, Invalid, Failed, and Pending (in case you press Ctrl+C) 455 | 5. print the invalid ones, along with a summary of the results 456 | 457 | > `$ refine probe --help` 458 | > 459 | > ``` 460 | > Probe filenames against a remote server 461 | > 462 | > Usage: refine probe [DIRS]... [FETCH] [OPTIONS] 463 | > 464 | > Arguments: 465 | > [DIRS]... Directories to scan 466 | > 467 | > Options: 468 | > -p, --pick Pick a subset of the files to probe 469 | > -u, --url The URL to probe filenames against (use `$` as placeholder, e.g. 
https://example.com/$/) 470 | > -t, --timeout The HTTP connection and read timeouts in milliseconds [default: 2000] 471 | > -n, --min-wait The initial time to wait between retries in milliseconds [default: 1000] 472 | > -b, --backoff The factor by which to increase the time to wait between retries [default: 1.5] 473 | > -a, --max-wait The maximum time to wait between retries in milliseconds [default: 5000] 474 | > -r, --retries The maximum number of retries; use 0 to disable and -1 to retry indefinitely [default: -1] 475 | > -e, --errors Specify when to display errors [default: each10] [possible values: never, last, always, each10] 476 | > -h, --help Print help 477 | > ``` 478 | 479 | > There's also the "Fetch" options, which are the same as for the global refine command. 480 | 481 | Example: 482 | 483 | ``` 484 | $ refine probe ~/media /Volumes/External --url 'https://example.com/$/' -r3 -el 485 | ``` 486 | 487 | ## Changelog 488 | 489 |
Complete [here](https://github.com/rsalmei/refine/blob/main/CHANGELOG.md). 490 | 491 | ## License 492 | 493 | This software is licensed under the MIT License. See the LICENSE file in the top distribution 494 | directory for the full license text. 495 | 496 | 497 | --- 498 | Maintaining an open source project is hard and time-consuming, and I've put much ❤️ and effort into this. 499 | 500 | If you've appreciated my work, you can back me up with a donation! Thank you. 😊 501 | 502 | [](https://www.buymeacoffee.com/rsalmei) 503 | [Donate with PayPal button](https://www.paypal.com/donate?business=6SWSHEB5ZNS5N&no_recurring=0&item_name=I%27m+the+author+of+alive-progress%2C+clearly+and+about-time.+Thank+you+for+appreciating+my+work%21&currency_code=USD) 504 | 505 | --- 506 | --------------------------------------------------------------------------------