├── .gitignore
├── LICENSE
├── README.md
└── duplicatefilefinder.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.py[co]

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox

# Translations
*.mo

# Mr Developer
.mr.developer.cfg

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 Michael Krisper

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# duplicate-file-finder
This is a Python script that finds duplicate files in a directory structure. For good performance, files are compared in three passes, each pass narrowing down the candidates for the next, more expensive one:

1. Group by file size
2. Group by SHA-256 hash of the first 1024 bytes
3. Group by SHA-256 hash of the whole file

After the whole directory structure has been searched, the duplicate files are displayed.
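
Each pass is a plain bucketing step: two files can only be duplicates if they map to the same key, so every pass keeps only the groups with more than one member and hands them to the next check. A minimal sketch of one such pass (the standalone group_by helper below is illustrative and not part of the script):

    import os
    from collections import defaultdict

    def group_by(paths, key):
        """Bucket file paths by key(path); keep only buckets with 2+ members."""
        groups = defaultdict(list)
        for path in paths:
            groups[key(path)].append(path)
        return [group for group in groups.values() if len(group) > 1]

    # First pass: only files of equal size can possibly be identical.
    # candidates = [p for group in group_by(all_files, os.path.getsize) for p in group]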

## Usage:
    usage: duplicatefilefinder.py [-h] [-f] [-d] [-s] [-a | -t X] [-H] [-e]
                                  [-m MIN_FILE_SIZE]
                                  directories [directories ...]

    positional arguments:
      directories           one or more directories which should be checked for
                            duplicate files

    options:
      -h, --help            show this help message and exit
      -f, --fast            enable a faster but less thorough search by pruning
                            files between comparison stages; may find fewer
                            duplicates than actually exist. Ignored when -a is
                            given. By default, the whole file is hashed.
      -d, --delete          delete older duplicate files, keeping the newest copy
      -s, --script-friendly
                            use machine-readable output
      -a                    display all duplicate files. equal to -top 0
      -t X, -top X          set the number of displayed duplicate groups. If 0 is
                            given, all results will be displayed. default=3
      -H, --hidden          check hidden files and hidden directories too
      -e, --empty           check empty files too
      -m MIN_FILE_SIZE, --min-file-size MIN_FILE_SIZE
                            set the file filter so that a file must be at least
                            min-file-size bytes to be examined, defaults to 1

## EXAMPLES:
    (1) duplicatefilefinder.py ~/Downloads
    Description: Searches the Downloads directory for duplicate files and displays the top 3 duplicate groups (those with the most files).

    (2) duplicatefilefinder.py ~/Downloads -top 3
    Description: Searches for duplicates, but only displays the 3 groups with the most duplicates.

    (3) duplicatefilefinder.py ~/Downloads -top 3 --fast
    Description: Searches for the top 3 duplicate groups. May return fewer than 3 results, even if more exist.

    (4) duplicatefilefinder.py ~/Downloads -a
    Description: Searches for duplicates and displays ALL results.

    (5) duplicatefilefinder.py ~/Downloads --hidden --empty
    Description: Searches for duplicates and also includes hidden and empty files.

    (6) duplicatefilefinder.py ~/Downloads --delete
    Description: Searches for duplicates and deletes the older files, keeping the newest one of each group.

    (7) duplicatefilefinder.py ~/Downloads --script-friendly
    Description: Searches for duplicates and prints them in a machine-readable format.

    (8) duplicatefilefinder.py ~/Downloads --min-file-size 1024
    Description: Searches for duplicates but only considers files that are at least 1024 bytes in size.

## Sample Output:
    duplicatefilefinder.py . --empty
    (By Size) 24 Files checked, 1 duplicates found (2 files)
    (By Partial Hash) 2 Files checked, 1 duplicates found (2 files)
    (By Full Hash) 2 Files checked, 1 duplicates found (2 files)

    (1) Found 2 duplicate files (size: 16 Bytes, sha256 'a1b2c3d4...') in ./test/:
     1: copy of testfile
     2: testfile

    Found 1 duplicates (2 duplicate files total)
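
## Script-Friendly Output:
With --script-friendly, the output is meant for piping into other tools: one line per file, consisting of a numeric group index, a tab, and the file path. Lines sharing the same index are duplicates of each other. For the sample above, the output would look like this:

    0	./test/copy of testfile
    0	./test/testfile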

--------------------------------------------------------------------------------
/duplicatefilefinder.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""Module for traversing directory structures and finding duplicate files. Duplicates are displayed and can optionally be deleted, keeping the newest copy."""

import os
import argparse
import hashlib
import sys
import time
from functools import partial, reduce

def parse_arguments():
    """Parses the command line arguments."""

    epilog = """EXAMPLES:
    (1) %(prog)s ~/Downloads
    Description: Searches the Downloads directory for duplicate files and displays the top 3 duplicate groups (those with the most files).

    (2) %(prog)s ~/Downloads ~/Documents -top 3
    Description: Searches for duplicates in the Downloads and Documents directories, but only displays the 3 groups with the most duplicates.

    (3) %(prog)s ~/Downloads ~/Documents -top 3 --fast
    Description: Searches for the top 3 duplicate groups across multiple directories. May return fewer than 3 results, even if more exist.

    (4) %(prog)s ~/Downloads ~/Documents ~/Pictures -a
    Description: Searches for duplicates across multiple directories and displays ALL results.

    (5) %(prog)s ~/Downloads --hidden --empty
    Description: Searches for duplicates and also includes hidden and empty files.
    """

    parser = argparse.ArgumentParser(description=__doc__, epilog=epilog, formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(dest="directories", nargs='+', help="one or more directories which should be checked for duplicate files")
    parser.add_argument("-f", "--fast", dest="fast", action="store_true",
                        help="enable a faster but less thorough search by pruning files between comparison stages; \
may find fewer duplicates than actually exist. Remark: the fast option is ignored when -a is given.")
    parser.add_argument("-d", "--delete", dest="delete_old", action="store_true", help="delete older duplicate files, keeping the newest copy")
    parser.add_argument("-s", "--script-friendly", dest="script_friendly", action="store_true", help="use machine-readable output")
    group = parser.add_mutually_exclusive_group()
    group.add_argument("-a", dest="show_all", action="store_true", help="display all duplicate files. equal to -top 0")
    group.add_argument("-t", "-top", dest="top", action="store", metavar="X", default=3, type=int,
                       help="set the number of displayed duplicate groups. If 0 is given, all results will be displayed. default=3")
    parser.add_argument("-H", "--hidden", dest="include_hidden", action="store_true", help="check hidden files and hidden directories too")
    parser.add_argument("-e", "--empty", dest="include_empty", action="store_true", help="check empty files too")
    parser.add_argument("-m", "--min-file-size", dest="min_file_size", action="store", default=1, type=int,
                        help="set the file filter so that a file must be at least min-file-size bytes to be examined, defaults to 1")
    args = parser.parse_args()

    if args.show_all or args.top == 0:
        args.top = None

    return args

class UpdatePrinter(object):
    """Class for printing a nice status line on the console."""
    def __init__(self, refreshrate=0.05, stream=sys.stdout):
        self.__last = 0
        self.__last_text_length = 0
        self.refreshrate = refreshrate
        self.stream = stream

    def update(self, value, force=False, flush=True):
        """Updates the last line on the console, overwriting previous output made with
        this method. Rate-limits the output to prevent flickering.
        Use the force parameter to enforce output."""
        if ((time.time() - self.__last) >= self.refreshrate) or force:
            print("\r%s%s" % (value, " " * (self.__last_text_length - len(value))), end=' ', file=self.stream)
            self.__last_text_length = len(value)
            if flush:
                self.stream.flush()
            self.__last = time.time()

def print_duplicates_human_readable(files, displaycount=None):
    """Prints a list of duplicate groups in a human-readable format."""
    try:
        # Sort by number of files per group first, then by file size, largest first.
        sortedfiles = sorted(files, key=lambda x: (len(x[1]), os.path.getsize(x[1][0])), reverse=True)
    except OSError:
        sortedfiles = sorted(files, key=lambda x: len(x[1]), reverse=True)

    for pos, entry in enumerate(sortedfiles[:displaycount], start=1):
        try:
            checksum, paths = entry
            checksum = checksum.hex()
            prefix = os.path.dirname(os.path.commonprefix(paths))
            print("\n(%d) Found %d duplicate files (size: %d Bytes, sha256 %r) in %s/:" % \
                  (pos, len(paths), os.path.getsize(paths[0]), checksum, prefix))
            for i, path in enumerate(sorted(paths), start=1):
                print("%2d: %s" % (i, path))
        except OSError as e:
            print("\nCould not display duplicate entry, file might have been deleted: %s" % e, file=sys.stderr)

def print_duplicates_script_friendly(files, displaycount=None):
    """Prints a list of duplicate groups in a machine-readable format: one file per line, as '<group index><TAB><path>'."""
    try:
        sortedfiles = sorted(files, key=lambda x: (len(x[1]), os.path.getsize(x[1][0])), reverse=True)
    except OSError:
        sortedfiles = sorted(files, key=lambda x: len(x[1]), reverse=True)
    for i, entry in enumerate(sortedfiles[:displaycount]):
        _, paths = entry
        for path in sorted(paths):
            print("%d\t%s" % (i, path))

def delete_duplicates(files):
    """Deletes all but the newest file of every duplicate group."""
    for checksum, paths in files:
        try:
            sortedpaths = sorted(paths, key=lambda x: os.path.getmtime(x), reverse=True)
            for path in sortedpaths[1:]:
                try:
                    print("deleting: %s" % path, file=sys.stderr)
                    os.remove(path)
                except OSError as e:
                    print("could not delete file: %s" % e, file=sys.stderr)
        except OSError as e:
            print("could not access file for deletion: %s" % e, file=sys.stderr)

def get_hash_key(filename, partial=False):
    """Calculates the SHA-256 hash of a file. If partial is True, only the first 1024 bytes are hashed."""
    try:
        hash_object = hashlib.sha256()
        with open(filename, 'rb') as inputfile:
            if partial:
                hash_object.update(inputfile.read(1024))
            else:
                # Read in 8 KiB chunks so that large files need not fit into memory.
                for chunk in iter(lambda: inputfile.read(1024 * 8), b""):
                    hash_object.update(chunk)
            return hash_object.digest()
    except OSError as e:
        print("permission denied for file: %s" % e.filename, file=sys.stderr)
        return None
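
# The staged filtering below regroups the surviving files by progressively more
# expensive keys: file size first, then a hash of the first 1024 bytes, then a
# hash of the whole file. Only groups with more than one member survive a stage.
# The top**2 and top*2 limits prune the candidate list between stages in --fast
# mode; without --fast, top is passed in as None and nothing is pruned.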
def filter_duplicate_files(files, fast, top=None):
    """Finds all duplicate files among the given file paths."""
    duplicates = {}
    update = UpdatePrinter(stream=sys.stderr).update

    iterations = [(os.path.getsize, "By Size", top**2 if top else None),
                  (partial(get_hash_key, partial=True), "By Partial Hash", top*2 if top else None),
                  (get_hash_key, "By Full Hash", None)]

    for keyfunction, name, topcount in iterations:
        duplicates.clear()
        count = 0
        duplicate_count = 0
        i = 0
        for i, filepath in enumerate(files, start=1):
            key = keyfunction(filepath)
            if key is None:
                continue

            duplicates.setdefault(key, []).append(filepath)
            if len(duplicates[key]) > 1:
                count += 1
                if len(duplicates[key]) == 2:
                    count += 1  # the group's first file counts as a duplicate too
                    duplicate_count += 1

            update("(%s) %d Files checked, %d duplicates found (%d files)" % (name, i, duplicate_count, count))
        else:
            # for-else: runs once after the loop to force a final status line
            update("(%s) %d Files checked, %d duplicates found (%d files)" % (name, i, duplicate_count, count), force=True)
            print("", file=sys.stderr)
        # keep only groups with more than one member for the next stage,
        # optionally pruned to the topcount largest groups
        sortedfiles = sorted(duplicates.values(), key=len, reverse=True)
        files = [filepath for filepaths in sortedfiles[:topcount] if len(filepaths) > 1 for filepath in filepaths]

    return [(checksum, paths) for checksum, paths in duplicates.items() if len(paths) > 1]

def on_walk_error(err):
    """Error handler for os.walk."""
    print("Cannot access directory '%s'. Permission denied." % err.filename, file=sys.stderr)
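
# A file counts as hidden if any component of its absolute path starts with a
# dot, so files inside hidden directories are excluded as well. Symbolic links
# are always skipped, since a link and its target would otherwise be reported
# as duplicates of each other.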
def get_files(directories, include_hidden, min_file_size=1):
    """Yields all files in the given directories which pass the filter rules."""
    for directory in directories:
        for dirpath, _, filenames in os.walk(directory, onerror=on_walk_error):
            for filename in filenames:
                filepath = os.path.join(dirpath, filename)
                try:
                    if (not os.path.islink(filepath)
                            and (include_hidden or
                                 all(not part.startswith(".") for part in os.path.abspath(filepath).split(os.sep)))
                            and (os.path.getsize(filepath) >= min_file_size)):
                        yield filepath
                except OSError:
                    # e.g. permission denied
                    print("Cannot access file '%s'. Permission denied." % filepath, file=sys.stderr)
                    continue

if __name__ == "__main__":
    ARGS = parse_arguments()
    if ARGS.include_empty:
        ARGS.min_file_size = 0
    FILES = get_files(ARGS.directories, ARGS.include_hidden, ARGS.min_file_size)
    DUPLICATES = filter_duplicate_files(FILES, ARGS.fast, ARGS.top if ARGS.fast else None)

    if ARGS.script_friendly:
        print_duplicates_script_friendly(DUPLICATES, ARGS.top)
    else:
        print_duplicates_human_readable(DUPLICATES, ARGS.top)

    if ARGS.delete_old:
        delete_duplicates(DUPLICATES)

    if ARGS.fast:
        print("\nFound %d duplicates at least (%d duplicate files total) -- More duplicates may exist." % \
              (len(DUPLICATES), reduce(lambda sum_value, files: sum_value + len(files[1]), DUPLICATES, 0)), file=sys.stderr)
    else:
        print("\nFound %d duplicates (%d duplicate files total)" % \
              (len(DUPLICATES), reduce(lambda sum_value, files: sum_value + len(files[1]), DUPLICATES, 0)), file=sys.stderr)

--------------------------------------------------------------------------------