├── .gitignore
├── LICENSE
├── README.md
└── duplicatefilefinder.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.py[co]

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox

# Translations
*.mo

# Mr Developer
.mr.developer.cfg

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 Michael Krisper

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# duplicate-file-finder
This is a Python script that finds duplicate files in a directory structure. For good performance, files are compared in three passes, each pass narrowing down the candidates for the next, more expensive one:

1. Group by file size
2. Group by SHA-256 hash of the first 1024 bytes
3. Group by SHA-256 hash of the whole file

After the whole directory structure has been searched, the duplicate files are displayed.
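
Each pass is a plain bucketing step: two files can only be duplicates if they map to the same key, so every pass keeps only the groups with more than one member and hands them to the next check. A minimal sketch of one such pass (the standalone group_by helper below is illustrative and not part of the script):

    import os
    from collections import defaultdict

    def group_by(paths, key):
        """Bucket file paths by key(path); keep only buckets with 2+ members."""
        groups = defaultdict(list)
        for path in paths:
            groups[key(path)].append(path)
        return [group for group in groups.values() if len(group) > 1]

    # First pass: only files of equal size can possibly be identical.
    # candidates = [p for group in group_by(all_files, os.path.getsize) for p in group]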

## Usage:
    usage: duplicatefilefinder.py [-h] [-f] [-d] [-s] [-a | -t X] [-H] [-e]
                                  [-m MIN_FILE_SIZE]
                                  directories [directories ...]

    positional arguments:
      directories           one or more directories which should be checked for
                            duplicate files

    options:
      -h, --help            show this help message and exit
      -f, --fast            enable a faster but less thorough search by pruning
                            files between comparison stages; may find fewer
                            duplicates than actually exist. Ignored when -a is
                            given. By default, the whole file is hashed.
      -d, --delete          delete older duplicate files, keeping the newest copy
      -s, --script-friendly
                            use machine-readable output
      -a                    display all duplicate files. equal to -top 0
      -t X, -top X          set the number of displayed duplicate groups. If 0 is
                            given, all results will be displayed. default=3
      -H, --hidden          check hidden files and hidden directories too
      -e, --empty           check empty files too
      -m MIN_FILE_SIZE, --min-file-size MIN_FILE_SIZE
                            set the file filter so that a file must be at least
                            min-file-size bytes to be examined, defaults to 1

## EXAMPLES:
    (1) duplicatefilefinder.py ~/Downloads
    Description: Searches the Downloads directory for duplicate files and displays the top 3 duplicate groups (those with the most files).

    (2) duplicatefilefinder.py ~/Downloads -top 3
    Description: Searches for duplicates, but only displays the 3 groups with the most duplicates.

    (3) duplicatefilefinder.py ~/Downloads -top 3 --fast
    Description: Searches for the top 3 duplicate groups. May return fewer than 3 results, even if more exist.

    (4) duplicatefilefinder.py ~/Downloads -a
    Description: Searches for duplicates and displays ALL results.

    (5) duplicatefilefinder.py ~/Downloads --hidden --empty
    Description: Searches for duplicates and also includes hidden and empty files.

    (6) duplicatefilefinder.py ~/Downloads --delete
    Description: Searches for duplicates and deletes the older files, keeping the newest one of each group.

    (7) duplicatefilefinder.py ~/Downloads --script-friendly
    Description: Searches for duplicates and prints them in a machine-readable format.

    (8) duplicatefilefinder.py ~/Downloads --min-file-size 1024
    Description: Searches for duplicates but only considers files that are at least 1024 bytes in size.

## Sample Output:
    duplicatefilefinder.py . --empty
    (By Size) 24 Files checked, 1 duplicates found (2 files)
    (By Partial Hash) 2 Files checked, 1 duplicates found (2 files)
    (By Full Hash) 2 Files checked, 1 duplicates found (2 files)

    (1) Found 2 duplicate files (size: 16 Bytes, sha256 'a1b2c3d4...') in ./test/:
     1: copy of testfile
     2: testfile

    Found 1 duplicates (2 duplicate files total)
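
## Script-Friendly Output:
With --script-friendly, the output is meant for piping into other tools: one line per file, consisting of a numeric group index, a tab, and the file path. Lines sharing the same index are duplicates of each other. For the sample above, the output would look like this:

    0	./test/copy of testfile
    0	./test/testfile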

--------------------------------------------------------------------------------
/duplicatefilefinder.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""Module for traversing directory structures and finding duplicate files. Duplicates are displayed and can optionally be deleted, keeping the newest copy."""

import os
import argparse
import hashlib
import sys
import time
from functools import partial, reduce

def parse_arguments():
    """Parses the command line arguments."""

    epilog = """EXAMPLES:
    (1) %(prog)s ~/Downloads
    Description: Searches the Downloads directory for duplicate files and displays the top 3 duplicate groups (those with the most files).

    (2) %(prog)s ~/Downloads ~/Documents -top 3
    Description: Searches for duplicates in the Downloads and Documents directories, but only displays the 3 groups with the most duplicates.

    (3) %(prog)s ~/Downloads ~/Documents -top 3 --fast
    Description: Searches for the top 3 duplicate groups across multiple directories. May return fewer than 3 results, even if more exist.

    (4) %(prog)s ~/Downloads ~/Documents ~/Pictures -a
    Description: Searches for duplicates across multiple directories and displays ALL results.

    (5) %(prog)s ~/Downloads --hidden --empty
    Description: Searches for duplicates and also includes hidden and empty files.
    """

    parser = argparse.ArgumentParser(description=__doc__, epilog=epilog, formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(dest="directories", nargs='+', help="one or more directories which should be checked for duplicate files")
    parser.add_argument("-f", "--fast", dest="fast", action="store_true",
                        help="enable a faster but less thorough search by pruning files between comparison stages; \
may find fewer duplicates than actually exist. Remark: the fast option is ignored when -a is given.")
    parser.add_argument("-d", "--delete", dest="delete_old", action="store_true", help="delete older duplicate files, keeping the newest copy")
    parser.add_argument("-s", "--script-friendly", dest="script_friendly", action="store_true", help="use machine-readable output")
    group = parser.add_mutually_exclusive_group()
    group.add_argument("-a", dest="show_all", action="store_true", help="display all duplicate files. equal to -top 0")
    group.add_argument("-t", "-top", dest="top", action="store", metavar="X", default=3, type=int,
                       help="set the number of displayed duplicate groups. If 0 is given, all results will be displayed. default=3")
    parser.add_argument("-H", "--hidden", dest="include_hidden", action="store_true", help="check hidden files and hidden directories too")
    parser.add_argument("-e", "--empty", dest="include_empty", action="store_true", help="check empty files too")
    parser.add_argument("-m", "--min-file-size", dest="min_file_size", action="store", default=1, type=int,
                        help="set the file filter so that a file must be at least min-file-size bytes to be examined, defaults to 1")
    args = parser.parse_args()

    if args.show_all or args.top == 0:
        args.top = None

    return args

class UpdatePrinter(object):
    """Class for printing a nice status line on the console."""
    def __init__(self, refreshrate=0.05, stream=sys.stdout):
        self.__last = 0
        self.__last_text_length = 0
        self.refreshrate = refreshrate
        self.stream = stream

    def update(self, value, force=False, flush=True):
        """Updates the last line on the console, overwriting previous output made with
        this method. Rate-limits the output to prevent flickering.
        Use the force parameter to enforce output."""
        if ((time.time() - self.__last) >= self.refreshrate) or force:
            print("\r%s%s" % (value, " " * (self.__last_text_length - len(value))), end=' ', file=self.stream)
            self.__last_text_length = len(value)
            if flush:
                self.stream.flush()
            self.__last = time.time()

def print_duplicates_human_readable(files, displaycount=None):
    """Prints a list of duplicate groups in a human-readable format."""
    try:
        # Sort by number of files per group first, then by file size, largest first.
        sortedfiles = sorted(files, key=lambda x: (len(x[1]), os.path.getsize(x[1][0])), reverse=True)
    except OSError:
        sortedfiles = sorted(files, key=lambda x: len(x[1]), reverse=True)

    for pos, entry in enumerate(sortedfiles[:displaycount], start=1):
        try:
            checksum, paths = entry
            checksum = checksum.hex()
            prefix = os.path.dirname(os.path.commonprefix(paths))
            print("\n(%d) Found %d duplicate files (size: %d Bytes, sha256 %r) in %s/:" % \
                  (pos, len(paths), os.path.getsize(paths[0]), checksum, prefix))
            for i, path in enumerate(sorted(paths), start=1):
                print("%2d: %s" % (i, path))
        except OSError as e:
            print("\nCould not display duplicate entry, file might have been deleted: %s" % e, file=sys.stderr)

def print_duplicates_script_friendly(files, displaycount=None):
    """Prints a list of duplicate groups in a machine-readable format: one file per line, as '<group index><TAB><path>'."""
    try:
        sortedfiles = sorted(files, key=lambda x: (len(x[1]), os.path.getsize(x[1][0])), reverse=True)
    except OSError:
        sortedfiles = sorted(files, key=lambda x: len(x[1]), reverse=True)
    for i, entry in enumerate(sortedfiles[:displaycount]):
        _, paths = entry
        for path in sorted(paths):
            print("%d\t%s" % (i, path))

def delete_duplicates(files):
    """Deletes all but the newest file of every duplicate group."""
    for checksum, paths in files:
        try:
            sortedpaths = sorted(paths, key=lambda x: os.path.getmtime(x), reverse=True)
            for path in sortedpaths[1:]:
                try:
                    print("deleting: %s" % path, file=sys.stderr)
                    os.remove(path)
                except OSError as e:
                    print("could not delete file: %s" % e, file=sys.stderr)
        except OSError as e:
            print("could not access file for deletion: %s" % e, file=sys.stderr)

def get_hash_key(filename, partial=False):
    """Calculates the SHA-256 hash of a file. If partial is True, only the first 1024 bytes are hashed."""
    try:
        hash_object = hashlib.sha256()
        with open(filename, 'rb') as inputfile:
            if partial:
                hash_object.update(inputfile.read(1024))
            else:
                # Read in 8 KiB chunks so that large files need not fit into memory.
                for chunk in iter(lambda: inputfile.read(1024 * 8), b""):
                    hash_object.update(chunk)
            return hash_object.digest()
    except OSError as e:
        print("permission denied for file: %s" % e.filename, file=sys.stderr)
        return None
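
# The staged filtering below regroups the surviving files by progressively more
# expensive keys: file size first, then a hash of the first 1024 bytes, then a
# hash of the whole file. Only groups with more than one member survive a stage.
# The top**2 and top*2 limits prune the candidate list between stages in --fast
# mode; without --fast, top is passed in as None and nothing is pruned.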
def filter_duplicate_files(files, fast, top=None):
    """Finds all duplicate files among the given file paths."""
    duplicates = {}
    update = UpdatePrinter(stream=sys.stderr).update

    iterations = [(os.path.getsize, "By Size", top**2 if top else None),
                  (partial(get_hash_key, partial=True), "By Partial Hash", top*2 if top else None),
                  (get_hash_key, "By Full Hash", None)]

    for keyfunction, name, topcount in iterations:
        duplicates.clear()
        count = 0
        duplicate_count = 0
        i = 0
        for i, filepath in enumerate(files, start=1):
            key = keyfunction(filepath)
            if key is None:
                continue

            duplicates.setdefault(key, []).append(filepath)
            if len(duplicates[key]) > 1:
                count += 1
                if len(duplicates[key]) == 2:
                    count += 1  # the group's first file counts as a duplicate too
                    duplicate_count += 1

            update("(%s) %d Files checked, %d duplicates found (%d files)" % (name, i, duplicate_count, count))
        else:
            # for-else: runs once after the loop to force a final status line
            update("(%s) %d Files checked, %d duplicates found (%d files)" % (name, i, duplicate_count, count), force=True)
            print("", file=sys.stderr)
        # keep only groups with more than one member for the next stage,
        # optionally pruned to the topcount largest groups
        sortedfiles = sorted(duplicates.values(), key=len, reverse=True)
        files = [filepath for filepaths in sortedfiles[:topcount] if len(filepaths) > 1 for filepath in filepaths]

    return [(checksum, paths) for checksum, paths in duplicates.items() if len(paths) > 1]

def on_walk_error(err):
    """Error handler for os.walk."""
    print("Cannot access directory '%s'. Permission denied." % err.filename, file=sys.stderr)
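
# A file counts as hidden if any component of its absolute path starts with a
# dot, so files inside hidden directories are excluded as well. Symbolic links
# are always skipped, since a link and its target would otherwise be reported
# as duplicates of each other.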
def get_files(directories, include_hidden, min_file_size=1):
    """Yields all files in the given directories which pass the filter rules."""
    for directory in directories:
        for dirpath, _, filenames in os.walk(directory, onerror=on_walk_error):
            for filename in filenames:
                filepath = os.path.join(dirpath, filename)
                try:
                    if (not os.path.islink(filepath)
                            and (include_hidden or
                                 all(not part.startswith(".") for part in os.path.abspath(filepath).split(os.sep)))
                            and (os.path.getsize(filepath) >= min_file_size)):
                        yield filepath
                except OSError:
                    # e.g. permission denied
                    print("Cannot access file '%s'. Permission denied." % filepath, file=sys.stderr)
                    continue

if __name__ == "__main__":
    ARGS = parse_arguments()
    if ARGS.include_empty:
        ARGS.min_file_size = 0
    FILES = get_files(ARGS.directories, ARGS.include_hidden, ARGS.min_file_size)
    DUPLICATES = filter_duplicate_files(FILES, ARGS.fast, ARGS.top if ARGS.fast else None)

    if ARGS.script_friendly:
        print_duplicates_script_friendly(DUPLICATES, ARGS.top)
    else:
        print_duplicates_human_readable(DUPLICATES, ARGS.top)

    if ARGS.delete_old:
        delete_duplicates(DUPLICATES)

    if ARGS.fast:
        print("\nFound %d duplicates at least (%d duplicate files total) -- More duplicates may exist." % \
              (len(DUPLICATES), reduce(lambda sum_value, files: sum_value + len(files[1]), DUPLICATES, 0)), file=sys.stderr)
    else:
        print("\nFound %d duplicates (%d duplicate files total)" % \
              (len(DUPLICATES), reduce(lambda sum_value, files: sum_value + len(files[1]), DUPLICATES, 0)), file=sys.stderr)

--------------------------------------------------------------------------------