├── requirements.txt ├── LICENSE ├── README.md ├── .gitignore └── DuplicateFinder.py /requirements.txt: -------------------------------------------------------------------------------- 1 | astroid==2.4.0 2 | isort==4.3.21 3 | lazy-object-proxy==1.4.3 4 | mccabe==0.6.1 5 | pylint==2.5.0 6 | six==1.14.0 7 | toml==0.10.0 8 | wrapt==1.12.1 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 geraldlnj 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # duplicate-video-finder 2 | 3 | DuplicateFinder is a Python module (heavily WIP) to detect duplicate videos in a directory. 
4 | 5 | ## Features 6 | 7 | - [x] Detect exact video duplicates 8 | - [ ] Detect similar video duplicates 9 | - [x] Recursive directory support 10 | - [x] Multiple file format support (mp4, mov, webm) 11 | - [ ] Interactive manual CLI deletion 12 | - [ ] Auto deletion mode 13 | 14 | ## Requirements 15 | 16 | - Python 3 17 | - ffprobe 18 | 19 | ## Usage Examples 20 | 21 | In your script, import the DuplicateFinder class: 22 | 23 | ```python 24 | from DuplicateFinder import DuplicateFinder 25 | ``` 26 | 27 | Create a new instance of DuplicateFinder: 28 | 29 | ```python 30 | duplicate_finder = DuplicateFinder("test-folder-here") 31 | ``` 32 | 33 | Find duplicates: 34 | 35 | ```python 36 | duplicate_finder.find_dups() 37 | ``` 38 | 39 | Show formatted results: 40 | 41 | ```python 42 | duplicate_finder.get_results() 43 | ``` 44 | 45 | ### Recursion 46 | 47 | Recursive searching is disabled by default. To enable it, when initialising DuplicateFinder, use: 48 | 49 | ```python 50 | duplicate_finder = DuplicateFinder('path_to_dir', recursive=True) 51 | ``` 52 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | .vscode/ 132 | 133 | .DS_Store 134 | 135 | dup/ 136 | -------------------------------------------------------------------------------- /DuplicateFinder.py: -------------------------------------------------------------------------------- 1 | """ 2 | A module that helps to detect duplicate videos in a folder. 3 | Supports mp4, mov, webm. More formats may be included in the future. 4 | Supports recursive search in a folder. 5 | Groups duplicate videos into buckets. 6 | """ 7 | 8 | from io import BufferedReader 9 | from os import walk, path 10 | import subprocess 11 | import time 12 | import hashlib 13 | from collections import defaultdict 14 | from typing import Callable 15 | 16 | 17 | def timeit(method): 18 | """ 19 | Decorator that measures and prints a method's execution time in milliseconds. 20 | """ 21 | 22 | def timed(*args, **kw): 23 | time_start = time.time() 24 | result = method(*args, **kw) 25 | time_end = time.time() 26 | print("{} took {:.2f} ms".format(method.__name__, (time_end - time_start) * 1000)) 27 | return result 28 | 29 | return timed 30 | 31 | 32 | class DuplicateFinder: 33 | """ 34 | Detects duplicate videos in a directory. 35 | Exact duplicates are found by comparing file sizes, then hashes of 36 | the first 1024 bytes, then full-file hashes. 37 | Inspired by this StackOverflow post: 38 | https://stackoverflow.com/a/3593153 39 | """ 40 | 41 | def __init__(self, video_dir: str, recursive: bool = False): 42 | """ 43 | video_dir (str): Path to videos.
44 | recursive (bool): Toggle recursive search on/off. Default off. 45 | """ 46 | super().__init__() 47 | 48 | if not isinstance(video_dir, str): 49 | raise TypeError("Argument video_dir must be a string!") 50 | self.video_dir = video_dir 51 | 52 | if not isinstance(recursive, bool): 53 | raise TypeError("Optional argument recursive must be a boolean!") 54 | self.recursive = recursive 55 | 56 | # accepted file formats 57 | self.types = ["mp4", "mov", "webm"] 58 | 59 | # list of videos 60 | self.videos_list = [] 61 | 62 | # buckets of dups 63 | self.buckets = [] 64 | 65 | def generate_videos_list(self): 66 | """ 67 | Returns a list of all videos in the directory. 68 | Recurses into subdirectories if self.recursive is True. 69 | """ 70 | videos_list = [] 71 | for (dirpath, _, filenames) in walk(self.video_dir): 72 | videos = [ 73 | path.join(dirpath, f) 74 | for f in filenames 75 | if f.split(".")[-1].lower() in self.types 76 | ] 77 | videos_list.extend(videos) 78 | if not self.recursive: 79 | break 80 | return videos_list 81 | 82 | def get_duration(self, _video_path): 83 | """ 84 | Returns the duration of the video in seconds. 85 | """ 86 | result = subprocess.run( 87 | [ 88 | "ffprobe", 89 | "-v", 90 | "error", 91 | "-show_entries", 92 | "format=duration", 93 | "-of", 94 | "default=noprint_wrappers=1:nokey=1", 95 | _video_path, 96 | ], 97 | stdout=subprocess.PIPE, 98 | stderr=subprocess.STDOUT, 99 | check=True, 100 | ) 101 | return float(result.stdout) 102 | 103 | def chunk_reader(self, fobj: BufferedReader, chunk_size: int = 1024): 104 | """ 105 | Generator that reads a file in chunks of bytes 106 | """ 107 | while True: 108 | chunk = fobj.read(chunk_size) 109 | if not chunk: 110 | return 111 | yield chunk 112 | 113 | def get_hash( 114 | self, 115 | filename: str, 116 | first_chunk_only: bool = False, 117 | hash_algo: Callable = hashlib.sha1, 118 | ): 119 | """ 120 | Returns the hash of either the first chunk of the file or the whole file.
121 | """ 122 | hashobj = hash_algo() 123 | with open(filename, "rb") as _f: 124 | if first_chunk_only: 125 | hashobj.update(_f.read(1024)) 126 | else: 127 | for chunk in self.chunk_reader(_f): 128 | hashobj.update(chunk) 129 | return hashobj.digest() 130 | 131 | def pure_dups(self): 132 | """ 133 | Finds exact duplicates. 134 | Inspired by https://gist.github.com/tfeldmann/fc875e6630d11f2256e746f67a09c1ae 135 | """ 136 | files_by_size = defaultdict(list) 137 | files_by_small_hash = defaultdict(list) 138 | files_by_full_hash = dict() 139 | 140 | for full_path in self.videos_list: 141 | try: 142 | # if the target is a symlink (soft one), this will 143 | # dereference it - change the value to the actual target file 144 | full_path = path.realpath(full_path) 145 | file_size = path.getsize(full_path) 146 | except OSError: 147 | # not accessible (permissions, etc) - pass on 148 | continue 149 | files_by_size[file_size].append(full_path) 150 | 151 | # For all files with the same file size, get their hash on the first 1024 bytes 152 | for size, files in files_by_size.items(): 153 | if len(files) < 2: 154 | continue # this file size is unique, no need to spend cpu cycles on it 155 | 156 | for filename in files: 157 | try: 158 | small_hash = self.get_hash(filename, first_chunk_only=True) 159 | except OSError: 160 | # the file access might've changed till the exec point got here 161 | continue 162 | files_by_small_hash[(size, small_hash)].append(filename) 163 | 164 | # For all files with the hash on the first 1024 bytes, get their hash on the full 165 | # file - collisions will be duplicates 166 | for files in files_by_small_hash.values(): 167 | if len(files) < 2: 168 | # the hash of the first 1k bytes is unique -> skip this file 169 | continue 170 | 171 | for filename in files: 172 | try: 173 | full_hash = self.get_hash(filename, first_chunk_only=False) 174 | except OSError: 175 | # the file access might've changed till the exec point got here 176 | continue 177 | 178 | if 
full_hash in files_by_full_hash: 179 | files_by_full_hash[full_hash].append(filename) 180 | else: 181 | files_by_full_hash[full_hash] = [filename] 182 | 183 | dups = [items for items in files_by_full_hash.values() if len(items) > 1] 184 | self.buckets = dups 185 | 186 | def advanced_dups(self): 187 | """ 188 | Finds non-exact (similar) duplicates. Not implemented yet. 189 | """ 190 | raise NotImplementedError 191 | 192 | def find_dups(self): 193 | """ 194 | Finds duplicate videos in the directory. 195 | """ 196 | self.videos_list = self.generate_videos_list() 197 | 198 | # find exact copies 199 | self.pure_dups() 200 | 201 | # exclude exact copies from file list, keeping the first file in each bucket 202 | flattened_dups = [] 203 | for bucket in self.buckets: 204 | for filepath in bucket[1:]: 205 | flattened_dups.append(filepath) 206 | 207 | def to_keep(filepath): 208 | return filepath not in flattened_dups 209 | 210 | self.videos_list = list(filter(to_keep, self.videos_list)) 211 | 212 | # self.advanced_dups() 213 | 214 | def get_results(self): 215 | """ 216 | Prints detected duplicates in a formatted view. 217 | """ 218 | dup_buckets = [bucket for bucket in self.buckets if len(bucket) > 1] 219 | deep_dup_len = sum(len(bucket) for bucket in dup_buckets) 220 | print("{} duplicate files found\n".format(deep_dup_len - len(dup_buckets))) 221 | 222 | for bucket in dup_buckets: 223 | for file_path in bucket: 224 | print("- {}".format(file_path)) 225 | print("\n") 226 | 227 | 228 | if __name__ == "__main__": 229 | DUPLICATE_FINDER = DuplicateFinder("test-folder-here", recursive=True) 230 | DUPLICATE_FINDER.find_dups() 231 | DUPLICATE_FINDER.get_results() 232 | --------------------------------------------------------------------------------