├── requirements-dev.txt ├── .gitignore ├── setup.cfg ├── .commit-rules ├── .travis.yml ├── deduplicated ├── __main__.py ├── web │ ├── __main__.py │ ├── templates │ │ ├── indir.html │ │ ├── dirupdate.html │ │ ├── dirlist.html │ │ └── dirinfo.html │ ├── static │ │ └── css │ │ │ ├── app.css │ │ │ └── normalize.css │ └── __init__.py ├── cmd.py └── __init__.py ├── MANIFEST.in ├── .editorconfig ├── README.rst ├── LICENSE └── setup.py /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | flake8 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.py[cod] 3 | 4 | build/ 5 | dist/ 6 | *.egg-info/ 7 | *.egg 8 | 9 | /env/ 10 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal = 1 3 | 4 | [flake8] 5 | max-line-length = 119 6 | 7 | [pep8] 8 | max-line-length = 119 9 | -------------------------------------------------------------------------------- /.commit-rules: -------------------------------------------------------------------------------- 1 | Commit Files: git diff --cached --name-only | egrep '\.py[cod]$|~$' && exit 1 || : 2 | Git Check: git diff --cached --check 3 | flake8: flake8 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" 4 | 5 | install: 6 | - python setup.py install 7 | - pip install -r requirements-dev.txt 8 | 9 | script: 10 | - flake8 11 | -------------------------------------------------------------------------------- /deduplicated/__main__.py: -------------------------------------------------------------------------------- 1 | # -*- 
coding: utf-8 -*- 2 | # 3 | # Copyright (c) 2015 Eduardo Klosowski 4 | # License: MIT (see LICENSE for details) 5 | # 6 | 7 | from .cmd import main 8 | 9 | main() 10 | -------------------------------------------------------------------------------- /deduplicated/web/__main__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Copyright (c) 2015 Eduardo Klosowski 4 | # License: MIT (see LICENSE for details) 5 | # 6 | 7 | from . import main 8 | 9 | main() 10 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst 2 | include LICENSE 3 | include MANIFEST.in 4 | recursive-include deduplicated/web/static * 5 | recursive-include deduplicated/web/templates * 6 | recursive-exclude * __pycache__ 7 | recursive-exclude * *.py[cod] 8 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | indent_style = space 5 | indent_size = 4 6 | end_of_line = lf 7 | charset = utf-8 8 | trim_trailing_whitespace = true 9 | insert_final_newline = true 10 | 11 | [*.css] 12 | indent_size = 2 13 | 14 | [*.html] 15 | indent_size = 2 16 | -------------------------------------------------------------------------------- /deduplicated/web/templates/indir.html: -------------------------------------------------------------------------------- 1 | {% extends 'dirinfo.html' %} 2 | 3 | {% block application %} 4 |
5 |
6 |
Same as "{{ name }}"
7 | 8 | 9 | {% for filename in files %} 10 | 11 | 12 | 13 | {% endfor %} 14 | 15 |
{{ filename }}
16 |
17 |
18 | {% endblock %} 19 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Deduplicated 2 | ============ 3 | 4 | Check duplicated files. 5 | 6 | Example of use 7 | -------------- 8 | 9 | .. code-block:: bash 10 | 11 | # Update directory cache 12 | $ deduplicated update /path/for/check 13 | 14 | # List duplicated files in directory cache 15 | $ deduplicated duplicated /path/for/check 16 | 17 | # Update and list duplicated files 18 | $ deduplicated check /path/for/check 19 | 20 | # Check if file is in directory cache 21 | $ deduplicated indir myfile /path/for/check 22 | 23 | # Start web version, connect to http://127.0.0.1:5050 24 | $ deduplicated-web 25 | 26 | 27 | Check 28 | ----- 29 | 30 | This project uses: 31 | - `Commit Rules `_. 32 | - `EditorConfig `_. 33 | -------------------------------------------------------------------------------- /deduplicated/web/templates/dirupdate.html: -------------------------------------------------------------------------------- 1 | {% extends 'dirinfo.html' %} 2 | 3 | {% block application %} 4 |
5 |
6 |
Files
7 |
    8 |
  • +{{ outtree[0] }}
  • 9 |
  • ~{{ outtree[1] }}
  • 10 |
  • -{{ outtree[2] }}
  • 11 |
12 |
13 | 14 |
15 |
Update Hash
16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | {% for filename in outhash %} 24 | 25 | 26 | 27 | {% endfor %} 28 | 29 |
File
{{ filename }}
30 |
31 |
32 | {% endblock %} 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Eduardo Augusto Klosowski 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | 4 | version = __import__('deduplicated').__version__ 5 | with open('README.rst', 'rb') as f: 6 | long_description = f.read().decode('utf-8') 7 | 8 | 9 | setup( 10 | name='deduplicated', 11 | version=version, 12 | packages=find_packages(), 13 | 14 | extras_require={ 15 | 'web': ['Flask'], 16 | }, 17 | 18 | author='Eduardo Klosowski', 19 | author_email='eduardo_klosowski@yahoo.com', 20 | 21 | description='Check duplicated files', 22 | long_description=long_description, 23 | license='MIT', 24 | url='https://github.com/eduardoklosowski/deduplicated', 25 | 26 | include_package_data=True, 27 | zip_safe=False, 28 | 29 | entry_points={ 30 | 'console_scripts': [ 31 | 'deduplicated = deduplicated.cmd:main', 32 | 'deduplicated-web = deduplicated.web:main [web]' 33 | ], 34 | }, 35 | 36 | classifiers=[ 37 | 'Development Status :: 2 - Pre-Alpha', 38 | 'Environment :: Console', 39 | 'Environment :: Web Environment', 40 | 'Intended Audience :: System Administrators', 41 | 'License :: OSI Approved :: MIT License', 42 | 'Operating System :: OS Independent', 43 | 'Programming Language :: Python', 44 | 'Programming Language :: Python :: 2', 45 | 'Programming Language :: Python :: 2.7', 46 | 'Programming Language :: Python :: 3', 47 | 'Programming Language :: Python :: 3.4', 48 | 'Topic :: Software Development :: Libraries :: Python Modules', 49 | 'Topic :: System :: Archiving', 50 | ], 51 | ) 52 | -------------------------------------------------------------------------------- /deduplicated/web/static/css/app.css: -------------------------------------------------------------------------------- 1 | html { 2 | height: 100%; 3 | color: #333; 4 | font-size: 62.5%; 5 | } 6 | 7 | body { 8 | height: 100%; 9 | } 10 | 11 | a { 12 | color: #666; 13 | } 14 | 15 | 
header { 16 | padding: .5rem 1rem; 17 | background-color: #789; 18 | color: #fff; 19 | font-weight: bold; 20 | } 21 | 22 | header a { 23 | color: #fff; 24 | text-decoration: none; 25 | } 26 | 27 | h1 { 28 | margin: 0; 29 | font-size: 2.5rem; 30 | } 31 | 32 | ul { 33 | padding-left: 2rem; 34 | } 35 | 36 | .text-right { 37 | text-align: right; 38 | } 39 | 40 | 41 | /* Form */ 42 | 43 | input, 44 | textarea { 45 | border: .2rem solid #aaa; 46 | } 47 | 48 | .box textarea { 49 | width: calc(100% - 2rem); 50 | height: 8rem; 51 | } 52 | 53 | input[type=file] { 54 | border: 0; 55 | } 56 | 57 | input { 58 | height: 2rem; 59 | } 60 | 61 | button { 62 | height: 2.6rem; 63 | padding: .1rem .6rem; 64 | border: .2rem solid #444; 65 | background-color: #789; 66 | color: #fff; 67 | } 68 | 69 | 70 | /* Page */ 71 | 72 | section[role=application] { 73 | margin: 1rem; 74 | font-size: 1.6rem; 75 | } 76 | 77 | .dirinfo { 78 | margin: 0 0 1rem; 79 | padding: 0; 80 | text-align: center; 81 | list-style: none; 82 | } 83 | 84 | .dirinfo li { 85 | display: inline; 86 | } 87 | 88 | .dirinfo .title { 89 | font-weight: bold; 90 | } 91 | 92 | .box { 93 | margin-bottom: 1rem; 94 | border: .2rem solid #aaa; 95 | background-color: #def; 96 | } 97 | 98 | .box > .content { 99 | margin: 1rem; 100 | } 101 | 102 | .box.table { 103 | border: 0; 104 | background-color: #fff; 105 | } 106 | 107 | .box.table > header { 108 | border: .2rem solid #aaa; 109 | border-bottom: 0; 110 | } 111 | 112 | table.list { 113 | width: 100%; 114 | } 115 | 116 | table.list th, 117 | table.list td { 118 | padding: .5rem; 119 | border: .2rem solid #aaa; 120 | } 121 | 122 | table.list th { 123 | background-color: #def; 124 | } 125 | -------------------------------------------------------------------------------- /deduplicated/web/templates/dirlist.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | {% block title %}Deduplicated{% endblock %} 6 | {% block css %} 7 | 8 | 9 | 
{% endblock %} 10 | 11 | 12 |
13 |

{% block ptitle %}Deduplicated{% endblock %}

14 |
15 | {% block application %} 16 |
17 |
18 |
Add Directory
19 |
20 | Directory: 21 | 22 | 23 |
24 |
25 | 26 |
27 |
Directories
28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | {% for directory in directories %} 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | {% endfor %} 51 | 52 |
DirectoryLast UpdateDuplicated
HashFilesSize
{{ directory }}{{ directory.get_lastupdate()|default('-', true) }}{% if not directory.is_completed() %} [Incomplete]{% endif %}{{ directory.get_duplicated_hash() }}{{ directory.get_duplicated_files() }}{{ directory.get_duplicated_size()|str_size }}
53 |
54 |
55 | {% endblock %} 56 | 57 | 58 | -------------------------------------------------------------------------------- /deduplicated/web/templates/dirinfo.html: -------------------------------------------------------------------------------- 1 | {% extends 'dirlist.html' %} 2 | 3 | {% block title %}{{ super() }} - {{ directory }}{% endblock %} 4 | {% block ptitle %}{{ super() }} - {{ directory }}{% endblock %} 5 | 6 | {% block application %} 7 |
8 |
    9 |
  • Last update: {{ directory.get_lastupdate()|default('-', true) }}
  • 10 |
  • 11 | Duplicated (hash/file/size): 12 | {{ directory.get_duplicated_hash() }} / 13 | {{ directory.get_duplicated_files() }} / 14 | {{ directory.get_duplicated_size()|str_size() }} 15 |
  • 16 |
17 | 18 |
19 |
Actions
20 | 24 |
25 | 26 |
27 |
Options
28 |
29 |
    30 |
  • 31 | Follow Links 32 |
  • 33 |
  • 34 | Excludes: 35 |
  • 36 |
37 | 38 |
39 |
40 | 41 |
42 |
Check file exist in this directory
43 |
44 | 45 | 46 |
47 |
48 | 49 |
50 |
Files Duplicated
51 |
52 | 53 | 54 | {% for hashfile, size, files in directory.get_duplicated()|reverse %} 55 | 56 | 57 | 58 | 59 | {% for filename in files %} 60 | 64 | {% endfor %} 65 | {% endfor %} 66 | 67 |
{{ size|str_size() }}{{ hashfile }}
61 | 63 |
68 | 69 |
70 |
71 |
72 | {% endblock %} 73 | -------------------------------------------------------------------------------- /deduplicated/web/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Copyright (c) 2015 Eduardo Klosowski 4 | # License: MIT (see LICENSE for details) 5 | # 6 | 7 | from flask import Flask, redirect, render_template, request 8 | import jinja2 9 | from tempfile import NamedTemporaryFile 10 | 11 | from .. import Directory, directory_by_hash, directory_delete, directory_list, str_size 12 | 13 | 14 | # Init app 15 | 16 | jinja2.filters.FILTERS['str_size'] = str_size 17 | 18 | app = Flask(__name__) 19 | 20 | 21 | # Pages 22 | 23 | @app.route('/') 24 | def dirlist(): 25 | directories = [Directory(d, checkvalid=False) for d in directory_list()] 26 | return render_template('dirlist.html', 27 | directories=directories) 28 | 29 | 30 | @app.route('/dir/add', methods=['post']) 31 | def diradd(): 32 | dirname = request.form.get('directory', '') 33 | if dirname: 34 | Directory(dirname) 35 | return redirect('/') 36 | 37 | 38 | @app.route('/dir/') 39 | def dirinfo(dirhash): 40 | return render_template('dirinfo.html', 41 | directory=directory_by_hash(dirhash, checkvalid=False)) 42 | 43 | 44 | @app.route('/dir//option', methods=['post']) 45 | def diroption(dirhash): 46 | directory = directory_by_hash(dirhash) 47 | directory.set_option_follow_link('followlink' in request.form) 48 | directory.save_meta() 49 | directory.exclude = request.form.get('exclude', '').splitlines() 50 | directory.save_exclude() 51 | return redirect('/dir/%s' % dirhash) 52 | 53 | 54 | @app.route('/dir//update') 55 | def dirupdate(dirhash): 56 | directory = directory_by_hash(dirhash) 57 | outtree = directory.update_tree() 58 | outhash = list(directory.hash_for_update()) 59 | [directory.update_hash(i) for i in outhash] 60 | return render_template('dirupdate.html', 61 | directory=directory, 62 | outtree=outtree, 63 | 
outhash=outhash) 64 | 65 | 66 | @app.route('/dir//delete') 67 | def dirdelete(dirhash): 68 | directory_delete(dirhash) 69 | return redirect('/') 70 | 71 | 72 | @app.route('/dir//deletefile', methods=['post']) 73 | def dirdeletefile(dirhash): 74 | directory = directory_by_hash(dirhash) 75 | for filename in request.form.getlist('file'): 76 | directory.delete_file(filename) 77 | return redirect('/dir/%s' % dirhash) 78 | 79 | 80 | @app.route('/dir//indir', methods=['post']) 81 | def indir(dirhash): 82 | directory = directory_by_hash(dirhash) 83 | with NamedTemporaryFile(prefix='deduplicated-') as tmpfile: 84 | request.files.get('file').save(tmpfile.name) 85 | files = directory.is_file_in(tmpfile.name) 86 | return render_template('indir.html', 87 | directory=directory, 88 | name=request.files.get('file').filename, 89 | files=files) 90 | 91 | 92 | # Run 93 | 94 | def main(): 95 | app.run(port=5050) 96 | -------------------------------------------------------------------------------- /deduplicated/cmd.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Copyright (c) 2015 Eduardo Klosowski 4 | # License: MIT (see LICENSE for details) 5 | # 6 | 7 | from __future__ import print_function 8 | from __future__ import unicode_literals 9 | 10 | import argparse 11 | import sys 12 | 13 | from . 
import __version__, Directory, directory_delete, directory_list, str_size 14 | 15 | 16 | # Argument parser 17 | 18 | parser = argparse.ArgumentParser(prog='deduplicated') 19 | subparsers = parser.add_subparsers(dest='action') 20 | 21 | parser.add_argument('--version', action='version', version='%(prog)s ' + __version__) 22 | 23 | # list command 24 | parser_list = subparsers.add_parser('list', 25 | help='list directories') 26 | 27 | # update command 28 | parser_update = subparsers.add_parser('update', 29 | help='update directories informations') 30 | parser_update.add_argument('directory', nargs='*', 31 | help='list of directories, if not present use all') 32 | 33 | # duplicated command 34 | parser_duplicated = subparsers.add_parser('duplicated', 35 | help='list duplicated files in directories') 36 | parser_duplicated.add_argument('directory', nargs='*', 37 | help='list of directories, if not present use all') 38 | 39 | # check command 40 | parser_check = subparsers.add_parser('check', 41 | help='update and duplicated directories') 42 | parser_check.add_argument('directory', nargs='*', 43 | help='list of directories, if not present use all') 44 | 45 | # delete command 46 | parser_delete = subparsers.add_parser('delete', 47 | help='delete directory information') 48 | parser_delete.add_argument('directory', nargs='+', 49 | help='list of directories') 50 | 51 | # indir command 52 | parser_indir = subparsers.add_parser('indir', 53 | help='check file exists in directories') 54 | parser_indir.add_argument('file', 55 | help='file for check') 56 | parser_indir.add_argument('directory', nargs='*', 57 | help='list of directories, if not present use all') 58 | 59 | # delindir command 60 | parser_delindir = subparsers.add_parser('delindir', 61 | help='delete duplicated files in directory') 62 | parser_delindir.add_argument('directory', nargs=1, 63 | help='directory information') 64 | parser_delindir.add_argument('delindir', 65 | help='subdirectory for delete') 66 | 67 | # 
# Utils

def _format_row(template, widths, cells):
    """Fill a ``%*s`` style template with alternating (width, cell) arguments."""
    arguments = []
    for width, cell in zip(widths, cells):
        arguments.extend((width, cell))
    return template % tuple(arguments)


def print_directories(directories):
    """Print a table with the duplicate statistics of each directory.

    Prints nothing when ``directories`` is empty.
    """
    rows = [(str(directory),
             # a trailing 'i' marks a directory whose last update is incomplete
             str(directory.get_lastupdate() or '-') + ('i' if not directory.is_completed() else ''),
             str(directory.get_duplicated_hash()),
             str(directory.get_duplicated_files()),
             str_size(directory.get_duplicated_size()))
            for directory in directories]
    if not rows:
        return

    header = ('Directory', 'Last update', 'Hashs', 'Files', 'Size')
    # Each column is as wide as its longest cell, header included.
    sizes = [max([len(title)] + [len(row[index]) for row in rows])
             for index, title in enumerate(header)]
    print(_format_row('%-*s %-*s %-*s %-*s %-*s', sizes, header))
    for row in rows:
        # Text columns are left aligned, numeric columns right aligned.
        print(_format_row('%-*s %-*s %*s %*s %*s', sizes, row))


def print_update_tree(directory):
    """Rescan the directory tree and print the added/changed/removed counters."""
    print('==> Update tree (%s): ' % directory, end='')
    print('+%d ~%d -%d' % directory.update_tree())


def print_update_hash(directory):
    """Hash every file pending update, printing each name before hashing it."""
    for filename in directory.hash_for_update():
        print('Updating %s' % filename)
        directory.update_hash(filename)


def print_duplicated(directory):
    """Print every group of duplicated files followed by the totals."""
    print('==> Duplicated (%s):' % directory)
    for hashfile, size, files in directory.get_duplicated():
        print('%s [%s]' % (str_size(size), hashfile))
        print(' %s' % '\n '.join(files))
    print('%d hashs (%d files) %s' % (
        directory.get_duplicated_hash(),
        directory.get_duplicated_files(),
        str_size(directory.get_duplicated_size()),
    ))


def print_optimize(directory, sizes):
    """Print the space saved by optimizing the cache database.

    ``sizes`` is the ``(original, optimized, saved)`` tuple returned by
    ``Directory.optimize_database()``.
    """
    print('%9s - %s (%s > %s)' % (str_size(sizes[2]), directory, str_size(sizes[0]), str_size(sizes[1])))


# Actions: each handler receives the parsed arguments and returns the exit code

def _action_list(args):
    """List every registered directory, even those missing on disk."""
    print_directories([Directory(dirname, checkvalid=False) for dirname in directory_list()])
    return 0


def _action_update(args):
    """Rescan and hash each directory."""
    for dirname in args.directory:
        directory = Directory(dirname)
        print_update_tree(directory)
        print_update_hash(directory)
    return 0


def _action_duplicated(args):
    """Print the duplicated files of each directory."""
    for dirname in args.directory:
        print_duplicated(Directory(dirname))
    return 0


def _action_check(args):
    """Update each directory and then print its duplicated files."""
    for dirname in args.directory:
        directory = Directory(dirname)
        print_update_tree(directory)
        print_update_hash(directory)
        print_duplicated(directory)
    return 0


def _action_delete(args):
    """Remove the cached information of each directory."""
    for dirname in args.directory:
        directory = Directory(dirname, checkvalid=False)
        directory_delete(directory.get_hash())
    return 0


def _action_indir(args):
    """Print the files equal to ``args.file``; exit code 1 when there is none."""
    found = False
    for dirname in args.directory:
        for filename in Directory(dirname).is_file_in(args.file):
            found = True
            print(filename)
    return 0 if found else 1


def _action_delindir(args):
    """Delete the duplicated files stored under a subdirectory."""
    delindir = args.delindir
    # The trailing slash keeps 'foo' from also matching 'foobar/...'.
    if not delindir.endswith('/'):
        delindir += '/'
    Directory(args.directory[0]).delete_duplicated_indir(delindir)
    return 0


def _action_optimize(args):
    """Optimize each cache database, reporting only those that shrank."""
    for dirname in args.directory:
        directory = Directory(dirname)
        sizes = directory.optimize_database()
        if sizes[2]:
            print_optimize(directory, sizes)
    return 0


_ACTIONS = {
    'list': _action_list,
    'update': _action_update,
    'duplicated': _action_duplicated,
    'check': _action_check,
    'delete': _action_delete,
    'indir': _action_indir,
    'delindir': _action_delindir,
    'optimize': _action_optimize,
}


def main():
    """Command line entry point: parse the arguments and run the chosen action.

    Exits through ``sys.exit`` with the action's exit code. When no action
    was given the usage message is printed and the function returns.
    """
    args = parser.parse_args()

    # Commands that accept directories default to every registered one.
    if 'directory' in args and not args.directory:
        args.directory = directory_list()

    handler = _ACTIONS.get(args.action)
    if handler is None:
        parser.print_usage()
        return
    sys.exit(handler(args))
75 | */ 76 | 77 | [hidden], 78 | template { 79 | display: none; 80 | } 81 | 82 | /* Links 83 | ========================================================================== */ 84 | 85 | /** 86 | * Remove the gray background color from active links in IE 10. 87 | */ 88 | 89 | a { 90 | background-color: transparent; 91 | } 92 | 93 | /** 94 | * Improve readability when focused and also mouse hovered in all browsers. 95 | */ 96 | 97 | a:active, 98 | a:hover { 99 | outline: 0; 100 | } 101 | 102 | /* Text-level semantics 103 | ========================================================================== */ 104 | 105 | /** 106 | * Address styling not present in IE 8/9/10/11, Safari, and Chrome. 107 | */ 108 | 109 | abbr[title] { 110 | border-bottom: 1px dotted; 111 | } 112 | 113 | /** 114 | * Address style set to `bolder` in Firefox 4+, Safari, and Chrome. 115 | */ 116 | 117 | b, 118 | strong { 119 | font-weight: bold; 120 | } 121 | 122 | /** 123 | * Address styling not present in Safari and Chrome. 124 | */ 125 | 126 | dfn { 127 | font-style: italic; 128 | } 129 | 130 | /** 131 | * Address variable `h1` font-size and margin within `section` and `article` 132 | * contexts in Firefox 4+, Safari, and Chrome. 133 | */ 134 | 135 | h1 { 136 | font-size: 2em; 137 | margin: 0.67em 0; 138 | } 139 | 140 | /** 141 | * Address styling not present in IE 8/9. 142 | */ 143 | 144 | mark { 145 | background: #ff0; 146 | color: #000; 147 | } 148 | 149 | /** 150 | * Address inconsistent and variable font size in all browsers. 151 | */ 152 | 153 | small { 154 | font-size: 80%; 155 | } 156 | 157 | /** 158 | * Prevent `sub` and `sup` affecting `line-height` in all browsers. 
159 | */ 160 | 161 | sub, 162 | sup { 163 | font-size: 75%; 164 | line-height: 0; 165 | position: relative; 166 | vertical-align: baseline; 167 | } 168 | 169 | sup { 170 | top: -0.5em; 171 | } 172 | 173 | sub { 174 | bottom: -0.25em; 175 | } 176 | 177 | /* Embedded content 178 | ========================================================================== */ 179 | 180 | /** 181 | * Remove border when inside `a` element in IE 8/9/10. 182 | */ 183 | 184 | img { 185 | border: 0; 186 | } 187 | 188 | /** 189 | * Correct overflow not hidden in IE 9/10/11. 190 | */ 191 | 192 | svg:not(:root) { 193 | overflow: hidden; 194 | } 195 | 196 | /* Grouping content 197 | ========================================================================== */ 198 | 199 | /** 200 | * Address margin not present in IE 8/9 and Safari. 201 | */ 202 | 203 | figure { 204 | margin: 1em 40px; 205 | } 206 | 207 | /** 208 | * Address differences between Firefox and other browsers. 209 | */ 210 | 211 | hr { 212 | -moz-box-sizing: content-box; 213 | box-sizing: content-box; 214 | height: 0; 215 | } 216 | 217 | /** 218 | * Contain overflow in all browsers. 219 | */ 220 | 221 | pre { 222 | overflow: auto; 223 | } 224 | 225 | /** 226 | * Address odd `em`-unit font size rendering in all browsers. 227 | */ 228 | 229 | code, 230 | kbd, 231 | pre, 232 | samp { 233 | font-family: monospace, monospace; 234 | font-size: 1em; 235 | } 236 | 237 | /* Forms 238 | ========================================================================== */ 239 | 240 | /** 241 | * Known limitation: by default, Chrome and Safari on OS X allow very limited 242 | * styling of `select`, unless a `border` property is set. 243 | */ 244 | 245 | /** 246 | * 1. Correct color not being inherited. 247 | * Known issue: affects color of disabled elements. 248 | * 2. Correct font properties not being inherited. 249 | * 3. Address margins set differently in Firefox 4+, Safari, and Chrome. 
250 | */ 251 | 252 | button, 253 | input, 254 | optgroup, 255 | select, 256 | textarea { 257 | color: inherit; /* 1 */ 258 | font: inherit; /* 2 */ 259 | margin: 0; /* 3 */ 260 | } 261 | 262 | /** 263 | * Address `overflow` set to `hidden` in IE 8/9/10/11. 264 | */ 265 | 266 | button { 267 | overflow: visible; 268 | } 269 | 270 | /** 271 | * Address inconsistent `text-transform` inheritance for `button` and `select`. 272 | * All other form control elements do not inherit `text-transform` values. 273 | * Correct `button` style inheritance in Firefox, IE 8/9/10/11, and Opera. 274 | * Correct `select` style inheritance in Firefox. 275 | */ 276 | 277 | button, 278 | select { 279 | text-transform: none; 280 | } 281 | 282 | /** 283 | * 1. Avoid the WebKit bug in Android 4.0.* where (2) destroys native `audio` 284 | * and `video` controls. 285 | * 2. Correct inability to style clickable `input` types in iOS. 286 | * 3. Improve usability and consistency of cursor style between image-type 287 | * `input` and others. 288 | */ 289 | 290 | button, 291 | html input[type="button"], /* 1 */ 292 | input[type="reset"], 293 | input[type="submit"] { 294 | -webkit-appearance: button; /* 2 */ 295 | cursor: pointer; /* 3 */ 296 | } 297 | 298 | /** 299 | * Re-set default cursor for disabled elements. 300 | */ 301 | 302 | button[disabled], 303 | html input[disabled] { 304 | cursor: default; 305 | } 306 | 307 | /** 308 | * Remove inner padding and border in Firefox 4+. 309 | */ 310 | 311 | button::-moz-focus-inner, 312 | input::-moz-focus-inner { 313 | border: 0; 314 | padding: 0; 315 | } 316 | 317 | /** 318 | * Address Firefox 4+ setting `line-height` on `input` using `!important` in 319 | * the UA stylesheet. 320 | */ 321 | 322 | input { 323 | line-height: normal; 324 | } 325 | 326 | /** 327 | * It's recommended that you don't attempt to style these elements. 328 | * Firefox's implementation doesn't respect box-sizing, padding, or width. 329 | * 330 | * 1. 
Address box sizing set to `content-box` in IE 8/9/10. 331 | * 2. Remove excess padding in IE 8/9/10. 332 | */ 333 | 334 | input[type="checkbox"], 335 | input[type="radio"] { 336 | box-sizing: border-box; /* 1 */ 337 | padding: 0; /* 2 */ 338 | } 339 | 340 | /** 341 | * Fix the cursor style for Chrome's increment/decrement buttons. For certain 342 | * `font-size` values of the `input`, it causes the cursor style of the 343 | * decrement button to change from `default` to `text`. 344 | */ 345 | 346 | input[type="number"]::-webkit-inner-spin-button, 347 | input[type="number"]::-webkit-outer-spin-button { 348 | height: auto; 349 | } 350 | 351 | /** 352 | * 1. Address `appearance` set to `searchfield` in Safari and Chrome. 353 | * 2. Address `box-sizing` set to `border-box` in Safari and Chrome 354 | * (include `-moz` to future-proof). 355 | */ 356 | 357 | input[type="search"] { 358 | -webkit-appearance: textfield; /* 1 */ 359 | -moz-box-sizing: content-box; 360 | -webkit-box-sizing: content-box; /* 2 */ 361 | box-sizing: content-box; 362 | } 363 | 364 | /** 365 | * Remove inner padding and search cancel button in Safari and Chrome on OS X. 366 | * Safari (but not Chrome) clips the cancel button when the search input has 367 | * padding (and `textfield` appearance). 368 | */ 369 | 370 | input[type="search"]::-webkit-search-cancel-button, 371 | input[type="search"]::-webkit-search-decoration { 372 | -webkit-appearance: none; 373 | } 374 | 375 | /** 376 | * Define consistent border, margin, and padding. 377 | */ 378 | 379 | fieldset { 380 | border: 1px solid #c0c0c0; 381 | margin: 0 2px; 382 | padding: 0.35em 0.625em 0.75em; 383 | } 384 | 385 | /** 386 | * 1. Correct `color` not being inherited in IE 8/9/10/11. 387 | * 2. Remove padding so people aren't caught out if they zero out fieldsets. 388 | */ 389 | 390 | legend { 391 | border: 0; /* 1 */ 392 | padding: 0; /* 2 */ 393 | } 394 | 395 | /** 396 | * Remove default vertical scrollbar in IE 8/9/10/11. 
# Global Vars

__version__ = '1.0.dev0'
CACHE_DIR = os.path.join(os.path.expanduser('~'), '.deduplicated')


# Utils

def sha1_file(filename, blocksize=2 ** 16):
    """Return the hexadecimal SHA-1 digest of the content of ``filename``.

    The file is read in chunks of ``blocksize`` bytes, so large files are
    never loaded into memory at once.
    """
    digest = sha1()
    with open(filename, 'rb') as fp:
        for block in iter(lambda: fp.read(blocksize), b''):
            digest.update(block)
    return digest.hexdigest()


def str_size(size):
    """Format a size in bytes as text using binary multiples (B, KB ... TB)."""
    size = float(size)
    if size < 2 ** 10:
        return '%d B' % size
    for exponent, unit in ((20, 'KB'), (30, 'MB'), (40, 'GB')):
        if size < 2 ** exponent:
            return '%.2f %s' % (size / (2 ** (exponent - 10)), unit)
    return '%.2f TB' % (size / (2 ** 40))


# Directory

def directory_by_hash(hashid, checkvalid=True):
    """Return the Directory whose cache files are named after ``hashid``.

    Raises IOError when there is no meta file for that hash.
    """
    config = ConfigParser()
    if not config.read([os.path.join(CACHE_DIR, hashid + '.meta')]):
        raise IOError('hash directory not found')
    return Directory(config.get('META', 'path'), checkvalid=checkvalid)


def directory_delete(hashid):
    """Remove every cache file that belongs to the directory ``hashid``.

    Only files named ``<hashid>.<extension>`` are removed. A truncated or
    empty hash never matches the cache of another directory.
    """
    if not hashid:
        return
    for filename in os.listdir(CACHE_DIR):
        if filename.split('.', 1)[0] == hashid:
            os.remove(os.path.join(CACHE_DIR, filename))


def directory_list():
    """Return the paths of the registered directories, sorted ignoring case."""
    dirlist = []
    for filename in os.listdir(CACHE_DIR):
        if not filename.endswith('.meta'):
            continue
        meta = ConfigParser()
        meta.read([os.path.join(CACHE_DIR, filename)])
        dirlist.append(meta.get('META', 'path'))
    return sorted(dirlist, key=lambda x: x.lower())


class Directory(object):
    """A directory tracked by deduplicated.

    Its state lives in three files inside CACHE_DIR, all named after the
    SHA-1 of the absolute path: ``.meta`` (path, last update, options and
    duplicate statistics), ``.exclude`` (one excluded relative path per line)
    and ``.db`` (SQLite table ``files``).

    The ``exist`` column of ``files`` holds the state of each file:
    0 = not seen by the running scan, 1 = new or changed and waiting for its
    hash, 2 = up to date.
    """

    def __init__(self, path, checkvalid=True):
        """Open the cache of ``path``, creating it when needed.

        Raises IOError when ``checkvalid`` is true and ``path`` is not a
        directory.
        """
        path = os.path.abspath(path)
        self._path = path
        if checkvalid and not self.is_valid():
            raise IOError('%s is not valid directory' % path)

        self._hashfile_prefix = os.path.join(CACHE_DIR, self.get_hash())

        self._meta = ConfigParser()
        if os.path.exists(self.get_metafilename()):
            self._meta.read([self.get_metafilename()])
        # Sections missing from the meta file are created with these defaults.
        defaults = (
            ('META', (('path', path), ('lastupdate', ''))),
            ('options', (('follow_link', 'False'),)),
            ('duplicated', (('hash', '0'), ('files', '0'), ('size', '0'))),
        )
        changed = False
        for section, values in defaults:
            if self._meta.has_section(section):
                continue
            self._meta.add_section(section)
            for key, value in values:
                self._meta.set(section, key, value)
            changed = True
        if changed:
            self.save_meta()

        if os.path.exists(self.get_excludefilename()):
            with open(self.get_excludefilename()) as fp:
                self.exclude = fp.read().splitlines()
        else:
            self.exclude = []

        self._conn = sqlite3.connect(self.get_dbfilename())
        self._db = self._conn.cursor()
        self._db.execute('CREATE TABLE IF NOT EXISTS files '
                         '(filename TEXT PRIMARY KEY, mtime FLOAT, size INT, hash TEXT, exist INT)')

    def __str__(self):
        return self._path

    def get_hash(self):
        """Return the SHA-1 of the absolute path, used to name the cache files."""
        return sha1(self._path.encode('utf-8')).hexdigest()

    def is_valid(self):
        """Return True when the path is an existing directory."""
        return os.path.isdir(self._path)

    # Path for files
    def get_dbfilename(self):
        return self._hashfile_prefix + '.db'

    def get_excludefilename(self):
        return self._hashfile_prefix + '.exclude'

    def get_metafilename(self):
        return self._hashfile_prefix + '.meta'

    # Database
    def save_database(self):
        """Commit the pending database changes."""
        self._conn.commit()

    def optimize_database(self):
        """VACUUM the database; return (size before, size after, bytes saved)."""
        size_orig = os.path.getsize(self.get_dbfilename())
        self._db.execute('VACUUM')
        size_opt = os.path.getsize(self.get_dbfilename())
        return (size_orig, size_opt, size_orig - size_opt)

    # Exclude
    def save_exclude(self):
        """Write the exclude list to its file, one entry per line."""
        with open(self.get_excludefilename(), 'w') as fp:
            fp.write('\n'.join(self.exclude))

    # Meta
    def get_duplicated_hash(self):
        return self._meta.getint('duplicated', 'hash')

    def get_duplicated_files(self):
        return self._meta.getint('duplicated', 'files')

    def get_duplicated_size(self):
        return self._meta.getint('duplicated', 'size')

    def get_lastupdate(self):
        """Return the datetime of the last update or None when never updated."""
        lastupdate = self._meta.get('META', 'lastupdate')
        if lastupdate:
            return datetime.strptime(lastupdate, '%Y-%m-%d %H:%M:%S')
        return None

    def now_lastupdate(self):
        """Set the last update to now (in memory; see save_meta)."""
        return self._meta.set('META', 'lastupdate', datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

    def set_option_follow_link(self, value):
        self._meta.set('options', 'follow_link', 'yes' if value else 'no')

    def is_option_follow_link(self):
        return self._meta.getboolean('options', 'follow_link')

    def save_meta(self):
        """Write the meta information to its file."""
        with open(self.get_metafilename(), 'w') as fp:
            self._meta.write(fp)

    # Utils
    def is_completed(self):
        """Return True when no file is waiting for its hash."""
        self._db.execute('SELECT count(*) FROM files WHERE exist != 2')
        return not self._db.fetchone()[0]

    def is_file_in(self, filename):
        """Return the absolute paths of the tracked files equal to ``filename``."""
        hashfile = sha1_file(filename)
        self._db.execute('SELECT filename FROM files WHERE hash = ?', (hashfile,))
        return [os.path.join(self._path, row[0]) for row in self._db.fetchall()]

    def delete_file(self, filename):
        """Delete a tracked file from the database and from the disk.

        ``filename`` is relative to the directory; names that are not in the
        database are ignored. A file that is already missing on disk is not
        an error. Any other removal failure rolls the database back and
        re-raises the OSError.
        """
        self._db.execute('DELETE FROM files WHERE filename = ?', (filename,))
        if self._db.rowcount:
            abs_filename = os.path.join(self._path, filename)
            try:
                os.remove(abs_filename)
            except OSError:
                if os.path.lexists(abs_filename):
                    # The file is still there: keep its row.
                    self._conn.rollback()
                    raise
        self.save_database()

    def delete_duplicated_indir(self, dirname):
        """Delete the duplicated files whose relative name starts with ``dirname``.

        At least one copy of each content is always kept: when every copy of
        a group is inside ``dirname`` the first one (by name) is preserved.
        """
        # Materialized first: delete_file() uses the same database cursor.
        for _, _, files in list(self.get_duplicated()):
            inside = [filename for filename in files if filename.startswith(dirname)]
            if len(inside) == len(files):
                inside = inside[1:]
            for filename in inside:
                self.delete_file(filename)

    def list_files(self, dirname=''):
        """Yield ``(relative name, mtime, size)`` for each file under ``dirname``.

        Entries in the exclude list are skipped (directories with their whole
        content). Symbolic links are skipped unless the follow_link option is
        set. Directories that cannot be listed and entries that cannot be
        stat'ed (broken links, files removed during the scan) are ignored.
        """
        for item in self._walk(dirname, self.is_option_follow_link(), frozenset(self.exclude)):
            yield item

    def _walk(self, dirname, follow_link, exclude):
        """Recursive worker of list_files with the options already resolved."""
        try:
            filenames = sorted(os.listdir(os.path.join(self._path, dirname)))
        except OSError:
            return
        for filename in filenames:
            partial_filename = os.path.join(dirname, filename)
            abs_filename = os.path.join(self._path, partial_filename)

            if partial_filename in exclude:
                continue

            if not follow_link and os.path.islink(abs_filename):
                continue

            if os.path.isdir(abs_filename):
                for item in self._walk(partial_filename, follow_link, exclude):
                    yield item
                continue

            try:
                file_stat = os.stat(abs_filename)
            except OSError:
                continue
            yield partial_filename, file_stat.st_mtime, file_stat.st_size

    def update_duplicated(self):
        """Recompute the duplicate statistics and save them in the meta file.

        The size counts the space that would be freed by keeping a single
        copy of each duplicated content.
        """
        d_hash = 0
        d_files = 0
        d_size = 0

        for _, size, files in self.get_duplicated():
            d_hash += 1
            d_files += len(files)
            d_size += (len(files) - 1) * size

        self._meta.set('duplicated', 'hash', str(d_hash))
        self._meta.set('duplicated', 'files', str(d_files))
        self._meta.set('duplicated', 'size', str(d_size))
        self.save_meta()

    # Steps
    def update_tree(self):
        """Synchronize the database with the files on disk.

        New and modified files are marked as waiting for their hash and rows
        of files that disappeared are removed. Returns the tuple
        ``(inserted, updated, deleted)``.
        """
        insert = 0
        update = 0
        # Every row starts as "not seen"; what is still 0 at the end is gone.
        self._db.execute('UPDATE files SET exist = 0')
        for partial_filename, mtime, size in self.list_files():
            self._db.execute('SELECT mtime, size FROM files WHERE filename = ?', (partial_filename,))
            row = self._db.fetchone()

            # New file
            if row is None:
                self._db.execute('INSERT INTO files (filename, mtime, size, hash, exist) VALUES (?, NULL, ?, NULL, 1)',
                                 (partial_filename, size))
                insert += 1
                continue

            # Update file
            if mtime != row[0] or size != row[1]:
                self._db.execute('UPDATE files SET mtime = NULL, size = ?, hash = NULL, exist = 1 WHERE filename = ?',
                                 (size, partial_filename))
                update += 1
                continue

            # Unmodified file
            self._db.execute('UPDATE files SET exist = 2 WHERE filename = ?', (partial_filename,))

        self._db.execute('DELETE FROM files WHERE exist = 0')
        delete = self._db.rowcount
        self.save_database()
        return insert, update, delete

    def hash_for_update(self):
        """Yield the name of each file waiting for its hash.

        The caller is expected to call update_hash() for each name while
        iterating. When the generator is exhausted the last update time and
        the duplicate statistics are refreshed and saved.
        """
        self._db.execute('SELECT filename FROM files WHERE exist = 1')
        for row in self._db.fetchall():
            yield row[0]
        self.now_lastupdate()
        # update_duplicated() also writes the meta file.
        self.update_duplicated()

    def update_hash(self, filename):
        """Store the current mtime, size and SHA-1 of ``filename``."""
        abs_filename = os.path.join(str(self), filename)
        stat = os.stat(abs_filename)
        self._db.execute('UPDATE files SET mtime = ?, size = ?, hash = ?, exist = 2 WHERE filename = ?',
                         (stat.st_mtime, stat.st_size, sha1_file(abs_filename), filename))
        self.save_database()

    def get_duplicated(self):
        """Yield ``(hash, size, [filenames])`` for each content stored twice or more.

        Groups are ordered by size and the names of each group alphabetically.
        """
        self._db.execute('SELECT hash, COUNT(hash) FROM files GROUP BY hash '
                         'HAVING COUNT(hash) > 1 ORDER BY size ASC')
        for row in self._db.fetchall():
            self._db.execute('SELECT filename, size FROM files WHERE hash = ? ORDER BY filename ASC', (row[0],))
            files = self._db.fetchall()
            yield row[0], files[0][1], [f[0] for f in files]


# Create user directory if not exists

try:
    os.mkdir(CACHE_DIR)
except OSError:
    # Already there (possibly created by a concurrent process) is fine.
    if not os.path.isdir(CACHE_DIR):
        raise