├── tests
    ├── __init__.py
    ├── readme.md
    ├── run_test.sh
    ├── test_pushpull.py
    ├── test_file_tracking.py
    └── testutils.py
├── PyFiSync
    ├── __init__.py
    ├── dry_run.py
    ├── config_template.py
    ├── PFSwalk.py
    ├── utils.py
    ├── dicttable.py
    ├── ldtable.py
    └── remote_interfaces.py
├── PyFiSync.py
├── setup.py
├── license
├── rsync.md
├── changelog.md
├── rclone.md
├── FAQs.md
├── rclone_b2.md
└── README.md
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/PyFiSync/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 | sys.dont_write_bytecode = True
5 | 
6 | from .main import __version__,__author__
7 | from .main import cli
8 | 
--------------------------------------------------------------------------------
/PyFiSync.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | from __future__ import division, print_function, unicode_literals, absolute_import
4 | 
5 | import sys
6 | sys.dont_write_bytecode = True
7 | 
8 | from PyFiSync import cli
9 | 
10 | if __name__ == '__main__':
11 |     argv = sys.argv[1:] # Arguments besides the function name
12 |     cli(argv)
13 | 
--------------------------------------------------------------------------------
/tests/readme.md:
--------------------------------------------------------------------------------
1 | # Tests
2 | 
3 | These tests are driven by `py.test`. They are not all formal unit tests but rather test the overall behavior.
4 | 
5 | Note that the tests use the default configuration. If that is changed, tests should be carefully updated!
6 | 
7 | ## Failed Tests
8 | 
9 | When running `test_pre_post_bash` with coverage and python3, it throws errors. The tests work fine but not with coverage!
10 | 
--------------------------------------------------------------------------------
/tests/run_test.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Test local being python2 and python3. Inside the tests, it tests:
3 | # local --> local
4 | # local --> remote py2
5 | # local --> remote py3
6 | 
7 | # pip install pytest-cov
8 | 
9 | # Assuming you have py.test installed for both python2 and 3
10 | P0=$(pwd)
11 | cd "$(dirname "$0")"
12 | 
13 | p2dir=$(dirname "$(command which python2)")
14 | ${p2dir}/py.test --cov=PyFiSync --cov-report html test_*.py
15 | 
16 | p3dir=$(dirname "$(command which python3)")
17 | ${p3dir}/py.test --cov=PyFiSync --cov-report html test_*.py
18 | 
19 | cd "$P0"
20 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | import PyFiSync
5 | 
6 | from setuptools import setup
7 | 
8 | setup(
9 |     name='PyFiSync',
10 |     packages=['PyFiSync'],
11 |     long_description=open('README.md').read(),
12 |     entry_points = {
13 |         'console_scripts': ['PyFiSync=PyFiSync.main:cli'],
14 |     },
15 |     version=PyFiSync.__version__,
16 |     description='Python based intelligent file sync with automatic backups and file move/delete tracking.',
17 |     url='https://github.com/Jwink3101/PyFiSync',
18 |     author=PyFiSync.__author__,
19 |     author_email='Jwink3101@users.noreply.github.com',
20 |     license='MIT',
21 | )
22 | 
23 | 
--------------------------------------------------------------------------------
/license:
--------------------------------------------------------------------------------
1 | Copyright 2019 Justin Winokur
2 | 
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4 | 
5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6 | 
7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
8 | 
--------------------------------------------------------------------------------
/rsync.md:
--------------------------------------------------------------------------------
1 | # Rsync Setup
2 | 
3 | First set up ssh keys on your *local* machine:
4 | 
5 |     $ cd
6 |     $ ssh-keygen -t rsa
7 | 
8 |     # It is highly suggested you use a password but you can hit enter
9 |     # twice to skip it
10 | 
11 |     $ cat ~/.ssh/id_rsa.pub | ssh user@remote-system "mkdir -p ~/.ssh && cat >> ~/.ssh/authorized_keys"
12 | 
13 | I will assume that `PyFiSync` has been installed on **both** machines. See [the FAQs](FAQs.md) if there is an issue with paths on the remote machines. It is likely due to `.bashrc` not being loaded properly.
14 | 
15 | Then create the config file (with `PyFiSync init`) and modify it. All options are commented.
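For the ssh+rsync remote, the key settings look something like the following (a minimal sketch; the values are illustrative, but the option names are the ones from the generated config template):

```python
nameB = 'machineB'                # name of the remote machine
pathB = '/full/path/to/sync/dir'  # full path to the sync directory on B
userhost = 'user@remote-system'   # leave as '' if "B" is just a local path
ssh_port = 22
remote_exe = 'PyFiSync'           # or, e.g., '/path/to/python /path/to/PyFiSync.py'
```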
16 | 
17 |     $ PyFiSync reset --force path/to/sync_dir
18 | 
19 | Then sync.
20 | 
21 |     $ PyFiSync sync path/to/syncdir
22 | 
23 | Essentially, this will create a union of the two sides.
24 | 
25 | (The `--all` flag is optional but suggested for the first sync. If using `--all`, it is *highly* suggested to add `--no-backup` since everything would be copied.)
26 | 
27 | Or (`PyFiSync` assumes a `sync .` if not given other options):
28 | 
29 |     $ cd path/to/syncdir
30 |     $ PyFiSync
31 | 
--------------------------------------------------------------------------------
/PyFiSync/dry_run.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | Dry Run equiv
4 | """
5 | from __future__ import division, print_function, unicode_literals, absolute_import
6 | 
7 | from . import utils
8 | 
9 | import sys
10 | if sys.version_info >= (3,):
11 |     unicode = str
12 |     xrange = range
13 | 
14 | def apply_action_queue(queue,log,name,config):
15 |     log.add('\n(DRY-RUN) Applying queue on {}'.format(name))
16 |     for action_dict in queue:
17 |         action,path = list(action_dict.items())[0]
18 |         if action == 'move':
19 |             log.add('(DRY-RUN) move: ' + utils.move_txt(path[0],path[1]))
20 |         elif action in ['backup','delete']:
21 |             if action == 'backup' and config.backup:
22 |                 log.add('(DRY-RUN) backup: ' + path)
23 |             elif action=='delete' and config.backup:
24 |                 log.add('(DRY-RUN) delete (w/ backup): ' + path)
25 |             elif action=='delete' and not config.backup:
26 |                 log.add('(DRY-RUN) delete (w/o backup): ' + path)
27 |         else:
28 |             pass # Do nothing for now
29 | 
30 | def transfer(tqA2B,tqB2A,log,filesA,filesB):
31 |     if len(tqA2B) > 0:
32 |         totA = 0.0
33 |         log.space = 1
34 |         log.add('(DRY-RUN) A >>> B')
35 |         log.space = 4
36 |         for item in tqA2B:
37 |             file = filesA.query_one(path=item)
38 |             if file is not None:
39 |                 totA += file['size']
40 |             log.add('(DRY-RUN) ' + item)
41 |         log.add('(DRY-RUN) Total Size: %0.2f %s' % utils.bytes2human(totA,short=False))
42 | 
43 |     else:
44 |         log.space=1
45 |         log.add('\nNo A >>> B transfers')
46 | 
47 |     if len(tqB2A) > 0:
48 |         totB = 0.0
49 |         log.space = 1
50 |         log.add('(DRY-RUN) A <<< B')
51 |         log.space = 4
52 |         for item in tqB2A:
53 |             file = filesB.query_one(path=item)
54 |             if file is not None:
55 |                 totB += file['size']
56 |             log.add('(DRY-RUN) ' + item)
57 |         log.add('(DRY-RUN) Total Size: %0.2f %s' % utils.bytes2human(totB,short=False))
58 |     else:
59 |         log.space=1
60 |         log.add('\nNo A <<< B transfers')
61 | 
--------------------------------------------------------------------------------
/changelog.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 | 
3 | This is for *major* changes only; especially ones that break or change functionality
4 | 
5 | ## 20210626.0:
6 | 
7 | - Changes the conflict "tag" names from, for example, `file.ext.nameA` to `file.nameA.ext`.
8 | - Tests updated to reflect that change
9 | 
10 | ## 20200916.0
11 | 
12 | * Add `exclude_if_present` option to exclude certain directories based on the existence of a file. This is implemented as a post-listing filter so the directory is still traversed but then filtered later.
While less efficient, it is both simpler to code and allows excludes to work on both sides, making it *much* safer
13 | * Updates `ldtable` to `DictTable` (aa019ec800)
14 | * Adds note that rclone is still supported but users should migrate to [syncrclone](https://github.com/Jwink3101/syncrclone) as it is better for rclone remotes
15 | * *WARNING*: This is likely the last or nearly last version to support python2
16 | 
17 | ## 20200814.0
18 | 
19 | * Fixed buffsize warnings with python 3.8
20 | 
21 | ## 20200423.0:
22 | 
23 | The remote specification of the PyFiSync path has been changed to just specifying `remote_exe`. The prior settings `remote_program` and `PyFiSync_path` will still work but will throw a deprecation warning and may break in future releases.
24 | 
25 | No warnings will be issued but this is likely the last (or nearly last) release that will support Python 2.
26 | 
27 | Other minor changes:
28 | 
29 | * Added stats on transfers and file lists
30 | * Fixed a bug (and added a test) wherein the config file was not executed from the sync directory as expected.
31 | 
32 | ## 20191119:
33 | 
34 | Minor bug fix for the ssh+rsync backend where the default excludes (e.g. `.git`) were being applied even when they were explicitly *not* intended to be excluded.
35 | 
36 | ## 20191115:
37 | 
38 | This change is all about using file hashes.
39 | 
40 | * Added the ability to tell PyFiSync to compare hashes instead of, or in addition to, `mtime`. This is *much* more robust, though more expensive
41 | * `mtime` is still used to resolve conflicts but if two files have differing `mtime` but the same hash (and name), they are not transferred.
42 | * Added the ability to specify any `hashlib.algorithms_guaranteed` for local and rsync remotes.
43 | * Also added dbhash
44 | * Changed adler to return a hex value (and actually made adler an option)
45 | * Improved the hashdb test to ensure it is *actually* being used (it was! It just wasn't tested well)
46 | 
47 | Plus minor bug fixes, typo fixes, and other improvements
48 | 
49 | ## 20190509:
50 | 
51 | This is a **major** change! Assuming PyFiSync has been updated on both sides, it is a good practice to copy your old config file, make a new one, and then manually update as needed.
52 | 
53 | Some (but not all) changes are:
54 | 
55 | * Added an rclone backend. The config file for rclone is very different but the rsync config only has a few minor changes
56 | * Removed `git_exclude` completely. It was a nice feature but really could be accomplished by *only* allowing certain files in git and then excluding them from PyFiSync
57 | * Removed push/pull modes. They were not as robust and didn't add to the tool
58 | 
--------------------------------------------------------------------------------
/tests/test_pushpull.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | from __future__ import unicode_literals,print_function
4 | 
5 | try:
6 |     from . import testutils
7 | except (ValueError,ImportError):
8 |     import testutils
9 | testutils.add_module() # make sure the tests are importing the NON-installed version
10 | import PyFiSync
11 | 
12 | import os
13 | import sys
14 | import shutil
15 | import itertools
16 | from glob import glob
17 | import re
18 | from pprint import pprint
19 | 
20 | import pytest
21 | 
22 | ## Specify whether to test remotely or locally...or both
23 | # remotes = [False] # Just test locally
24 | # remotes = [False,True]
25 | remotes = [False,'python2','python3']
26 | 
27 | 
28 | @pytest.mark.parametrize("remote,AB,all_", list(itertools.product(remotes,['A','B'],[True,False])))
29 | def test_mod_new_different(remote,AB,all_):
30 |     """ Different file modified on each side. Only one changes. Then add 'all'
31 |     to make sure both have uploaded everything, even what is not modified """
32 |     testpath = os.path.join(os.path.abspath(os.path.split(__file__)[0]),
33 |                             'test_dirs','pp','test_mod_new_different')
34 |     try:
35 |         shutil.rmtree(testpath)
36 |     except:
37 |         pass
38 |     os.makedirs(testpath)
39 |     testutil = testutils.Testutils(testpath=testpath)
40 | 
41 |     # Init
42 |     testutil.write('A/fileA',text='fileA')
43 |     testutil.write('A/fileB',text='fileB')
44 | 
45 |     # copy over
46 |     testutil.copy_tree()
47 | 
48 |     # Start it
49 |     config = testutil.get_config(remote=remote)
50 |     testutil.init(config)
51 | 
52 |     # Apply actions
53 |     testutil.write('A/fileA',text='Aaa',mode='a') # append it
54 |     testutil.write('B/fileB',text='B',mode='a')
55 | 
56 |     testutil.write('A/fileA_new',text='fileA_new')
57 |     testutil.write('B/fileB_new',text='fileB_new')
58 | 
59 |     # Sync
60 |     if AB == 'A':
61 |         mode = 'push'
62 |     else:
63 |         mode = 'pull'
64 | 
65 |     if all_:
66 |         mode += '_all'
67 | 
68 |     testutil.run(config,mode=mode)
69 |     # Check it -- Only need to check A
70 |     diff = testutil.compare_tree()
71 | 
72 |     if all_:
73 |         assert len(diff) == 1
74 |     else:
75 |         assert len(diff) == 2
76 | 
77 |     if AB == 'A':
78 |         assert (u'missing_inA', u'fileB_new') in diff
79 |         if not all_: # This change should be overwritten
80 |             assert ('disagree', 'fileB') in diff
81 |     else:
82 |         assert (u'missing_inB', u'fileA_new') in diff
83 |         if not all_: # This change should be overwritten
84 |             assert ('disagree', 'fileA') in diff
85 | 
86 | 
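# Note: testutil.compare_tree() (see testutils.py) returns a list of
# (status, relative_path) tuples where status is one of 'missing_inA',
# 'missing_inB', or 'disagree'; an empty list means the two trees match.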
87 | @pytest.mark.parametrize("remote,AB,all_", list(itertools.product(remotes,['A','B'],[True,False])))
88 | def test_move_overwrite(remote,AB,all_):
89 |     """ A file move that will overwrite on the receiving end. Check backups """
90 |     testpath = os.path.join(os.path.abspath(os.path.split(__file__)[0]),
91 |                             'test_dirs','pp','test_move_overwrite')
92 |     try:
93 |         shutil.rmtree(testpath)
94 |     except:
95 |         pass
96 |     os.makedirs(testpath)
97 |     testutil = testutils.Testutils(testpath=testpath)
98 | 
99 |     # Init
100 |     testutil.write('A/fileA0',text='fileA0')
101 |     testutil.write('A/fileB0',text='fileB0')
102 | 
103 |     # copy over
104 |     testutil.copy_tree()
105 | 
106 |     # Start it
107 |     config = testutil.get_config(remote=remote)
108 |     testutil.init(config)
109 | 
110 |     # Apply actions
111 |     testutil.write('A/fileA1',text='fileA1')
112 |     testutil.move('A/fileA0','A/fileB1')
113 | 
114 |     testutil.write('B/fileB1',text='fileB1')
115 |     testutil.move('B/fileB0','B/fileA1')
116 | 
117 |     # Sync
118 |     if AB == 'A':
119 |         mode = 'push'
120 |     else:
121 |         mode = 'pull'
122 | 
123 |     if all_:
124 |         mode += '_all'
125 | 
126 |     testutil.run(config,mode=mode)
127 | 
128 |     # Check it -- Only need to check A
129 |     diff = testutil.compare_tree()
130 | 
131 |     if all_:
132 |         assert len(diff) == 0
133 |         # In the end, all files are either moved or overwritten. We do not
134 |         # expect there to be any differences
135 |     elif AB == 'A': # Check backups in B
136 |         assert diff == [('missing_inB', 'fileB0')] # Never gets pushed
137 | 
138 |     elif AB == 'B': # Check backups in B
139 |         assert diff == [('missing_inA', 'fileA0')] # Never gets pulled
140 | 
141 | 
142 | 
143 | 
144 | 
145 | if __name__=='__main__':
146 |     pass
147 | 
148 | 
--------------------------------------------------------------------------------
/rclone.md:
--------------------------------------------------------------------------------
1 | # Rclone
2 | 
3 | ---
4 | **DEPRECATION**
5 | rclone support exists and works but is not optimal. PyFiSync was designed around rsync and is therefore less efficient with rclone.
6 | 
7 | It is *much* better to use [syncrclone](https://github.com/Jwink3101/syncrclone) instead
8 | 
9 | ---
10 | 
11 | (beta)
12 | 
13 | [rclone](https://rclone.org/) is now a supported backend with PyFiSync but there are some important details.
14 | 
15 | First and foremost, **this is a beta feature**. Some aspects have been formally and informally tested but there are a lot more edge cases to consider. And there are a fair number of options that may or may not be needed depending on settings.
16 | 
17 | ## Setup
18 | 
19 | Set up your remote as you want. You can put it in its own config file but be sure to then add the `--config PATH` flag.
20 | 
21 | See the examples.
22 | 
23 | ## Password protected config
24 | 
25 | If you password protect your config, you will need to either store your password in the PyFiSync config (*not* recommended) or you will have to enter it each time. To enter it each time, specify:
26 | 
27 |     # Specify False if config is not encrypted. True means it will prompt
28 |     # for your password and a string specifies the password (not recommended)
29 |     rclone_config_encrypted = True
30 | 
31 | The password is stored in memory only and passed as an environment variable to rclone. This *should* be secure enough but, presumably, an offender can do a memory dump.
32 | 
33 | ## Flags
34 | 
35 | With this backend, you can (and usually *need* to) set some flags. Some of them are listed in the config file and they are often rclone remote dependent. Some of them have been tested but not all.
36 | 
37 | Some common ones:
38 | 
39 | * `--transfers NN`: How many transfers should be done at once
40 | * `--fast-list`: This should be used on all bucket (S3, B2, Swift) backends.
41 | * `--config PATH`: Specify a different path to the config file. This is very useful if you want to keep the config file somewhere else (including with the files synced)
42 | 
43 | These are to be specified as a list!
44 | 
45 | **WARNING**: There is no validation performed on the specified flags. That means that you could specify some options that interfere with the expected behavior of rclone, including links.
46 | 
47 | ## Symlinks
48 | 
49 | The `copy_symlinks_as_links` setting does not work for some remotes. rclone claims to have a workaround but it is inconsistent. See [this github issue](https://github.com/ncw/rclone/issues/3163). This may be fixed in the future.
50 | 
51 | ## Attributes
52 | 
53 | In addition to the default `mtime`, the acceptable attributes are `size` and hashes as described below.
54 | 
55 | 
56 | ### Hashes
57 | 
58 | Some, but not all, rclone remotes support hashes. While on the local side, PyFiSync only supports sha1, the rclone backend will support whatever hash it can. This depends on the [remote]. To specify a hash as a `move_attribute` for rclone, specify it as `hash.SHA-1` (where it must be `SHA-1` since that is what `lsjson` returns). If the remote does not support the specified hash, expect a key error!
59 | 
60 | ## Mod Time
61 | 
62 | rclone remotes *must* support `ModTime` as per the [remote docs][remote]. If it does not, PyFiSync will likely fail and/or cause issues. There is no check to make sure this is the case! It is up to the user.
63 | 
64 | ## Backups
65 | 
66 | Some [remotes][remote] do not natively support moves or even server-side copy. Rclone presents a unified interface to all of these systems so it replicates moves with either download + upload + delete or, if it can, server-side copy + delete. As such, for files that should be backed up (before overwrite or delete) you can instead just download and store the backup locally.
67 | 
68 | ## Tests
69 | 
70 | Many of the sync tests are also tested with rclone. Some, however, are not because they are not expected to pass or because they require some custom change.
71 | 
72 | See Known Issues for a discussion of situations (and failing tests) that rclone cannot handle.
73 | 
74 | ## Other situations
75 | 
76 | ### Missing Hashes
77 | 
78 | In general, a remote supports a certain type of hash and that can be specified. For example B2 supports SHA-1 (attribute `hash.SHA-1`) and S3 supports MD5 (attribute `hash.MD5`). Some remotes (e.g. crypt) do not support any hashes.
79 | 
80 | According to the [rclone docs](https://rclone.org/s3/) not all files have a hash even if the system otherwise supports it.
81 | 
82 | As such, if `imitate_missing_hash = True` then a warning will be printed about the file but the code will imitate a hash by looking at the other metadata (which means it cannot be used for move tracking). Using `imitate_missing_hash = True` with an incorrectly-specified hash (e.g. `hash.SHA1` instead of `hash.SHA-1`) will cause a lot of errors.
83 | 
84 | ## Known Issues
85 | 
86 | * When using rclone mode, folders are essentially ignored. Empty directories will remain. This is not an issue for remotes that do not explicitly view directories
87 | * If a file is deleted and then another is moved into its place, it will view it as a delete and then a new file (which will likely conflict). This is due to only specifying the path as a previous attribute so there is no way to know a file was moved vs deleted. This is tested with `test_file_deleted_replaced_with_move` (which the rclone version would fail) but the issue is replicated in `test_file_replaced_deleted_other_prev_attr`
88 | * Since directories are not a concept in rclone or some remotes, tests dealing with empty directories will fail. See:
89 |     * `test_delete_file_in_folder`, `test_delete_folder`
90 | * Symlinks do not work on some remotes when `copy_symlinks_as_links` is True. See [this github issue](https://github.com/ncw/rclone/issues/3163). A workaround may be included in the future
91 | 
92 | 
93 | 
94 | [remote]:https://rclone.org/overview/
95 | 
--------------------------------------------------------------------------------
/tests/test_file_tracking.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from __future__ import unicode_literals,print_function
3 | 
4 | import pytest #with pytest.raises(ValueError):...
5 | 
6 | try:
7 |     from . import testutils
8 | except (ValueError,ImportError):
9 |     import testutils
10 | testutils.add_module()
11 | 
12 | from PyFiSync import utils,PFSwalk
13 | from PyFiSync import main as PyFiSync # Need to fix
14 | DictTable = PyFiSync.DictTable
15 | 
16 | import os
17 | import sys
18 | import shutil
19 | 
20 | 
21 | def _file_list(path,config=None):
22 |     if config is None:
23 |         config = utils.configparser()
24 |     log = utils.logger(silent=True,path=None)
25 |     _tmp = PFSwalk.file_list(path,config,log)
26 |     return _tmp.files()
27 | 
28 | 
29 | 
30 | def test_untouched():
31 |     """ File is not touched """
32 |     name = 'test_untouched'
33 |     testpath = os.path.join(os.path.abspath(os.path.split(__file__)[0]),
34 |                             'test_dirs','move_tests',name)
35 |     try:
36 |         shutil.rmtree(testpath)
37 |     except:
38 |         pass
39 |     os.makedirs(testpath)
40 |     testutil = testutils.Testutils(testpath=testpath)
41 | 
42 | 
43 |     # Init
44 |     testutil.write('file1.txt',text='test1')
45 | 
46 |     prev_attr = ['ino','path']
47 |     move_attr = ['ino','birthtime']
48 | 
49 |     # Get and inject configs
50 |     config = testutil.get_config()
51 |     PyFiSync.config = config
52 | 
53 |     # old list
54 |     files_old = DictTable(_file_list(testpath,config))
55 | 
56 |     # Apply actions
57 | 
58 | 
59 |     # new list and track
60 |     files_new = DictTable(_file_list(testpath,config))
61 |     PyFiSync.file_track(files_old,files_new,prev_attr,move_attr)
62 | 
63 |     # Check
64 |     assert {'path':'file1.txt','untouched':True,'prev_path':'file1.txt'} in files_new
65 | 
66 | def test_move(): # This used to be test 01
67 |     """ File Moved.
inode and birthtime tracking """ 68 | name = 'test_move' 69 | testpath = os.path.join(os.path.abspath(os.path.split(__file__)[0]), 70 | 'test_dirs','move_tests',name) 71 | try: 72 | shutil.rmtree(testpath) 73 | except: 74 | pass 75 | os.makedirs(testpath) 76 | testutil = testutils.Testutils(testpath=testpath) 77 | 78 | 79 | # Init 80 | testutil.write('file1.txt',text='test1') 81 | 82 | prev_attr = ['ino','path'] 83 | move_attr = ['ino','birthtime'] 84 | 85 | # Get and inject configs 86 | config = testutil.get_config() 87 | PyFiSync.config = config 88 | 89 | # old list 90 | files_old = DictTable(_file_list(testpath,config)) 91 | 92 | # Apply actions 93 | testutil.move('file1.txt','file2.txt') 94 | 95 | # new list and track 96 | files_new = DictTable(_file_list(testpath,config)) 97 | PyFiSync.file_track(files_old,files_new,prev_attr,move_attr) 98 | 99 | # Check 100 | assert {'path':'file2.txt','moved':True,'prev_path':'file1.txt'} in files_new 101 | 102 | @pytest.mark.parametrize("mode", ['birthtime','size']) 103 | def test_move_mod(mode): 104 | """ test modification after move with different modes""" 105 | name = 'test_move_mod_' + mode 106 | testpath = os.path.join(os.path.abspath(os.path.split(__file__)[0]), 107 | 'test_dirs','move_tests',name) 108 | try: 109 | shutil.rmtree(testpath) 110 | except: 111 | pass 112 | os.makedirs(testpath) 113 | testutil = testutils.Testutils(testpath=testpath) 114 | 115 | 116 | # Init 117 | testutil.write('file1.txt',text='test1') 118 | 119 | prev_attr = ['ino','path'] 120 | move_attr = ['ino'] + [mode] 121 | 122 | # Get and inject configs 123 | config = testutil.get_config() 124 | PyFiSync.config = config 125 | 126 | # old list 127 | files_old = DictTable(_file_list(testpath,config)) 128 | 129 | # Apply actions 130 | testutil.move('file1.txt','file2.txt') 131 | testutil.write('file2.txt',text='mod',mode='a') 132 | 133 | # new list and track 134 | files_new = DictTable(_file_list(testpath,config)) 135 | PyFiSync.file_track(files_old,files_new,prev_attr,move_attr) 136 | 137 | # Check 138 | if mode == 'birthtime': 139 | assert {'path':'file2.txt','moved':True,'prev_path':'file1.txt'} in files_new 140 | elif mode == 'size': 141 | assert not {'path':'file2.txt','moved':True,'prev_path':'file1.txt'} in files_new 142 | assert {'path':'file2.txt','new':True,'prev_path':None} in files_new 143 | else: 144 | assert False 145 | 146 | 147 | def test_no_moves(): 148 | """ tests using name as the only attribute""" 149 | name = 'pathonly' 150 | testpath = os.path.join(os.path.abspath(os.path.split(__file__)[0]), 151 | 'test_dirs','move_tests',name) 152 | try: 153 | shutil.rmtree(testpath) 154 | except: 155 | pass 156 | os.makedirs(testpath) 157 | testutil = testutils.Testutils(testpath=testpath) 158 | 159 | 160 | # Init 161 | testutil.write('file1.txt',text='test1') 162 | testutil.write('file2.txt',text='test2') 163 | testutil.write('file3.txt',text='test3') 164 | testutil.write('file4.txt',text='test4') 165 | 166 | prev_attr = ['path'] 167 | move_attr = ['path'] 168 | 169 | # Get and inject configs 170 | config = testutil.get_config() 171 | PyFiSync.config = config 172 | 173 | # old list 174 | files_old = DictTable(_file_list(testpath,config)) 175 | 176 | # Apply actions 177 | testutil.move('file2.txt','file22.txt') 178 | testutil.move('file3.txt','file33.txt') 179 | testutil.write('file3.txt',text='testnew',mode='w') 180 | testutil.remove('file4.txt') 181 | testutil.write('file5.txt',text='test5',mode='w') 182 | 183 | 184 | # new list and track 185 | files_new = 
DictTable(_file_list(testpath,config)) 186 | PyFiSync.file_track(files_old,files_new,prev_attr,move_attr) 187 | 188 | files_old.alwaysReturnList = True 189 | files_new.alwaysReturnList = True 190 | 191 | ## Check 192 | 193 | # Even though 22 and 33 were moves they should show as new 194 | # File5 should also be new (since it really is) 195 | t1db = DictTable( files_new(new=True)) 196 | assert len(t1db) == 3 197 | assert {'path':'file22.txt'} in t1db 198 | assert {'path':'file33.txt'} in t1db 199 | assert {'path':'file5.txt'} in t1db 200 | assert len(list(t1db(new=True))) == 3 201 | 202 | # file 3 should show as being modified and not new 203 | f3 = files_new.query_one(path='file3.txt') 204 | assert not f3['new'] 205 | assert testutil.read('file3.txt') == 'testnew' 206 | 207 | # file2 should be deleted even though moved 208 | assert files_old.query_one(path='file2.txt')['deleted'] 209 | 210 | if __name__=='__main__': 211 | test_no_moves()# 212 | # test_move() 213 | # test_move_mod() 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | -------------------------------------------------------------------------------- /tests/testutils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import unicode_literals,print_function 3 | #from io import open 4 | 5 | import os 6 | import sys 7 | import random 8 | import string 9 | import shutil 10 | from pprint import pprint 11 | from glob import glob 12 | 13 | 14 | def add_module(): 15 | 16 | path = os.path.abspath(__file__) 17 | path = os.path.split(os.path.split(path)[0])[0] # Move up one 18 | sys.path.insert(0,path) 19 | 20 | add_module() 21 | import PyFiSync 22 | import PyFiSync.utils 23 | utils = PyFiSync.utils 24 | 25 | MAX_TIME_MOD = 500; 26 | 27 | class Testutils(object): 28 | def __init__(self,testpath=None): 29 | self.testpath = testpath 30 | 31 | def modtime_all(self): 32 | """ modified all of the times """ 33 | random.seed(4474) 34 | pathA = os.path.join(self.testpath,'A') 35 | pathB = os.path.join(self.testpath,'B') 36 | 37 | for dirpath, dirnames, filenames in os.walk(pathA): 38 | for f in filenames: 39 | change_time(os.path.join(dirpath,f),random.randint(-100*MAX_TIME_MOD,-(MAX_TIME_MOD+2))) 40 | try: 41 | os.makedirs(pathB) 42 | except: 43 | pass 44 | 45 | 46 | def write(self,path,time_adj=None,mode='w',text=None): 47 | """Write or append a file""" 48 | path = os.path.join(self.testpath,path) 49 | directory = os.path.split(path)[0] 50 | try: 51 | os.makedirs(directory) 52 | except OSError: 53 | pass 54 | 55 | if text is None: 56 | text = randstr() 57 | 58 | text += '\n' 59 | 60 | if mode == 'a' and time_adj == 0: 61 | time_adj = 1 62 | 63 | with open(path,mode) as F: 64 | F.write(text) 65 | 66 | if time_adj is None: 67 | change_time(path,random.randint(5,MAX_TIME_MOD)) 68 | elif time_adj != 0: 69 | change_time(path,time_adj) 70 | 71 | 72 | def exists(self,path): 73 | path = os.path.join(self.testpath,path) 74 | return os.path.exists(path) 75 | def move(self,src,dst): 76 | """Move and makedirs if needed""" 77 | src = os.path.join(self.testpath,src) 78 | dst = os.path.join(self.testpath,dst) 79 | directory = os.path.split(dst)[0] 80 | try: 81 | os.makedirs(directory) 82 | except OSError: 83 | pass 84 | 85 | shutil.move(src,dst) 86 | 87 | def remove(self,path): 88 | """remove and makedirs if needed""" 89 | path = os.path.join(self.testpath,path) 90 | if os.path.isfile(path): 91 | os.remove(path) 92 | if os.path.isdir(path): 93 | 
shutil.rmtree(path) 94 | 95 | def read(self,item): 96 | path = os.path.join(self.testpath,item) 97 | 98 | assert os.path.exists(path), "file doesn't exist '%s'" % item 99 | 100 | with open(path) as F: 101 | return F.read().strip() 102 | 103 | def tree(self,path): 104 | files = [] 105 | for dirpath, dirnames, filenames in os.walk(path,followlinks=True): 106 | for d in ['.PyFiSync','.git']: 107 | try: 108 | dirnames.remove(d) 109 | except ValueError: 110 | pass 111 | 112 | files.extend(os.path.join(dirpath,filename) for filename in filenames) 113 | if len(dirnames) == len(filenames) == 0: 114 | files.append(os.path.join(dirpath,'>>EMPTY<<')) 115 | return files 116 | 117 | def compare_tree(self): 118 | """ All file systems are identical""" 119 | result = [] 120 | 121 | pathA = os.path.join(self.testpath,'A') 122 | pathB = os.path.join(self.testpath,'B') 123 | 124 | filesA = [os.path.relpath(f,pathA) for f in self.tree(pathA)] 125 | filesB = [os.path.relpath(f,pathB) for f in self.tree(pathB)] 126 | 127 | filesAB = set(filesA).union(filesB) 128 | for fileAB in sorted(list(filesAB)): 129 | 130 | fileA = os.path.join(self.testpath,'A',fileAB) 131 | fileB = os.path.join(self.testpath,'B',fileAB) 132 | try: 133 | fileAtxt = open(fileA).read() 134 | except IOError: 135 | result.append( ('missing_inA',fileAB) ) 136 | continue 137 | 138 | try: 139 | fileBtxt = open(fileB).read() 140 | except IOError: 141 | result.append( ('missing_inB',fileAB) ) 142 | continue 143 | 144 | if not fileAtxt == fileBtxt: 145 | result.append( ('disagree',fileAB)) 146 | 147 | return result 148 | 149 | def get_config(self,remote=False): 150 | if remote == 'rclone': 151 | config = utils.configparser(remote='rclone') 152 | config.move_attributesB = ['hash.SHA-1'] 153 | else: 154 | config = utils.configparser(remote='rsync') 155 | if remote: 156 | config.userhost = os.environ['USER'] + '@localhost' 157 | 158 | # Specify the full executable to make sure not using an installed copy 159 | if remote == 'python2': 160 | exe = 'python2' 161 | elif remote == 'python3': 162 | exe = 'python3' 163 | pfs_path = os.path.normpath(os.path.join(os.path.dirname(__file__),'..','PyFiSync.py')) 164 | config.remote_exe = '{} {}'.format(exe,pfs_path) 165 | else: 166 | config.userhost = '' 167 | 168 | 169 | config.excludes += ['.DS_Store','.git/','Thumbs.db'] 170 | config.pathA = os.path.join(self.testpath,'A') 171 | config.pathB = os.path.join(self.testpath,'B') 172 | 173 | return config 174 | 175 | def write_config(self,config): 176 | if self.testpath is None: 177 | return 178 | config_path = os.path.join(self.testpath,'A','.PyFiSync','config') 179 | config_file = open(config_path,'w') 180 | 181 | for key,val in config.__dict__.items(): 182 | if key.startswith('_') or key == 'pwprompt': 183 | continue 184 | config_file.write(key + ' = ' ) 185 | pprint(val,stream=config_file) 186 | 187 | config_file.close() 188 | 189 | def init(self,config): 190 | pathA = os.path.join(self.testpath,'A') 191 | pathB = os.path.join(self.testpath,'B') 192 | 193 | PyFiSync.cli(['init',pathA]) 194 | self.write_config(config) 195 | PyFiSync.cli(['reset','--force',pathA]) 196 | PyFiSync.cli(['sync',pathA]) 197 | # At init, every file's mod time was changed to be at least -(MAX_TIME_MOD+2) 198 | # so we do not need to modify the last_run 199 | 200 | 201 | def run(self,config,mode='sync',silent=False,flags=tuple()): 202 | pathA = os.path.join(self.testpath,'A') 203 | pathB = os.path.join(self.testpath,'B') 204 | 205 | self.write_config(config) 206 | if mode == 'sync': 207 
|             cmd = ['sync'] + list(flags) + [pathA]
208 |             PyFiSync.cli(cmd)
209 | 
210 |     def get_log_txt(self,AB='A'):
211 |         log_path = glob(os.path.join(self.testpath,AB,'.PyFiSync','logs','20*.log'))
212 |         log_path.sort()
213 |         with open(log_path[-1]) as l: # latest one
214 |             return l.read()
215 | 
216 | def randstr(N=10):
217 |     return ''.join(random.choice(string.ascii_lowercase+'0123456789') for _ in range(N))
218 | 
219 | def change_time(path,time_adj):
220 |     """ Change the time on a file path"""
221 |     try:
222 |         stat = os.stat(path)
223 |     except OSError:
224 |         print('path {:s} does not exist'.format(path))
225 |         return
226 |     os.utime(path,(stat.st_atime+time_adj,stat.st_mtime+time_adj))
227 | 
228 | 
229 | 
230 | 
231 | 
232 | 
233 | 
234 | 
235 | 
236 | 
237 | 
--------------------------------------------------------------------------------
/PyFiSync/config_template.py:
--------------------------------------------------------------------------------
1 | # PyFiSync Configuration
2 | # This will be evaluated as Python code so indentation matters
3 | #
4 | # Note: 'A' always refers to the local machine and 'B' is the remote;
5 | #       even if the "remote" is a local path
6 | #
7 | # Specify strings with ' ' and lists with [ ... ]
8 | 
9 | # Local Machine
10 | nameA = 'machineA'
11 | 
12 | #
13 | # These settings are for the ssh+rsync remote
14 | remote = 'rsync'
15 | 
16 | # Remote Machine
17 | nameB = 'machineB'
18 | pathB = '/full/path/to/sync/dir'
19 | 
20 | # SSH settings
21 | # Specify the user@host for a remote machine. Leave empty for a local
22 | userhost = ''
23 | ssh_port = 22
24 | 
25 | # Create a persistent master SSH tunnel and multiplex over the connection.
26 | # This works in practice but has not been as thoroughly tested.
27 | persistant = True
28 | 
29 | # Specify the remote executable. If it is installed, it is just 'PyFiSync'.
30 | # Otherwise it may be something like '/path/to/python /path/to/PyFiSync.py'.
31 | # Make sure the paths work via SSH. See the FAQs for details
32 | remote_exe = 'PyFiSync'
33 | #
34 | 
35 | #
36 | 
37 | # These settings are specific to rclone.
38 | remote = 'rclone'
39 | 
40 | # Remote Machine name
41 | nameB = 'machineB'
42 | 
43 | # Specify the path as you would a remote in rclone.
44 | pathB = 'myremote:bucket'
45 | 
46 | # Set the executable. If rclone is installed, should just be the default
47 | rclone_executable = 'rclone'
48 | 
49 | # Specify an rclone config password if one is set. Or specify `pwprompt()` to
50 | # be prompted to enter one on each run. Specify False to ignore.
51 | #
52 | # Alternatively, you can do something like the following: Write the password in
53 | # something like ".PyFiSync/PW.txt" where, by putting it in the .PyFiSync
54 | # directory, it will not be synchronized. Then:
55 | #
56 | #     with open(".PyFiSync/PW.txt",'rt') as file:
57 | #         rclone_pw = file.read().strip()
58 | #
59 | # WARNINGS:
60 | # - Specifying the password in plain text may not be secure if this config file
61 | #   is compromised
62 | # - The password is passed to rclone via environment variables. Alternatively,
63 | #   use --password-command manually with flags. An improved method may be
64 | #   implemented in the future.
65 | rclone_pw = False
66 | 
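# For example (assumption: your installed rclone version supports its
# --password-command flag), the config password can instead be supplied by
# rclone itself rather than through rclone_pw:
#
#     rclone_flags = ['--password-command', 'cat .PyFiSync/PW.txt']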
67 | # Specify some flags to include in all rclone calls. Must be specified as a
68 | # list/tuple/set. These can be used to tune transfers and improve performance.
69 | # Some of them have been tested but not all.
70 | #
71 | # WARNING: There is no validation performed on the specified flags. That means
72 | # that you could specify some options that interfere with the expected
73 | # behavior of rclone, including links.
74 | # Other Examples:
75 | #   `--transfers NN`: How many transfers should be done at once
76 | #   `--fast-list`: This should be used on all bucket (S3, B2, Swift) backends.
77 | #   `--config PATH`: Specify a different path to the config file. This is very
78 | #     useful if you want to keep the config file somewhere else (including with
79 | #     the files synced). Rclone is always evaluated in the root sync directory
80 | #     so the path can be relative to that.
81 | rclone_flags = ['--transfers', '15',
82 |                 '--fast-list',
83 |                 '--checkers', '10']
84 | 
85 | # Some remotes (e.g. Backblaze B2) do not support any server-side move/copy
86 | # operations. As such, moving files is very inefficient as they must
87 | # be downloaded and then re-uploaded. For backups, this is a waste of effort
88 | # so instead, we can *just* backup via a local copy
89 | rclone_backup_local = False
90 | 
91 | # Some remotes (e.g. S3) do not provide hashes for all files (such as those
92 | # uploaded with multi-part). As such PyFiSync can imitate a hash when missing
93 | # based on the other metadata (so it cannot track remote moves). Warning: if
94 | # this is set with an incorrectly specified hash, (a) the screen will fill with
95 | # warnings and (b) no moves will be tracked
96 | imitate_missing_hash = False
97 | 
98 | #
99 | 
100 | # File Settings:
101 | # move_attributes specify which attributes are used to determine a move or previous file.
102 | # Options for local and rsync remote:
103 | #   'path','ino','size','birthtime','mtime','adler','dbhash', PLUS any
104 | #   `hashlib.algorithms_guaranteed`
105 | #
106 | # Options for rclone remotes: 'path','size','mtime', and hashes as noted in the
107 | # readme
108 | #
109 | 
110 | #
111 | # Prev File Suggestions (to determine if a file is new or only its mod time changed):
112 | #   ['ino','path']
113 | # Move Suggestions: (see readme for discussion)
114 | #   macOS: ['ino','birthtime']
115 | #   Linux: ['ino','size'] --OR-- ['ino'] (birthtime isn't available and inodes
116 | #   get reused)
117 | # MUST specify as a list
118 | move_attributesA = ['ino','birthtime']
119 | prev_attributesA = ['ino','path']
120 | 
121 | move_attributesB = ['ino','birthtime'] # ['ino','size'] --OR-- ['ino','sha1']
122 | prev_attributesB = ['ino','path']
123 | #
124 | #
125 | # Prev File Suggestions:
126 | #   ['path']
127 | # Move Suggestions: Note with rclone, there is no advantage to moving an
128 | # also-modified file
129 | #   move_attributesA
130 | #     ['ino','mtime']
131 | #   move_attributesB
132 | #     If hashes are supported: ['hash.SHA-1'] or whatever hash
133 | #     If hashes are not supported: ['path'] # This essentially doesn't track moves
134 | #
135 | # MUST specify as a list
136 | move_attributesA = ['ino','mtime']
137 | prev_attributesA = ['ino','path']
138 | 
139 | move_attributesB = ['path'] # --OR-- ['hash.SHA-1']
140 | prev_attributesB = ['path']
141 | #
142 | 
143 | ## Conflict Settings
144 | move_conflict = 'A' # 'A' or 'B': When moves are conflicting
145 | 
146 | # Modification date conflicts can be resolved as follows:
147 | #   'A','B'     -- Always accept either A or B's copy regardless
148 | #   'both'      -- Tag BOTH files with the respective computer name
149 | #   'newer'     -- Always keep the newer version
150 | #   'newer_tag' -- Keep the newer version untagged and tag the older
151 | mod_conflict = 'both'
152 | mod_resolution = 2.5 # (s) How much time difference is allowed between files
153 | 
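# For example (illustrative names): with mod_conflict = 'both' and machines
# named 'machineA' and 'machineB', a conflicting 'notes.txt' is kept on both
# sides as 'notes.machineA.txt' and 'notes.machineB.txt' -- the tag goes
# before the extension (see the 20210626.0 changelog entry).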
Must specify as a list of 155 | # (attribA,attribB) tuple, See the examples. Note that `mtime` will use the 156 | # mod_resolution time 157 | mod_attributes = [('mtime','mtime')] 158 | 159 | # examples: 160 | # mod_attributes = [('sha1','sha1')] # for rsync remote 161 | # mod_attributes = [('sha1','hash.SHA-1')] # for rclone 162 | # mod_attributes = [('dbhash','hash.DropboxHash')] # dropbox rclone 163 | 164 | 165 | 166 | # Symlinked directories are ALWAYS follow unless excluded. However, if 167 | # copy_symlinks_as_links=False, symlinked files sync their referent (and 168 | # rsync uses `-L`) If True (default), symlinks copy the link itself (a la git) 169 | # 170 | # WARNING1: setting to True with links to files inside the sync root will cause 171 | # issues with tracking 172 | # WARNING2: Recursive symlinks will NOT be caught. 173 | copy_symlinks_as_links = True 174 | 175 | ## Other settings 176 | backup = True # Backup before deletion or overwriting 177 | 178 | # If a file is deleted but a new one is in the same place, do not treat it as 179 | # a delete. Useful when programs overwrite rather than update files. Final 180 | # sync will look the same but this will optimize using rsync on that file 181 | check_new_on_delete = True 182 | 183 | # Set whether or not to create a database of hash values if (and only if) using 184 | # sha1 or adler32 as an attribute. If True (default), the code will not re-hash 185 | # a file unless the path, size, and mtime has changed. This leaves an edge 186 | # case, though rare 187 | use_hash_db = True 188 | 189 | ## Exclusions. 190 | # * If an item ends in `/` it is a folder exclusion 191 | # * If an item starts with `/` it is a full path relative to the root 192 | # * Wildcards and other patterns are accepted 193 | # 194 | # | Pattern | Meaning | 195 | # |----------|------------------------------------| 196 | # | `*` | matches everything | 197 | # | `?` | matches any single character | 198 | # | `[seq]` | matches any character in `seq` | 199 | # | `[!seq]` | matches any character not in `seq` | 200 | # 201 | # Specify as a single list. 202 | # These are suggestions. They can be included if desired 203 | 204 | # excludes = ['.DS_Store','.git/','Thumbs.db'] # Suggested 205 | excludes = [] 206 | 207 | # This sets a specified filename (such as '.PyFiSync_skip') wherein if PyFiSync 208 | # sees this file in a directory, it will exclude it. If the file is found on 209 | # either side, it is applied to *both* sides. 210 | exclude_if_present = '' 211 | 212 | # The following can be used to perform certain tasks pre and post sync. 213 | # Called the root of the syn direcotory (i.e. they start with 214 | # $ cd $PyFiSync_root 215 | # Example uses include cleanup, git push,pull, sync. 216 | pre_sync_bash = '' 217 | post_sync_bash = '' 218 | 219 | 220 | 221 | 222 | -------------------------------------------------------------------------------- /FAQs.md: -------------------------------------------------------------------------------- 1 | (work in progress) 2 | 3 | ## Why is this better than Unison 4 | 5 | Well, it may not be! I have only dabbled in Unison. Unison seems like a great tool but I wanted my own that I could develop, test, and design (within my abilities). I wanted backups-before-overwrite/delete to be a baseline feature and I also wanted to be able to track file moves. 6 | 7 | Plus, this was a great learning tool for python. Developing this was a lot of fun. 
And I am also really happy with [ldtable](https://github.com/Jwink3101/ldtable) which I developed in concert with this. 8 | 9 | ## Are files encrypted? 10 | 11 | Using the rsync mode, the files are encrypted **in transit** via SSH. However, since this is not inherently a server-client model, the files are unencrypted at rest. 12 | 13 | I suggest [Cryptomator](https://cryptomator.org/) for encrypted files as it is cross-platform, doesn't introduce much overhead, and is efficient. It encrypts on a file-by-file basis (with obfuscated names) so changing a file will only require syncing that file (and some ancillary data). Speedups from rsync will not be realized. 14 | 15 | If using only macOS, encrypted disk images can also work well. If using encrypted disk images, I recommend using *sparse* disk image. Sparse images create bands (16mb if I recall correctly) so, while not file-by-file, they are more efficient but less than purely file-by-file. Regular encrypted disk images will, of course, work but *any* change will require syncing the entire thing. These are not recommended. 16 | 17 | Also, if using the rclone remote, you can use a crypt remote. Details are in the [rclone_b2](rclone_b2.md) guide. 18 | 19 | ## I set up SSH keys. Why is it asking me for my key password each time? 20 | 21 | If you set up SSH keys with a password, you still need to unlock the key. If you're in an interactive session (e.g. directly in macOS terminal or the same for Linux), it does this for you (usually). If you're also SSHed in, you may need to start the `ssh-agent`: 22 | 23 | $ ssh-agent 24 | $ ssh-add 25 | 26 | Or, you can put the following in your `.bashrc` and call: 27 | 28 | $ start-ssh-agent 29 | 30 | code: 31 | 32 | ```bash 33 | start-ssh-agent () { 34 | if [ -z "$SSH_AUTH_SOCK" ]; then 35 | eval $(ssh-agent); 36 | ssh-add; 37 | else 38 | echo "SSH agent already running"; 39 | fi 40 | } 41 | ``` 42 | 43 | ## Why am I getting Unicode errors? Did you mess up? 44 | 45 | PyFiSync can handle unicode filenames without a problem. It is likely your terminal encoding. 46 | 47 | On some `openssh` installations on macOS (anecdotally, from `brew`), there seems to be a problem with sending the wrong encoding to the remote terminal which makes it *seem* like there is a unicode error in PyFiSync. This is actually related to sending terminal encoding. See [this](https://askubuntu.com/a/874765) where they had the *opposite* problem. 48 | 49 | The fix is to add the following to `/etc/ssh/ssh_config` or `~/.ssh/config`: 50 | 51 | Host * 52 | SendEnv LANG LC_* # Send locale to remote 53 | 54 | ## How does PyFiSync handle interruptions 55 | 56 | The short answer is: If you run it again, everything *should* be fine. 57 | 58 | The slightly longer answer is that all actions are (more-or-less) safe since backups are made before anything is overwritten and moves are logged. 59 | 60 | The more detailed answer: The code does a few steps when you run it. First, it makes a file list of the current state of each side. Then, from that list and stored list of the last run, it compares all files to see what is a) new (called `prev_attributes` in the config); b) moved (called `move_attributes`); or c) deleted. This is independent of each side and does *not* determine if files are modified\*. The reason this is useful is purely to propagate moves on each side so rsync is more efficient. 
The determination of *what* to transfer happens *after* this independent step and is only based on comparing the mod times (if the file exists on both sides)
61 | 
62 | So, to better answer this question, consider the times an interruption can happen:
63 | 
64 | * **Initial listing and comparison**: No problem. No file has been touched
65 |     * This is also when the file-transfers are determined!
66 | * **Propagate moves, perform deletions and/or backups**: If a file is moved, when PyFiSync is rerun, even without transfer, the system will think the file was moved on both sides regardless of failure. It won't need to do anything. If it is interrupted during "deletion" (which is really just a move to a backup dir), then the file will have been moved and all is good. If it happens during a backup, then you may have extra backup copies. No problem
67 | * **During file transfer**: If a file was successfully transferred, then when it is rerun, they will match mod-time and nothing happens. Otherwise, they will have the same outcome as before. An additional backup may be made, but that doesn't hurt.
68 | * **After transfer, before last file listings**: The last listing is only needed to get up-to-date `inode` numbers, etc. It is stored for the next run for moves and tracking. Therefore, rerunning it will not have any moves or deletions to propagate so nothing bad will happen.
69 | 
70 | One additional case is if you didn't realize it failed and run again later. In this case, the following would be the worst outcomes:
71 | 
72 | * A file delete will not propagate and may be restored from the other side. No real harm here!
73 | * A file that *could* have been a move will actually end up being treated as a delete + new file. Some extra bandwidth but otherwise harmless.
74 | 
75 | While I think I thought through all of these conditions, I may have missed something. I personally run extensive backups (since, even though this backs up prior to delete or overwrite, it is a *sync* code, not a *backup*). I cannot recall a time I had to dig into them because my code failed. And, except in the early days, I do not remember having to manually unwind anything. Even then, it performs safe operations but I have since figured out the missed edge case, handled it, and wrote a test!
76 | 
77 | I hope this clears it up a bit!
78 | 
79 | \* The original plan had it determine transfers from its own previous state but I moved to comparing the two sides since it was robust to a) failure and b) deletion of the stored file lists
80 | 
81 | ## Does this support Windows
82 | 
83 | No. The remote file listing is handled via an SSH call and the transfers are via rsync. It *may* work in the Windows Subsystem for Linux but it has not been tested.
84 | 
85 | Also, I suspect that file tracking will be less robust since there are no inode numbers. SHA1 should work to track moves but that adds a lot of overhead
86 | 
87 | ## Why can't I sync two rclone remotes
88 | 
89 | This tool was built with rsync in mind and, in particular, syncing your local file system to a remote machine. The infrastructure was designed to be flexible for the remote only. With that said, I suspect I *could* make it work to handle two rclone remotes but I don't have the need. If there is interest, I may re-evaluate.
90 | 
91 | ## Why use `ldtable` instead of something like SQLite
92 | 
93 | The simple answer is, at the time, I didn't know much SQL. And building out `ldtable` was a lot of fun. It is very useful for in-memory data queries. The more complex answer is that `ldtable` is much easier.
Since I do not know all attributes until PyFiSync is instantiated, I would need a variable schema. And since I may or may not query on different combinations of attributes, I would need many table indices.
94 | 
95 | Also, `ldtable` has proven to be sufficiently performant, even on my 60,000 item (~200GB) photo collection. The database is well within memory constraints. I may consider SQLite in the future though.
96 | 
97 | ## When should I use rsync+SSH vs rclone?
98 | 
99 | First of all, if you want *anything* other than ssh or local, you *need* to use rclone. But if you were planning to use SFTP in rclone, use the ssh+rsync mode instead!
100 | 
101 | If you are interested in SSH-based remotes, you are almost universally better off using the rsync+SSH mode. First and foremost, rclone does not support any kind of transfer deduplication while rsync is built around it! With rsync, if only a small part of the file changes, only those changes (plus some overhead) are transferred.
102 | 
103 | Furthermore, in ssh+rsync mode, you can have things like a hash database (if set and using hashes) to greatly speed things up. And all operations are done over a persistent tunnel.
104 | 
105 | ## I installed PyFiSync on the remote computer but I am getting an error.
106 | 
107 | If you are getting a `bash: PyFiSync: command not found` error, it is likely because the remote machine either doesn't have PyFiSync installed or there is an issue with your `.bashrc` and/or `.bash_profile`.
108 | 
109 | If your paths are set up in `.bash_profile`, move them to `.bashrc` and add the following to `.bash_profile`:
110 | ```bash
111 | if [ -f ~/.bashrc ]; then
112 |     . ~/.bashrc
113 | fi
114 | ```
115 | In addition, some versions of linux add the following line(s) to the `.bashrc`:
116 | ```bash
117 | # If not running interactively, don't do anything
118 | [ -z "$PS1" ] && return
119 | ```
120 | You need to comment out that second line or still set up paths.
121 | 
122 | Finally, you can instead specify the full path in the config file with `remote_exe`
123 | 
124 | ## Why do you still support Python 2?
125 | 
126 | When I first wrote PyFiSync, I didn't use Python 3! Part of that was because at work, I was limited to Python 2 and part of it was just that I learned on Python 2. However, when I made the transition to Python 3 for my own stuff, I made PyFiSync compatible with both.
127 | 
128 | At this point, it is pretty easy to maintain that. If in the future I want to use a Python 3 only feature, I will probably just drop Python 2.
129 | 
130 | 
131 | 
132 | 
133 | 
134 | 
135 | 
136 | 
137 | 
138 | 
139 | 
140 | 
141 | 
142 | 
--------------------------------------------------------------------------------
/rclone_b2.md:
--------------------------------------------------------------------------------
1 | # Rclone with B2 (and S3)
2 | 
3 | (See the bottom for notes on using S3 instead of B2)
4 | 
5 | B2 is a very inexpensive storage platform that is great for syncing. It does have a few limitations, notably the lack of server-side copy. As such, any "moves" are actually download+upload+delete. (This is no longer the case for newer rclone versions with the latest B2 API. Therefore, some of these steps may not be necessary).
6 | 
7 | As such, we make two major changes to our configuration:
8 | 
9 | 1. Download backups locally rather than move them to the backup dir
10 |     * Any remote backups aren't strictly needed since the buckets can be set to back up, but we will keep this here for consistency
11 | 2. Do not do any file moves!
We *could* let rclone handle the moves but it is less transparent. See below when using S3 because that is not the best setting 12 | 13 | Finally, another limitation as noted in the readmes relates to deleting a file and then renaming another in its place. 14 | 15 | See notes at the end for the differences with S3 setup 16 | 17 | ## Plan 18 | 19 | We will use the *same* bucket to set up two different PyFiSync repositores. The first will be unencrypted and the second will be encrypted. We will have to adjust the settings accordingly. 20 | 21 | It is a good idea to use encryption if you do not trust the remote or want extra protection. In this example, we also password protect the config file but that is really *not* needed on your local machine. 22 | 23 | ## Rclone Setup 24 | 25 | ### Preliminaries 26 | 27 | Assume your local machine directory is `/path/to/PFS/` 28 | 29 | $ cd /path/to/PFS/ 30 | 31 | And that you have a bucket called `MYBUCKET` 32 | 33 | ### Set up B2 base version 34 | 35 | Create a local config file 36 | 37 | $ rclone config --config rclone.cfg 38 | 39 | Create a new B2 remote 40 | 41 | ```rclone 42 | No remotes found - make a new one 43 | n) New remote 44 | s) Set configuration password 45 | q) Quit config 46 | n/s/q> n 47 | 48 | name> b2base 49 | 50 | Type of storage to configure. 51 | Storage> b2 52 | 53 | [...] 54 | ``` 55 | 56 | The final config should look something like 57 | 58 | ```rclone 59 | [b2base] 60 | type = b2 61 | account = **ACCOUNT** 62 | key = **KEY** 63 | ``` 64 | 65 | ### Set up rclone encrypted 66 | 67 | We *could* set up the encrypted in the same config file but we will later encrypt it. Since we also want the non-encrypted, it is easier to make a *copy*. Note that you will have to make changes in both if you change a setting 68 | 69 | It is **NOT vital** to encrypt the encrypted B2 config since you probably trust your own computer. If you do not want to encrypt that then there is no reason to make a copy and you can skip the relevant sections 70 | 71 | Follow the following 72 | 73 | $ cp rclone.cfg rcloneCRYPT.cfg 74 | $ rclone config --config rcloneCRYPT.cfg 75 | 76 | Then 77 | 78 | ``` 79 | n) New remote 80 | d) Delete remote 81 | r) Rename remote 82 | c) Copy remote 83 | s) Set configuration password 84 | q) Quit config 85 | e/n/d/r/c/s/q> n 86 | 87 | name> b2crypt 88 | 89 | Storage> crypt 90 | 91 | remote> b2base:MYBUCKET/crypt 92 | 93 | filename_encryption> standard 94 | 95 | directory_name_encryption> true 96 | 97 | Password or pass phrase for encryption. 98 | y) Yes type in my own password 99 | g) Generate random password 100 | n) No leave this optional password blank 101 | y/g/n> g 102 | 103 | Bits> 256 104 | 105 | y/n> y 106 | 107 | Password or pass phrase for salt. Optional but recommended. 108 | Should be different to the previous password. 109 | y) Yes type in my own password 110 | g) Generate random password 111 | n) No leave this optional password blank 112 | y/g/n> g 113 | 114 | Bits> 256 115 | 116 | y/n> y 117 | ``` 118 | 119 | Now your config should look something like the following: 120 | 121 | ``` 122 | [b2base] 123 | type = b2 124 | account = **ACCOUNT** 125 | key = **KEY** 126 | 127 | [b2crypt] 128 | type = crypt 129 | remote = b2base:MYBUCKET/crypt 130 | filename_encryption = standard 131 | directory_name_encryption = true 132 | password = **PW2** 133 | password2 = **PW** 134 | ``` 135 | 136 | But you will want to encrypt that! 
123 |
124 | ## Why do you still support Python 2?
125 |
126 | When I first wrote PyFiSync, I didn't use Python 3! Part of that was because, at work, I was limited to Python 2, and part of it was just that I learned on Python 2. However, when I made the transition to Python 3 for my own stuff, I made PyFiSync compatible with both.
127 |
128 | At this point, it is pretty easy to maintain that. If, in the future, I want to use a Python 3-only feature, I will probably just drop Python 2.
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
--------------------------------------------------------------------------------
/rclone_b2.md:
--------------------------------------------------------------------------------
1 | # Rclone with B2 (and S3)
2 |
3 | (See the bottom for notes on using S3 instead of B2)
4 |
5 | B2 is a very inexpensive storage platform that is great for syncing. It does have a few limitations, notably the lack of server-side copy. As such, any "moves" are actually download+upload+delete. (This is no longer the case for newer rclone versions with the latest B2 API. Therefore, some of these steps may not be necessary).
6 |
7 | As such, we make two major changes to our configuration:
8 |
9 | 1. Download backups locally rather than move them to the backup dir
10 |     * Remote backups aren't strictly needed since the buckets can be set to keep old versions, but we will keep this here for consistency
11 | 2. Do not do any file moves! We *could* let rclone handle the moves but it is less transparent. See the S3 notes below, where this is not the best setting.
12 |
13 | Finally, another limitation, as noted in the readmes, relates to deleting a file and then renaming another in its place.
14 |
15 | See the notes at the end for the differences with the S3 setup.
16 |
17 | ## Plan
18 |
19 | We will use the *same* bucket to set up two different PyFiSync repositories. The first will be unencrypted and the second will be encrypted. We will have to adjust the settings accordingly.
20 |
21 | It is a good idea to use encryption if you do not trust the remote or want extra protection. In this example, we also password protect the config file but that is really *not* needed on your local machine.
22 |
23 | ## Rclone Setup
24 |
25 | ### Preliminaries
26 |
27 | Assume your local machine directory is `/path/to/PFS/`
28 |
29 |     $ cd /path/to/PFS/
30 |
31 | And that you have a bucket called `MYBUCKET`
32 |
33 | ### Set up B2 base version
34 |
35 | Create a local config file
36 |
37 |     $ rclone config --config rclone.cfg
38 |
39 | Create a new B2 remote
40 |
41 | ```rclone
42 | No remotes found - make a new one
43 | n) New remote
44 | s) Set configuration password
45 | q) Quit config
46 | n/s/q> n
47 |
48 | name> b2base
49 |
50 | Type of storage to configure.
51 | Storage> b2
52 |
53 | [...]
54 | ```
55 |
56 | The final config should look something like
57 |
58 | ```rclone
59 | [b2base]
60 | type = b2
61 | account = **ACCOUNT**
62 | key = **KEY**
63 | ```
64 |
65 | ### Set up rclone encrypted
66 |
67 | We *could* set up the encrypted remote in the same config file, but we will later encrypt that file. Since we also want the non-encrypted remote, it is easier to make a *copy*. Note that you will have to make changes in both files if you change a setting.
68 |
69 | It is **NOT vital** to encrypt the rclone config since you probably trust your own computer. If you do not want to encrypt it, then there is no reason to make a copy and you can skip the relevant sections.
70 |
71 | Follow these steps:
72 |
73 |     $ cp rclone.cfg rcloneCRYPT.cfg
74 |     $ rclone config --config rcloneCRYPT.cfg
75 |
76 | Then
77 |
78 | ```
79 | n) New remote
80 | d) Delete remote
81 | r) Rename remote
82 | c) Copy remote
83 | s) Set configuration password
84 | q) Quit config
85 | e/n/d/r/c/s/q> n
86 |
87 | name> b2crypt
88 |
89 | Storage> crypt
90 |
91 | remote> b2base:MYBUCKET/crypt
92 |
93 | filename_encryption> standard
94 |
95 | directory_name_encryption> true
96 |
97 | Password or pass phrase for encryption.
98 | y) Yes type in my own password
99 | g) Generate random password
100 | n) No leave this optional password blank
101 | y/g/n> g
102 |
103 | Bits> 256
104 |
105 | y/n> y
106 |
107 | Password or pass phrase for salt. Optional but recommended.
108 | Should be different to the previous password.
109 | y) Yes type in my own password
110 | g) Generate random password
111 | n) No leave this optional password blank
112 | y/g/n> g
113 |
114 | Bits> 256
115 |
116 | y/n> y
117 | ```
118 |
119 | Now your config should look something like the following:
120 |
121 | ```
122 | [b2base]
123 | type = b2
124 | account = **ACCOUNT**
125 | key = **KEY**
126 |
127 | [b2crypt]
128 | type = crypt
129 | remote = b2base:MYBUCKET/crypt
130 | filename_encryption = standard
131 | directory_name_encryption = true
132 | password = **PW2**
133 | password2 = **PW**
134 | ```
135 |
136 | But you will want to encrypt that!
137 |
138 |     $ rclone config --config rcloneCRYPT.cfg
139 |
140 | ```
141 | Name      Type
142 | ====      ====
143 | b2base    b2
144 | b2crypt   crypt
145 |
146 | e) Edit existing remote
147 | n) New remote
148 | d) Delete remote
149 | r) Rename remote
150 | c) Copy remote
151 | s) Set configuration password
152 | q) Quit config
153 | e/n/d/r/c/s/q> s
154 |
155 | a/q> a
156 |
157 | **enter password**
158 |
159 | Your configuration is encrypted.
160 | c) Change Password
161 | u) Unencrypt configuration
162 | q) Quit to main menu
163 | c/u/q> q
164 |
165 | e/n/d/r/c/s/q> q
166 | ```
167 |
168 | Now you have two configuration files for rclone. Again, you could do this with a single file, but it is nicer to not have to enter your password for the unencrypted remote while still keeping the config encrypted for the other.
169 |
170 | **BACKUP** the config file since, if you lose these machine-generated passwords, you will lose access to your files.
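As a quick sanity check (not part of the original walkthrough, but handy), you can list the remotes with rclone directly; with the encrypted config, rclone will prompt for the configuration password:

    $ rclone --config rclone.cfg lsd b2base:MYBUCKET
    $ rclone --config rcloneCRYPT.cfg ls b2crypt: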
171 |
172 | ## Set up PyFiSync
173 |
174 | ### Unencrypted
175 |
176 | These aren't *all* of the settings, but they should give the general idea
177 |
178 |     $ cd /path/to/PFS/
179 |     $ mkdir reg
180 |     $ cd reg
181 |     $ PyFiSync init --remote rclone
182 |
183 | Now edit the config. Again, this is not *all* of it. The comments are **not** the default comments from the documentation; they are my notes
184 |
185 | ```python
186 | pathB = 'b2base:MYBUCKET/reg'
187 |
188 | rclone_pw = False
189 |
190 | # B2 should use fast-list to reduce API calls. The rclone docs
191 | # (https://rclone.org/b2/) suggest 32 transfers. The config flag is needed to
192 | # specify where the configuration file we created above lives. Note that
193 | # rclone is always executed in the sync directory
194 |
195 | rclone_flags = ['--transfers', '32',
196 |                 '--fast-list',
197 |                 '--checkers','10',
198 |                 '--config','../rclone.cfg']
199 |
200 | # Do backups to the local machine since we can't move files with B2
201 | rclone_backup_local = True
202 |
203 | # One option is to never track a move on B2 (note that for S3 this is NOT the case):
204 | # move_attributesB = ['ino','path']
205 | # prev_attributesB = ['ino','path']
206 |
207 | # Here we instead use B2's SHA-1 support (this assignment is the one that takes effect):
208 | move_attributesB = ['hash.SHA-1']
209 | prev_attributesB = ['path']
210 |
211 | # Rclone does not like symlinks with B2 and their workaround appears broken
212 | # as of writing. See https://github.com/ncw/rclone/issues/3163
213 | copy_symlinks_as_links = False
214 |
215 | ```
216 |
217 | Notes:
218 |
219 | * We didn't specify it, but you may want to add the flag `--b2-hard-delete` since we are doing backups.
220 |
221 | Set up and then add some files
222 |
223 |     $ PyFiSync reset --force
224 |
225 | ...add some files and test
226 |
227 | ### Encrypted
228 |
229 | Most of this is the same but we will reproduce it all just to be sure
230 |
231 |     $ cd /path/to/PFS/
232 |     $ mkdir crypt
233 |     $ cd crypt
234 |     $ PyFiSync init --remote rclone
235 |
236 | Now edit the config. Again, this is not *all* of it. The comments are **not** the default comments from the documentation; they are my notes
237 |
238 | ```python
239 | pathB = 'b2crypt'
240 |
241 | # Make it ask each time. You can also either enter the password
242 | # here or choose to not encrypt the rclone config.
243 | rclone_pw = pwprompt()
244 |
245 | # B2 should use fast-list to reduce API calls. The rclone docs
246 | # (https://rclone.org/b2/) suggest 32 transfers. The config flag is needed to
247 | # specify where the configuration file we created above lives. Note that
248 | # rclone is always executed in the sync directory
249 |
250 | rclone_flags = ['--transfers', '32',
251 |                 '--fast-list',
252 |                 '--checkers','10',
253 |                 '--config','../rcloneCRYPT.cfg']
254 |
255 | # Do backups to the local machine since we can't move files with B2
256 | rclone_backup_local = True
257 |
258 | # As noted above, we only want to do a "move" when the file is unmodified
259 | # since, unlike rsync, rclone cannot make use of existing data
260 | move_attributesA = ['ino','mtime']
261 | prev_attributesA = ['ino','path']
262 |
263 | # Crypt does not support any hashes. No remote move tracking!
264 | # (which means you may redownload when not needed)
265 | move_attributesB = ['path']
266 | prev_attributesB = ['path']
267 |
268 | # Rclone does not like symlinks with B2 and their workaround appears broken
269 | # as of writing. See https://github.com/ncw/rclone/issues/3163
270 | copy_symlinks_as_links = False
271 |
272 | ```
273 |
274 | Notes:
275 |
276 | * We didn't specify it, but you may want to add the flag `--b2-hard-delete` since we are doing backups.
277 |
278 | Set up and then add some files
279 |
280 |     $ PyFiSync reset --force
281 |
282 | Now you should be good to go! You will get some "untracked file" warnings on the first sync for files that are not yet on both sides.
283 |
284 | ## S3 Notes
285 |
286 | The same process can be used for S3-based backends with a few changes.
287 |
288 | * S3 supports server-side copy so you *can* worry less about doing backups on the remote side. However, rclone still cannot make use of existing data, so there is still no reason to "move" a modified file.
289 | * Since S3 supports server-side copy, it also behooves us to track moves. Comment out the restrictive *local* settings from the config above:
290 |
291 |         # As noted above, we only want to do a "move" when the file is unmodified
292 |         # since, unlike rsync, rclone cannot make use of existing data
293 |         # move_attributesA = ['ino','mtime']
294 |         # prev_attributesA = ['ino','path']
295 |
296 | * S3 supports MD5 hashes, not SHA-1, so use `hash.MD5` rather than `hash.SHA-1` in the attributes above.
297 |
298 |
299 |
300 |
301 |
302 |
303 |
304 |
305 |
306 |
307 |
308 |
309 |
310 |
311 |
312 |
313 |
314 |
315 |
316 |
317 |
318 |
319 |
320 |
321 |
322 |
323 |
324 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PyFiSync
2 |
3 | Python (+ rsync or rclone) based intelligent file sync with automatic backups and file move/delete tracking.
4 |
5 | ## Features
6 |
7 | * Robust tracking of file moves
8 |     * Especially powerful on macOS, but works well enough on Linux.
9 | * rsync Mode:
10 |     * Works out of the box with Python (tested on 2.7 and 3.5+) for rsync
11 |     * Works over SSH for secure and easy connections
12 |     * Uses rsync for actual file transfers to save bandwidth and make use of existing file data
13 | * [rclone][rclone] mode: (beta!)
14 |     * Can connect to a wide variety of cloud services and offers encryption
15 |     * Note that rclone is still supported and works but **it is better to use** [syncrclone](https://github.com/Jwink3101/syncrclone) instead.
16 |     * rclone support may be deprecated in the future!
17 | * Extensively tested for a **huge** variety of edge cases
18 |
19 | ## Details
20 |
21 | PyFiSync uses a small database of files from the last sync to track moves and deletions (based on configurable attributes such as inode numbers, SHA-1 hashes, and/or create time). It then compares `mtime` from both sides on all files to decide on transfers.
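As a conceptual sketch (not the actual implementation), detecting a move amounts to matching an entry from the previous file list against a new entry on the configured move attributes while the path differs:

```python
# Hypothetical entries from the previous and current file lists
prev = {'path': 'photos/old_name.jpg', 'ino': 101, 'birthtime': 1.5e9}
curr = {'path': 'photos/new_name.jpg', 'ino': 101, 'birthtime': 1.5e9}

move_attributes = ['ino', 'birthtime']  # e.g. the macOS suggestion below

if (all(prev[a] == curr[a] for a in move_attributes)
        and prev['path'] != curr['path']):
    print('move detected:', prev['path'], '->', curr['path'])
```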
22 |
23 | ### Backups
24 |
25 | By default, any time a file is to be overwritten or deleted, it is backed up on that machine first. No distinction is made in the backup for overwrite vs delete.
26 |
27 | ### Attributes
28 |
29 | Moves and deletions are tracked via the attributes described below.
30 |
31 | Move attributes are used to track whether a file has moved, while the `prev_attributes` are used to determine whether a file is the same as before.
32 |
33 | Note: On HFS+ (and maybe APFS?), macOS's file system, inodes are not reused quickly. On ext3 (Linux) they are recycled rapidly, leading to issues when files are deleted and new ones are made. Do not use inodes alone on such systems.
34 |
35 | #### Common attributes
36 |
37 | * `path` -- This essentially means that moves are not tracked. If a file has the same name, it is considered the same file
38 | * `size` -- File size. Do not use alone. Also, this attribute means that the file may not change between moves. See examples below
39 | * `mtime` -- When the file was modified. Use with `ino` to track files
40 |
41 | #### rsync and local attributes
42 |
43 | Attributes for the local machine and an rsync remote
44 |
45 | * `ino` (inode number) -- Track the filesystem inode number. May be safely used alone on HFS+ but not on ext3 since it reuses inodes. In that case, use with another attribute
46 | * hashes -- Very robust for tracking file moves but, like `size`, requires that the file not change. Also, slow to calculate (though, by default, they are not recalculated on every sync). Options:
47 |     * `adler` -- Fast but less secure
48 |     * `dbhash` -- Used for Dropbox. Useful if comparing on hash
49 |     * any `hashlib.algorithms_guaranteed`: `sha384`,`sha3_224`,`sha3_512`,`md5`,`sha512`,`sha3_256`,`blake2b`,`sha3_384`,`shake_128`,`blake2s`,`sha256`,`shake_256`,`sha1`,`sha224`
50 | * `birthtime` -- Use the file create time. This does not exist on some Linux machines or some Python implementations (PyPy), and/or is unreliable
51 |
52 | #### rclone attributes
53 |
54 | * `hash.HASH` -- Use a hash from rclone. Depends on which hashes are available.
55 |
56 | #### Suggested move Attribute Combinations
57 |
58 | For rsync:
59 |
60 | * On macOS, the following is suggested: `[ino,birthtime]`
61 | * On Linux, the following is suggested: `[ino,mtime]`
62 |     * This means that **moved files should not be modified** on that side of the sync.
63 |
64 | #### Hashes
65 |
66 | As noted, any `hashlib.algorithms_guaranteed` is supported for rsync mode and the local machine. In order to save time, a database of previously computed hashes is kept. This can be turned off in the config, forcing all of the files to be read and hashed again.
67 |
68 |
69 | ### Empty Directories
70 |
71 | PyFiSync syncs files and therefore will *not* sync empty directories from one machine to the other. However, if, and only if, a directory is *made* empty by the sync, it will be deleted. That includes nested directories. In rclone mode, empty directories are not handled at all by PyFiSync.
72 |
73 | ## Install
74 |
75 | There are *no dependencies!* (for rsync). Everything is included in the package (though `ldtable`, now `DictTable`, is also separately developed [here](https://github.com/Jwink3101/ldtable))
76 |
77 | To install:
78 |
79 |     $ python -m pip install git+https://github.com/Jwink3101/PyFiSync
80 |
81 | Or download the zip file and run
82 |
83 |     $ python setup.py install
84 |
85 | If using the rclone remote (see setup below), you must also install rclone.
86 |
87 | Note: On the remote machine, the path to PyFiSync must be findable via SSH. For example, if your Python is from (Ana/Mini)conda, then it places the paths into the `.bash_profile`. Move the paths to `.bashrc` so that PyFiSync can be found.
88 |
89 | Alternatively, specify `remote_exe`.
90 |
91 | ## Setup
92 |
93 | See [rsync](rsync.md) for setup of the default mode. PyFiSync must be installed on both machines (or the Python scripts must be there and configured).
94 |
95 | Setting up rclone is a bit more involved since you must set up an appropriate rclone remote. See the [rclone readme](rclone.md) for general details and [rclone\_b2](rclone_b2.md) for a detailed walkthrough of setting up with B2 (and S3 with small noted changes).
96 |
97 | To initiate an rclone-based repo, do
98 |
99 |     $ PyFiSync init --remote rclone
100 |
101 | ## Settings
102 |
103 | There are many settings, all documented in the config file written after an `init`. Here are a few:
104 |
105 | ### Exclusions
106 |
107 | Exclusion naming is done in such a way that it replicates a *subset* of `rsync` exclusions. That is, the following patterns are what **this** code follows. `rsync` has its own exclusion engine which is more advanced but should behave similarly.
108 |
109 | * If an item ends in `/` it is a folder exclusion
110 | * If an item starts with `/` it is a full path relative to the root
111 | * Wildcards and other patterns are accepted
112 |
113 | | Pattern  | Meaning                            |
114 | |----------|------------------------------------|
115 | | `*`      | matches everything                 |
116 | | `?`      | matches any single character       |
117 | | `[seq]`  | matches any character in `seq`     |
118 | | `[!seq]` | matches any character not in `seq` |
119 |
120 | Examples:
121 |
122 | * Exclude **all** git directories: `.git/`
123 | * Exclude a specific folder: `/path/to/folder/` (where `/` is the start of the sync directory)
124 | * Exclude all files that start with `file`: `file*`
125 | * Exclude all files that start with `file` in a specific directory: `/path/to/file*`
126 |
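Since the matching is implemented with Python's `fnmatch` (see `PFSwalk.py`), you can test a pattern directly; a small illustration:

```python
import fnmatch

fnmatch.fnmatch('file_v2.txt', 'file*')  # True
fnmatch.fnmatch('myfile.txt', 'file*')   # False: must match from the start
fnmatch.fnmatch('a.txt', '[!b]*')        # True: first character is not 'b'
```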
127 |
128 | #### Exclude if Present
129 |
130 | PyFiSync allows for the exclusion of a directory due to the presence of a specified file name (the contents of the file do not matter, only its presence).
131 |
132 | Unlike regular exclusions, which halt traversing deeper into an excluded directory tree, `exclude_if_present` is a filter applied after the fact. This approach is safer as adding an exclusion file on one side will not cause a delete to be incorrectly propagated. It does come at a small performance penalty as the excluded directory is still initially traversed.
133 |
134 | ### Symlinks
135 |
136 | First note that **all directory links are followed** regardless of setting. Use exclusions to avoid syncing a linked directory.
137 |
138 | If `copy_symlinks_as_links=False`, symlinked files sync their referent (and rsync uses `-L`). If `True` (default), symlinks copy the link itself (a la how git works).
139 |
140 | WARNINGS:
141 |
142 | * If `copy_symlinks_as_links = False` and there are symlinked files pointing to other files IN the sync root, there will be issues with the file tracking. Do not do this!
143 | * As also noted in Python's documentation, there is no **safeguard against recursively symlinked directories**.
144 | * rsync may throw warnings for broken links
145 | * rclone's support of symlinks is unreliable at the moment.
146 |
147 |
148 | ### Pre and Post Bash
149 |
150 | There is also the option to add bash scripts to run pre and post sync. These may be useful if you wish to do a git push, pull, etc., either remotely or locally.
151 |
152 | They are ALWAYS executed from the sync root (a `cd /path/to/syncroot` is inserted at the top of the script).
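For instance, a hypothetical post-sync script (adapt to your own workflow) could commit whatever the sync changed:

```bash
# Runs from the sync root; `|| true` keeps an empty commit from failing the script
git add -A
git commit -m "post-sync auto-commit" || true
git push
```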
153 |
154 | ## Running Tests
155 |
156 | To run the tests, in bash, do:
157 |
158 |     $ source run_test.sh
159 |
160 | In addition to testing a whole slew of edge cases, it will also test all actions on a local sync, and remote syncs to both Python 2 and Python 3 (via `ssh localhost`). The run script will try to call `py.test` for both versions of Python locally.
161 |
162 | ## Known Issues and Limitations
163 |
164 | The test suite is **extremely** extensive so as to cover tons of different and difficult scenarios. See the tests for further exploration of how the code handles these cases. Please note that unless explicitly disabled in the config or with a command-line flag, all deletions and (future) overwrites first perform a backup. Moves are not backed up but may likely be unwound from the logs.
165 |
166 | A few notable limitations are as follows:
167 |
168 | * Symlinks are followed (optionally) but if the file they are linking to is also in the sync folder, it may confuse the move tracking
169 | * File move tracking:
170 |     * A file moved to a new name that is excluded will propagate as deleted. This is expected since the code no longer has a way to "see" the file on the one side.
171 |     * A file that is moved on one side and deleted on the other will NOT have the deletion propagated, regardless of modification
172 | * Sync is based on modification-time metadata. This is fairly robust but could still have issues. In rsync mode, even if PyFiSync decides to sync the files, rsync may end up just updating the metadata. In that case, you may just want to disable backups. With rclone, it depends on the remote and care should be taken.
173 |
174 | There is also a potential issue with the test suite. In order to ensure that the files are noted as changed (since they are all modified so quickly), the times are often adjusted by some random amount. There is a *small* chance some tests could fail due to a small number not changing. Running the tests again should pass.
175 |
176 | See the [rclone readme](rclone.md) for some rclone-related known issues
177 |
178 | ## Other Questions
179 |
180 | See the (growing) [FAQ](FAQs.md) for some more details and/or troubleshooting
181 |
182 | [rclone]:https://rclone.org/
183 |
184 |
185 |
--------------------------------------------------------------------------------
/PyFiSync/PFSwalk.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | """
4 | Main tool for walking a directory that is tuned to PyFiSync's needs and
5 | uses scandir if it can!
6 | """
7 |
8 | from __future__ import division, print_function, unicode_literals
9 | from io import open
10 |
11 | import sys
12 | import os
13 | import fnmatch
14 | import subprocess
15 | import json
16 | import time
17 |
18 | try:
19 |     from os import scandir as _scandir
20 | except ImportError:
21 |     try:
22 |         from scandir import scandir as _scandir
23 |     except ImportError:
24 |         _scandir = None
25 |
26 | try:
27 |     from itertools import imap as map
28 |     from itertools import izip as zip
29 | except ImportError: # python 3
30 |     pass
31 |
32 | from itertools import repeat
33 | from functools import partial
34 |
35 | from .
import utils 36 | from .dicttable import DictTable 37 | 38 | def fnmatch_mult(name,patterns): 39 | """ 40 | will return True if name matches any patterns 41 | """ 42 | return any(fnmatch.fnmatch(name,pat) for pat in patterns) 43 | 44 | 45 | class file_list: 46 | def __init__(self,path,config,log, 47 | attributes=(), 48 | empty='store',use_hash_db=True): 49 | """ 50 | Main interface to the walk. the final list will 51 | be in the attribute file_list. 52 | 53 | Empty: 54 | 'store': stores a list of empty directories 55 | 'remove': Deletes all empty directories if (and only if) they 56 | were *not* empty before. Also removes stored list 57 | 'reset': Removes stored list 58 | """ 59 | self.path = path 60 | self.config = config 61 | self.log = log 62 | self.attributes = attributes 63 | self.use_hash_db = use_hash_db 64 | 65 | self.empty = empty 66 | self.empties = set() 67 | 68 | self._set_exclusions() 69 | 70 | def files(self,parallel=False): 71 | """ 72 | Process the files. 73 | if parallel is False or <= 1 will run hashes serially 74 | Otherwise specify True to use all cores or specify a number 75 | """ 76 | 77 | # The hash_db is essentially the same as the filelist but it does 78 | # *not* need to be the latest a certain sync-pair has seen. It should 79 | # just be the latest parse of the files. The general idea is that 80 | # if the ['mtime','path','size'] are identical, no need to recalculate 81 | # the sha1 or adler of the file. 82 | 83 | self.hashes = any(a in utils.HASHFUNS for a in self.attributes) 84 | 85 | if self.hashes: 86 | self.load_hash_db() 87 | 88 | if parallel and self.hashes: 89 | import multiprocessing as mp 90 | if not isinstance(parallel,int): 91 | parallel = mp.cpu_count() 92 | pool = mp.Pool(parallel) 93 | _map = partial(pool.imap,chunksize=10) 94 | else: 95 | pool = None 96 | _map = map 97 | 98 | # Set this up as a chain of generators 99 | items = self._walk(self.path) # Tuples of DirEnty,rootpath 100 | items = map(self._file_info,items) # Dictionaries 101 | 102 | ## Here is where we add hashes 103 | for attribute in self.attributes: 104 | if attribute in utils.HASHFUNS: 105 | items = _map(partial(self.add_hash,hashname=attribute),zip(items,repeat(self.path))) 106 | 107 | # Run it! 108 | result = list(items) 109 | 110 | if pool is not None: 111 | pool.close() 112 | self.process_empty() 113 | 114 | if self.hashes: 115 | self.save_hash_db(result) 116 | 117 | return result 118 | def process_empty(self): 119 | """ 120 | Process empties based on self.empty and self.empties 121 | """ 122 | empty_path = os.path.join(self.path,'.PyFiSync','empty_dirs') 123 | if self.empty == 'reset': 124 | try: 125 | os.remove(empty_path) 126 | except OSError: 127 | pass 128 | elif self.empty == 'store': 129 | try: 130 | os.makedirs(os.path.dirname(empty_path)) 131 | except OSError: 132 | pass 133 | empties = list(self.empties) 134 | with open(empty_path,'wt',encoding='utf8') as fobj: 135 | fobj.write(utils.to_unicode(json.dumps(empties,ensure_ascii=False))) 136 | elif self.empty == 'remove': 137 | with open(empty_path,'rt') as fobj: 138 | prev = set(json.loads(fobj.read())) 139 | # Remove the remaining empty dirs 140 | empties = self.empties - prev # both sets 141 | 142 | # Loop through empty dirs and remove. 
Do it longest to shortest so 143 | # that nested empty dirs are removed 144 | empties = sorted(empties,key=lambda a: (-len(a),a.lower())) 145 | for empty_dir in empties: 146 | try: 147 | os.removedirs(empty_dir) 148 | except OSError: 149 | pass # May just be empty due to exclusion 150 | 151 | def _set_exclusions(self): 152 | """ 153 | Set up and control exclusion 154 | 155 | Note that we separate those with glob patterns from those without 156 | so that those without can be checked via O(1) set while those with 157 | will go through fnmatch. If there is a false positive for glob, 158 | it won't change the final outcome; it will just be slower on that 159 | single run. 160 | """ 161 | GLOBS = '*?[]!' 162 | self.all_excludes = set(self.config.excludes) 163 | 164 | self.exclude_file_full = set() 165 | self.exclude_file = set() 166 | 167 | # non-glob patters are done separately since the exclude can be checked faster 168 | # using a set `in` O(1) instead of fnmatch O(n) (or worse) 169 | self.exclude_file_full_no_glob = set() 170 | self.exclude_file_no_glob = set() 171 | 172 | 173 | self.exclude_dirs_full = set() 174 | self.exclude_dirs = set() 175 | 176 | 177 | for e in self.all_excludes: 178 | e = utils.to_unicode(e) 179 | if e.startswith('/'): # Full 180 | if e.endswith('/'): 181 | self.exclude_dirs_full.add(e) 182 | elif any(g in e for g in GLOBS): 183 | self.exclude_file_full.add(e) 184 | else: 185 | self.exclude_file_full_no_glob.add(e) 186 | else: 187 | if e.endswith('/'): 188 | self.exclude_dirs.add(e) 189 | elif any(g in e for g in GLOBS): 190 | self.exclude_file.add(e) 191 | else: 192 | self.exclude_file_no_glob.add(e) 193 | 194 | def _walk(self,path,_d=0): 195 | """ 196 | Yields tuples of (DirEntry,relpath) since relpath is already computed 197 | and avoids recompute 198 | """ 199 | if path.endswith('/'): 200 | path = path[:-1] # Remove trailing / so we can avoid os.path.join 201 | 202 | 203 | # We only care if anything was returned; directory or file 204 | # Note that based on the nested nature of this, directories are trans- 205 | # versed deep first so if the end is empty (and nothing from them was 206 | # returned) to empty will propagate upwards and will be deleted 207 | # if applicable. 208 | no_returns = True 209 | for item in scandir(path): 210 | 211 | itemname = utils.to_unicode(item.name) 212 | relpath = _relpath(item.path,self.path) 213 | if item.is_dir(follow_symlinks=True): # Always follow directory links 214 | 215 | if fnmatch_mult(itemname +'/',self.exclude_dirs): 216 | continue 217 | 218 | if fnmatch_mult('/'+relpath +'/',self.exclude_dirs_full): 219 | continue 220 | 221 | for subitem in self._walk(item.path,_d=_d+1): 222 | no_returns = False 223 | yield subitem 224 | 225 | elif item.is_file(): 226 | if itemname in self.exclude_file_no_glob: 227 | continue 228 | 229 | if '/'+relpath in self.exclude_file_full_no_glob: 230 | continue 231 | 232 | if fnmatch_mult(itemname,self.exclude_file): 233 | continue 234 | 235 | if fnmatch_mult('/'+relpath,self.exclude_file_full): 236 | continue 237 | 238 | no_returns = False 239 | yield item,relpath 240 | 241 | elif item.is_symlink(): # Must be broken! 242 | self.log.add_err('ERROR: Could not find information on {}\n'.format(relpath) + 243 | ' May be a BROKEN link. Skipping\n') 244 | 245 | 246 | # Was it empty? Note that if there is nothing returned because 247 | # of exclusions, it is still considered empty. 
248 | if no_returns: 249 | self.empties.add(path) 250 | 251 | def _file_info(self,item_relpath): 252 | item,relpath = item_relpath 253 | 254 | stat_attributes = ['ino','size','mtime','birthtime'] 255 | file = {'path':relpath} 256 | 257 | 258 | follow_symlinks = not self.config.copy_symlinks_as_links 259 | 260 | try: 261 | stat = item.stat(follow_symlinks=follow_symlinks) 262 | except OSError as E: 263 | self.log.add_err('\n' + 264 | 'ERROR: Could not find information on {}\n'.format(relpath) + 265 | ' May be a BROKEN link.\n MSG: {}\nskipping...\n'.format(E)) 266 | return 267 | 268 | for attrib in stat_attributes: 269 | try: 270 | file[attrib] = getattr(stat,'st_'+attrib) 271 | except AttributeError: 272 | file[attrib] = 0.0 273 | 274 | # if it cannot get mtime, set to future: 275 | if file['mtime'] == 0: file['mtime'] = time.time()+3600 276 | 277 | return file 278 | 279 | def filter_old_list(self,old_list): 280 | """ 281 | Use the exclusions to filter the old lists 282 | """ 283 | out_list = [] 284 | for ix,file in enumerate(old_list): 285 | dirname,filename = os.path.split(file['path']) 286 | 287 | # file name only -- w/o glob 288 | if filename in self.exclude_file_no_glob: 289 | continue 290 | 291 | # file name only -- w/ glob 292 | if fnmatch_mult(filename,self.exclude_file): 293 | continue 294 | 295 | # Full file 296 | fullfile = '/' + file['path'] 297 | 298 | if fullfile in self.exclude_file_full_no_glob: 299 | continue 300 | 301 | if fnmatch_mult(fullfile,self.exclude_file_full): 302 | continue 303 | 304 | # Dirnames. Need to test the full build up 305 | 306 | #dname = [] 307 | 308 | # dirname only -- test 309 | dname_list = [] 310 | dflag = False 311 | for dname in dirname.split('/'): 312 | dname_list.append(dname) 313 | if fnmatch_mult(dname + '/',self.exclude_dirs): 314 | dflag = True 315 | break 316 | # Full dir 317 | if fnmatch_mult('/'+'/'.join(dname_list)+'/',self.exclude_dirs_full): 318 | dflag = True 319 | break 320 | if dflag: 321 | continue 322 | 323 | out_list.append(file) 324 | return out_list 325 | 326 | 327 | def add_hash(self,file_rootpath,hashname=None): 328 | """ 329 | Add the hash but check the db first. Note that if use_hash_db=False, 330 | the load_hash_db made it empty so we won't find it in the query. 
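The db is keyed on mtime/path/size, so if any of those changed the hash is recomputed from the file contents.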
331 | """ 332 | file,rootpath = file_rootpath 333 | 334 | query = {k:file[k] for k in ['mtime','path','size']} 335 | dbitem = self.hash_db.query_one(**query) 336 | 337 | if dbitem and hashname in dbitem: 338 | file[hashname] = dbitem[hashname] 339 | else: 340 | fullpath = os.path.join(rootpath,file['path']) 341 | file[hashname] = utils.HASHFUNS[hashname](fullpath) 342 | 343 | return file 344 | 345 | 346 | def load_hash_db(self): 347 | hash_db = list() 348 | 349 | hash_path = os.path.join(self.path,'.PyFiSync','hash_db.json') 350 | if self.use_hash_db and os.path.exists(hash_path): 351 | with open(hash_path,'rt',encoding='utf8') as F: 352 | hash_db = json.loads(F.read()) 353 | 354 | self.hash_db = DictTable(hash_db,fixed_attributes=['mtime','path','size']) 355 | 356 | def save_hash_db(self,files): 357 | if not self.use_hash_db: 358 | return 359 | hash_path = os.path.join(self.path,'.PyFiSync','hash_db.json') 360 | try: 361 | os.makedirs(os.path.dirname(hash_path)) 362 | except OSError: 363 | pass 364 | with open(hash_path,'wt',encoding='utf8') as F: 365 | F.write(utils.to_unicode(json.dumps(files))) 366 | 367 | def _relpath(*A,**K): 368 | """ 369 | Return the results of os.relpath but remove leading ./ 370 | """ 371 | res = os.path.relpath(*A,**K) 372 | res = utils.to_unicode(res) 373 | if res.startswith('./'): 374 | return res[2:] 375 | if res == '.': 376 | return '' 377 | return res 378 | 379 | 380 | def exclude_if_present(filesA,filesB,exclude_filename): 381 | """ 382 | Apply a filter to filesA and filesB to exclude any files below 383 | exclude_if_present files 384 | 385 | filesA and filesB are assumed to be DictTables and this happens in place 386 | """ 387 | exclude_dirs = set() 388 | for file in list(filesA) + list(filesB): 389 | path = file['path'] 390 | dirname,filename = os.path.split(path) 391 | if filename == exclude_filename: 392 | exclude_dirs.add(dirname) 393 | 394 | for exclude_dir in exclude_dirs: 395 | try: 396 | filesA.remove(filesA.Q.filter(lambda a:a['path'].startswith(exclude_dir))) 397 | except ValueError: 398 | pass 399 | try: 400 | filesB.remove(filesB.Q.filter(lambda a:a['path'].startswith(exclude_dir))) 401 | except: 402 | pass 403 | 404 | 405 | 406 | def scandir(path,force_listdir=False): 407 | if _scandir is not None and not force_listdir: 408 | for item in _scandir(path): 409 | yield item 410 | 411 | else: 412 | for item in os.listdir(path): 413 | fullpath = os.path.join(path,item) 414 | yield fake_DirEntry(fullpath) 415 | 416 | 417 | class fake_DirEntry(object): 418 | """ 419 | Fake DirEntry object. 420 | 421 | Will be used by backup scandir 422 | """ 423 | # Use __slots__ for better memory 424 | __slots__ = ('path','name','_lstat','_stat','_is_dir','_is_symlink') 425 | 426 | def __init__(self,path): 427 | self.path = path 428 | self.name = os.path.basename(path) 429 | 430 | self._stat = None 431 | self._lstat = None 432 | self._is_dir = None 433 | self._is_symlink = None 434 | 435 | def inode(self,follow_symlinks=True): 436 | """ 437 | The main object doesn't seem to be clear on whether or not 438 | if follows sym links. I added it but call stat first!!! 
439 | """ 440 | if self._stat is None: 441 | self.stat(follow_symlinks=follow_symlinks) 442 | 443 | return self._stat.st_ino 444 | 445 | def is_dir(self,follow_symlinks=True): 446 | if self.is_symlink() and not follow_symlinks: 447 | return False # Symlinks are NEVER dirs when follow_symlinks is False 448 | 449 | if self._is_dir is None: 450 | self._is_dir = os.path.isdir(self.path) 451 | return self._is_dir 452 | 453 | def is_file(self,follow_symlinks=True): 454 | # Make sure it is not a broken link b/c DirEntry will 455 | # tell you both false for file and dir 456 | if self.is_symlink(): 457 | try: 458 | self.stat(follow_symlinks=True) 459 | except OSError: 460 | return False # Broken link 461 | 462 | return not self.is_dir(follow_symlinks=follow_symlinks) 463 | 464 | def stat(self,follow_symlinks=True): 465 | if follow_symlinks: 466 | if self._stat is None: 467 | self._stat = os.stat(self.path) 468 | return self._stat 469 | else: 470 | if self._lstat is None: 471 | self._lstat = os.lstat(self.path) 472 | return self._lstat 473 | 474 | 475 | def is_symlink(self): 476 | if self._is_symlink is None: 477 | self._is_symlink = os.path.islink(self.path) 478 | return self._is_symlink 479 | 480 | def __str__(self): 481 | return '<{0}: {1!r}>'.format(self.__class__.__name__, self.name) 482 | __repr__ = __str__ 483 | 484 | 485 | 486 | 487 | 488 | -------------------------------------------------------------------------------- /PyFiSync/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Collection of utilities I have written along the way that may be useful 4 | 5 | parallel_map -- Improved multiprocessing.map. See references therein 6 | """ 7 | from __future__ import division, print_function, unicode_literals, absolute_import 8 | 9 | import hashlib 10 | import os 11 | import sys 12 | import datetime 13 | import re 14 | import zlib 15 | from io import open 16 | import itertools 17 | import argparse 18 | import copy 19 | from threading import Thread 20 | import getpass 21 | from functools import partial 22 | 23 | try: 24 | from queue import Queue 25 | except ImportError: 26 | from Queue import Queue 27 | 28 | if sys.version_info >= (3,): 29 | unicode = str 30 | xrange = range 31 | 32 | class logger(object): 33 | def __init__(self,path=None,silent=False): 34 | 35 | self.silent = silent 36 | self.path = path 37 | 38 | if path is not None: 39 | filepath = os.path.abspath(os.path.join(path,'.PyFiSync','logs')) 40 | self.path = filepath 41 | try: 42 | os.makedirs(filepath) 43 | except OSError: 44 | pass # Already exists 45 | self.filepath = os.path.join(filepath, 46 | datetime.datetime.now().strftime('%Y-%m-%d_%H%M%S') + '.log') 47 | 48 | # Write the file with nothing but this will overwrite it 49 | with open(self.filepath,'w') as F: 50 | F.write(' ') 51 | 52 | 53 | self.space = 0 54 | self.prepend = '' 55 | def add(self,text,end=u'\n',return_out=False): 56 | 57 | if text is None: 58 | return 59 | text = text.split('\n') 60 | out = [] 61 | for line in text: 62 | out.append(self.prepend + ' '*self.space + line) 63 | 64 | out = '\n'.join(out) 65 | out = to_unicode(out) 66 | 67 | if not self.silent: 68 | try: 69 | print(out,end=end) 70 | except UnicodeEncodeError: # This is a bit hacky but I think there are issues with remove queues and printing 71 | print(out.encode('utf-8'),end=end.encode('utf8')) 72 | 73 | out = out.encode('utf-8') 74 | 75 | if self.path is not None: 76 | with open(self.filepath,'ba') as F: 77 | F.write(out + 
end.encode('utf-8')) 78 | 79 | if return_out: 80 | return out 81 | 82 | def add_err(self,*A,**K): 83 | """ 84 | Same as add except that it will write it to stderr instead of with 'print' 85 | """ 86 | silent0 = self.silent 87 | self.silent = True 88 | 89 | out = self.add(return_out=True,*A,**K) 90 | 91 | self.silent = silent0 92 | 93 | # Now print it to stderr (even if silent!) 94 | end = K.get('end',u'\n') 95 | out = to_unicode('\n') + to_unicode(out) + to_unicode(end) 96 | try: 97 | sys.stderr.write(out) 98 | except UnicodeEncodeError: # This is a bit hacky but I think there are issues with remove queues and printing 99 | sys.stderr.write(out.encode('utf-8')) 100 | 101 | def add_close(self): 102 | if self.path is None: 103 | return 104 | self.line() 105 | self.add('Log saved in {path:s}'.format(path=self.filepath)) 106 | 107 | def line(self): 108 | self.add('='*50,end='\n') 109 | 110 | class configparser(object): 111 | """This will eventually be the configuration""" 112 | default_path = os.path.join(os.path.dirname(__file__),'config_template.py') 113 | def __init__(self,sync_dir=None,remote=None): 114 | 115 | self.sync_dir = sync_dir 116 | 117 | # These must be a lists! 118 | self._listattr = ['move_attributesA','move_attributesB', 119 | 'prev_attributesA','prev_attributesB', 120 | 'excludes'] 121 | 122 | # These must be changed from the defaults (They are not parsed in 123 | # defaults and must be set later) 124 | self._reqattr = ['pathB'] 125 | if sync_dir is not None: 126 | # We parse the input twice since we want to know the remote before 127 | # parsing defaults. But, we do not want to prompt for a password 128 | # twice so we tell it to ignore it here 129 | _tmp = self.parse(getdict=True,pw=False) 130 | self.pathA = os.path.abspath(sync_dir) 131 | if 'remote' not in _tmp: 132 | print('ERROR: Must specify a remote. Must update config file for PyFiSync',file=sys.stderr) 133 | sys.exit(2) 134 | self.parse_defaults(remote=_tmp['remote']) 135 | self.parse() 136 | else: 137 | self.parse_defaults(remote=remote) 138 | 139 | # Some special things 140 | self.excludes = list(set(self.excludes + ['.PBrsync/','.PyFiSync/'])) 141 | 142 | def parse_defaults(self,remote=None): 143 | """ 144 | Parse all defaults from the template except those in self._reqattr 145 | """ 146 | config = dict() 147 | try: 148 | with open(self.default_path,'rt') as F: 149 | txt = F.read() 150 | except: 151 | # This is a hack for when it is in an egg file. I need to figure 152 | # out a better way 153 | import zipfile 154 | _zf = zipfile.ZipFile(self.default_path[:self.default_path.find('/PyFiSync/config_template.py')]) 155 | txt = _zf.read('PyFiSync/config_template.py') 156 | txt = to_unicode(txt) 157 | 158 | if remote is None: 159 | remote = 'rsync' 160 | txt = self._filterconfig(txt,remote=remote) 161 | 162 | exec_(txt,config) 163 | 164 | for key,val in config.items(): 165 | # Unike `parse`, there is no need to check for lists since 166 | # this isn't user code 167 | if key in self._reqattr: 168 | continue 169 | setattr(self,key,val) 170 | 171 | @property 172 | def configpath(self): 173 | for ext in ['','.py']: 174 | config_path = os.path.join(self.sync_dir,'.PyFiSync','config'+ext) 175 | if os.path.exists(config_path): 176 | break 177 | else: 178 | sys.stderr.write('ERROR Could not find config file. 
Did you run `init`?\n') 179 | sys.exit(2) 180 | return config_path 181 | 182 | def parse(self,getdict=False,pw=True): 183 | none = lambda *a,**k:None 184 | config = dict(pwprompt=getpass.getpass if pw else none) 185 | with open(self.configpath,'rt') as F: 186 | txt = F.read() 187 | # Make sure this is executed from the base path 188 | # but then switch back just in case it messes up other stuff 189 | pwd0 = os.getcwd() 190 | os.chdir(self.sync_dir) 191 | exec_(txt,config) 192 | os.chdir(pwd0) 193 | 194 | if getdict: 195 | return config 196 | 197 | for key,val in config.items(): 198 | if key in self._listattr and not isinstance(val,list): 199 | if isinstance(val,(set,tuple)): 200 | val = list(val) 201 | else: 202 | val = [val] 203 | setattr(self,key,val) 204 | 205 | # Some minor adjustments 206 | self.mod_resolution = float(self.mod_resolution) 207 | # Aliases 208 | if hasattr(self,'symlinks'): 209 | self.copy_symlinks_as_links = not self.symlinks 210 | 211 | @classmethod 212 | def config_example(cls,remote='rsync'): 213 | """ Return an example configuration""" 214 | with open(cls.default_path,'rt') as F: 215 | config = F.read() 216 | return configparser._filterconfig(config,remote=remote) 217 | 218 | @staticmethod 219 | def _filterconfig(config,remote='rsync'): 220 | # remove anything that is not part of this remote 221 | from .remote_interfaces import REMOTES 222 | if remote not in REMOTES: 223 | raise ValueError('Not a valid remote') 224 | for rem in REMOTES: 225 | if rem == remote: 226 | repr = r'\1' 227 | else: 228 | repr = '' 229 | regex = r'^#[\ \t]*?\(.*?)#[\ \t]*?\<[\/\\]FLAG\>'.replace('FLAG',rem) 230 | config = re.sub(regex,repr,config,flags=re.MULTILINE|re.DOTALL) 231 | 232 | # Remove more than one empty line from the above replacements 233 | # This can be done with regex but easier with python. 234 | config = '\n'.join(c.rstrip() for c in config.split('\n')) # All empty lines are blank 235 | while '\n\n\n' in config: 236 | config = config.replace('\n\n\n','\n\n') 237 | return config 238 | 239 | def hashlibhash(filepath,BLOCKSIZE=1*1024**2,name='sha1'): 240 | """ 241 | http://pythoncentral.io/hashing-files-with-python/ 242 | 243 | 1024*1024: 1 mb 244 | 4*1024: 4 kb 245 | 246 | """ 247 | hasher = hashlib.new(name) 248 | with open(filepath, 'rb') as afile: 249 | buf = afile.read(BLOCKSIZE) 250 | while len(buf) > 0: 251 | hasher.update(buf) 252 | buf = afile.read(BLOCKSIZE) 253 | if name.startswith('shake'): 254 | return hasher.hexdigest(32) 255 | else: 256 | return hasher.hexdigest() 257 | 258 | def adler(filepath,BLOCKSIZE=1*1024**2): 259 | """ 260 | Create an additive adler32 checksum. Faster than sha1. 261 | 262 | From the documentation: 263 | > Changed in version 3.0: Always returns an unsigned value. 264 | > To generate the same numeric value across all Python versions and 265 | > platforms, use adler32(data) & 0xffffffff. 266 | """ 267 | csum = 1 268 | with open(filepath, 'rb') as afile: 269 | buf = afile.read(BLOCKSIZE) 270 | while len(buf) > 0: 271 | csum = zlib.adler32(buf,csum) 272 | buf = afile.read(BLOCKSIZE) 273 | csum = csum & 0xffffffff 274 | return ('0'*8 + hex(csum)[2:])[-8:] # Preceding 0s 275 | 276 | def dropboxhash(filename): 277 | """ 278 | Compute the dropbox hash of a given file. 
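(The algorithm: SHA-256 of the concatenation of the SHA-256 digests of each 4 MiB block.)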
See [1] for details 279 | This was tested on thier example and against rclone 280 | 281 | [1]: https://www.dropbox.com/developers/reference/content-hash 282 | 283 | """ 284 | subhashes = [] 285 | with open(filename,'rb') as file: 286 | while True: 287 | buf = file.read(4*1024**2) 288 | if len(buf) == 0: 289 | break 290 | subhashes.append(hashlib.sha256(buf).digest()) 291 | return hashlib.sha256(b''.join(subhashes)).hexdigest() 292 | 293 | HASHFUNS = { 294 | 'adler':adler, 295 | 'dbhash':dropboxhash} 296 | for name in hashlib.algorithms_guaranteed: 297 | HASHFUNS[name] = partial(hashlibhash,name=name) 298 | 299 | def to_unicode(txt,verbose=False): 300 | """ 301 | Convert input to unicode if it can! 302 | """ 303 | for objtype in [list,tuple,set]: 304 | if isinstance(txt,objtype): 305 | return objtype(to_unicode(a) for a in txt) 306 | if isinstance(txt,unicode): 307 | return txt 308 | if hasattr(txt,'decode'): 309 | return txt.decode('utf8') 310 | 311 | class RawSortingHelpFormatter(argparse.RawDescriptionHelpFormatter): 312 | """ 313 | argparse help formatter that uses RawDescriptionHelpFormatter but 314 | alphebatizes by the long-form action and lower case 315 | 316 | Based on https://stackoverflow.com/a/12269143/3633154 317 | WARNING: Uses non-documented behavior but it *should* be fine 318 | """ 319 | # override parent 320 | def add_arguments(self, actions): 321 | actions = sorted(actions, key=self._sortkey) 322 | super(RawSortingHelpFormatter, self).add_arguments(actions) 323 | 324 | # new 325 | def _sortkey(self,action): 326 | """ 327 | Sorter for optional strings. Sort by lower case of long 328 | argument otherwise short 329 | """ 330 | options = copy.copy(action.option_strings) 331 | options.sort(key=self._count_leading_dash) 332 | return tuple(opt.lower() for opt in options) 333 | 334 | def _count_leading_dash(self,item): 335 | count = 0 336 | while item.startswith('-'): 337 | count += -1 338 | item = item[1:] 339 | return count 340 | 341 | 342 | def move_txt(src,dst): 343 | """Apply some pretty printing to moves""" 344 | _fjoin = lambda s: '' if len(s) == 0 else (os.sep if s[0] == '' else '') + os.sep.join(s) 345 | 346 | # Split as sep and add it in 347 | srcs = src.split(os.sep) 348 | dsts = dst.split(os.sep) 349 | 350 | comb = [] 351 | 352 | for s,d in zip(srcs,dsts): 353 | if s != d: 354 | break 355 | comb.append(s) 356 | sremain = _fjoin(srcs[len(comb):]) 357 | dremain = _fjoin(dsts[len(comb):]) 358 | comb = _fjoin(comb) 359 | 360 | if len(comb)>2 and len(sremain)>0 and len(dremain)>0: 361 | # Just so that we aren't doing this for nothing 362 | mtxt = comb + os.sep + '{' + sremain + ' --> ' + dremain + '}' 363 | else: 364 | mtxt = '{src:s} --> {dst:s}'.format(src=src,dst=dst) 365 | 366 | while os.sep*2 in mtxt: 367 | mtxt = mtxt.replace(os.sep*2,os.sep) 368 | 369 | return mtxt 370 | 371 | 372 | 373 | class ReturnThread(Thread): 374 | """ 375 | Like a regular thread except when you `join`, it returns the function 376 | result. 
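(The result is passed back through an internal Queue.)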
And it assumes a target is always passed 377 | """ 378 | def __init__(self,**kwargs): 379 | self.target = kwargs.pop('target',False) 380 | if self.target is False: 381 | raise ValueError('Must specify a target') 382 | self.q = Queue() 383 | super(ReturnThread, self).__init__(target=self._target,**kwargs) 384 | 385 | def _target(self,*args,**kwargs): 386 | self.q.put( self.target(*args,**kwargs) ) 387 | 388 | def join(self,**kwargs): 389 | super(ReturnThread, self).join(**kwargs) 390 | res = self.q.get() 391 | self.q.task_done() 392 | self.q.join() 393 | return res 394 | 395 | def RFC3339_to_unix(timestr): 396 | """ 397 | Parses RFC3339 into a unix time 398 | """ 399 | d,t = timestr.split('T') 400 | year,month,day = d.split('-') 401 | 402 | t = t.replace('Z','-00:00') # zulu time 403 | t = t.replace('-',':-').replace('+',':+') # Add a new set 404 | hh,mm,ss,tzhh,tzmm = t.split(':') 405 | 406 | offset = -1 if tzhh.startswith('-') else +1 407 | tzhh = tzhh[1:] 408 | 409 | try: 410 | ss,micro = ss.split('.') 411 | except ValueError: 412 | ss = ss 413 | micro = '00' 414 | micro = micro[:6] # Python doesn't support beyond 999999 415 | 416 | dt = datetime.datetime(int(year),int(month),int(day), 417 | hour=int(hh),minute=int(mm),second=int(ss), 418 | microsecond=int(micro)) 419 | unix = (dt - datetime.datetime(1970,1,1)).total_seconds() 420 | 421 | # Account for timezone which counts backwards so -= 422 | unix -= int(tzhh)*3600*offset 423 | unix -= int(tzmm)*60*offset 424 | return unix 425 | 426 | def imitate_hash(mydict): 427 | """ 428 | Imitate the hash. This is crude and imperfect but fine for replacing 429 | a missing hash 430 | """ 431 | hasher = hashlib.sha1() 432 | hasher.update(repr(mydict).encode('utf8')) 433 | return hasher.hexdigest() 434 | 435 | 436 | def bytes2human(byte_count,base=1024,short=True): 437 | """ 438 | Return a value,label tuple 439 | """ 440 | if base not in (1024,1000): 441 | raise ValueError('base must be 1000 or 1024') 442 | 443 | labels = ['kilo','mega','giga','tera','peta','exa','zetta','yotta'] 444 | name = 'bytes' 445 | if short: 446 | labels = [l[0] for l in labels] 447 | name = name[0] 448 | labels.insert(0,'') 449 | 450 | best = 0 451 | for ii in range(len(labels)): 452 | if (byte_count / (base**ii*1.0)) < 1: 453 | break 454 | best = ii 455 | 456 | return byte_count / (base**best*1.0),labels[best] + name 457 | 458 | def file_summary(files): 459 | N = len(files) 460 | s = sum(f['size'] for f in files if f) 461 | s = bytes2human(s) 462 | return "{:d} files, {:0.2f} {:s}".format(N,s[0],s[1]) 463 | 464 | 465 | ########################### six extracted codes ########################### 466 | # This is pulled from the python six module (see links below) to work 467 | # around some python 2.7.4 issues 468 | # Links: 469 | # https://github.com/benjaminp/six 470 | # https://pypi.python.org/pypi/six 471 | # http://pythonhosted.org/six/ 472 | ############################################################################## 473 | # Copyright (c) 2010-2018 Benjamin Peterson 474 | # 475 | # Permission is hereby granted, free of charge, to any person obtaining a copy 476 | # of this software and associated documentation files (the "Software"), to deal 477 | # in the Software without restriction, including without limitation the rights 478 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 479 | # copies of the Software, and to permit persons to whom the Software is 480 | # furnished to do so, subject to the following conditions: 481 | # 482 | # The 
above copyright notice and this permission notice shall be included in 483 | # all copies or substantial portions of the Software. 484 | # 485 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 486 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 487 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 488 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 489 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 490 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 491 | # THE SOFTWARE. 492 | ############################################################################# 493 | if sys.version_info[0]>2: 494 | exec('exec_ = exec') 495 | else: 496 | def exec_(_code_, _globs_=None, _locs_=None): 497 | """Execute code in a namespace.""" 498 | if _globs_ is None: 499 | frame = sys._getframe(1) 500 | _globs_ = frame.f_globals 501 | if _locs_ is None: 502 | _locs_ = frame.f_locals 503 | del frame 504 | elif _locs_ is None: 505 | _locs_ = _globs_ 506 | exec("""exec _code_ in _globs_, _locs_""") 507 | 508 | 509 | 510 | 511 | 512 | -------------------------------------------------------------------------------- /PyFiSync/dicttable.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import unicode_literals 3 | 4 | __version__ = "20200825" 5 | __author__ = "Justin Winokur" 6 | 7 | import copy 8 | from collections import defaultdict 9 | import uuid 10 | import types 11 | import sys 12 | 13 | if sys.version_info[0] > 2: 14 | unicode = str 15 | 16 | class ExcludedAttributeError(ValueError): 17 | pass 18 | 19 | class DictTable(object): 20 | """ 21 | DictTable: 22 | Create an in-memeory single table DB from a list of dictionaries that may 23 | be queried by any specified attribute. 24 | 25 | This is useful since, once created, lookup/query/"in" checks areO(1), 26 | Creation is still O(N) 27 | 28 | Note: Unless an entry is changed with update(), it must be reindexed 29 | 30 | Inputs: 31 | -------- 32 | items [ *empty* ] (list) 33 | Iterable of dictionaries with each attribute. Can also be a DictTable. 34 | If specified as a DictTable, other options are still settable here. 35 | 36 | fixed_attributes [None] (list, None) 37 | Specify _specific_ attributes to index for each item. Will *only* index 38 | them unless add_fixed_attribute('new_attribute') is called. 39 | 40 | If None, will use _all_ attributed *except* those of exclude_attributes 41 | 42 | exclude_attributes [ *empty* ] (list) 43 | Attributes that shouldn't ever be added even if attributes=None for 44 | dynamic addition of attributes. 45 | 46 | Multiple Values per attribute 47 | ----------------------------- 48 | A "row" can have multiple values per attribute as follows: 49 | 50 | {'attribute':[val1,val2,val3]} 51 | 52 | and can be queried for any (or all) values. 53 | 54 | Additional Opperations: 55 | ---------------------- 56 | This supports index-lookup with a dictionary as well as 57 | a python `in` check and lookup by a dictionary 58 | 59 | The code will allow you to edit/delete/update multiple items at once 60 | (just like a standard database). Use with caution. 61 | 62 | Tips: 63 | ------ 64 | * You can simply dump the DB with JSON using the DB.items() 65 | and then reload it with a new DB 66 | 67 | * There is also an attribute called `_index` which can be used to 68 | query by index. 
69 | 70 | """ 71 | def __init__(self, items=None, 72 | fixed_attributes=None,exclude_attributes=None): 73 | 74 | # These are used to make sure the DB.Query is (a) from this DB and (b) 75 | # the DB hasn't changed. This *should* always be the case 76 | self._id = unicode(uuid.uuid4()) 77 | self._c = 0 78 | 79 | # Handle inputs 80 | if items is None: 81 | items = list() 82 | 83 | if exclude_attributes is None: 84 | exclude_attributes = set() 85 | if isinstance(exclude_attributes,(str,unicode)): 86 | exclude_attributes = [exclude_attributes] 87 | self.exclude_attributes = set(exclude_attributes) 88 | 89 | if fixed_attributes: 90 | if isinstance(fixed_attributes,(str,unicode)): 91 | fixed_attributes = [fixed_attributes] 92 | self.fixed_attributes = list(fixed_attributes) 93 | else: 94 | self.fixed_attributes = list() 95 | 96 | self.N = 0 # Will keep track 97 | self._list = [] 98 | self._lookup = defaultdict(_new_defaultdict_list) 99 | 100 | self._empty = _emptyList() 101 | self._ix = set() 102 | 103 | # Add the items 104 | for item in items: 105 | self.add(item) 106 | 107 | def add(self,item): 108 | """ 109 | Add an item or items to the DB 110 | """ 111 | if isinstance(item,(list,tuple,types.GeneratorType)): 112 | for it in item: 113 | self.add(it) 114 | return 115 | 116 | ix = len(self._list) # The length will be 1+ the last ix so do not change this 117 | 118 | # Add built in ones if it is there 119 | attribs = self.fixed_attributes if self.fixed_attributes else item.keys() 120 | 121 | for attrib in attribs: 122 | if attrib not in item or attrib in self.exclude_attributes: 123 | continue 124 | self._append(attrib,item[attrib],ix) # Add it to the index 125 | 126 | # Finally add it 127 | self._list.append(item) 128 | self.N += 1 129 | self._ix.add(ix) 130 | 131 | def query(self,*args,**kwargs): 132 | """ 133 | Query the value for attribute. Will always an iterator. Use 134 | `list(DB.query())` to return a list 135 | 136 | Usage 137 | ----- 138 | 139 | Any combination of the following will works 140 | 141 | Keywords: Only check equality 142 | 143 | >>> DB.query(attrib=val) 144 | >>> DB.query(attrib1=val1,attrib2=val2) # Match both 145 | 146 | >>> DB.query({'attrib':val}) 147 | >>> DB.query({'attrib1':val1,'attrib2':val2}) # Match Both 148 | 149 | Query Objects (DB.Q, DB.Query) 150 | 151 | >>> DB.query(DB.Q.attrib == val) 152 | >>> DB.query( (DB.Q.attrib1 == val1) & (DB.Q.attrib1 == val2) ) # Parentheses are important! 153 | >>> DB.query( (DB.Q.attrib1 == val1) & (DB.Q.attrib1 != val2) ) 154 | 155 | """ 156 | ixs = self._ixs(*args,**kwargs) 157 | for ix in ixs: 158 | yield self._list[ix] 159 | 160 | def query_one(self,*args,**kwargs): 161 | """ 162 | Return a single item from a query. See "query" for more details. 163 | 164 | Returns None if nothing matches 165 | """ 166 | try: 167 | return next(self.query(*args,**kwargs)) 168 | except StopIteration: 169 | return None 170 | 171 | def count(self,*args,**kwargs): 172 | """ 173 | Return the number of matched rows for a given query. 
See "query" for 174 | details on query construction 175 | """ 176 | return len(self._ixs(*args,**kwargs)) 177 | 178 | def isin(self,*args,**kwargs): 179 | """ 180 | Check if there is at least one item that matches the given query 181 | 182 | see query() for usage 183 | """ 184 | return self.count(*args,**kwargs) > 0 185 | 186 | def reindex(self,*attributes): 187 | """ 188 | Reindex the dictionary for specified attributes (or all) 189 | 190 | Usage 191 | ----- 192 | 193 | >>> DB.reindex() # All 194 | >>> DB.reindex('attrib') # Reindex 'attrib' 195 | >>> DB.reindex('attrib1','attrib2') # Multiple 196 | 197 | See Also 198 | -------- 199 | update() method which does not require reindexing 200 | """ 201 | if len(attributes) == 0: 202 | attributes = self.attributes 203 | 204 | if any(a in self.exclude_attributes for a in attributes): 205 | raise ValueError('Cannot reindex an excluded attribute') 206 | 207 | for attribute in attributes: 208 | self._lookup[attribute] = defaultdict(list) # Reset 209 | 210 | for ix,item in enumerate(self._list): 211 | if item is None: continue 212 | for attrib in attributes: 213 | if attrib in item: 214 | self._append(attrib,item[attrib],ix) 215 | 216 | def update(self,*args,**queryKWs): 217 | """ 218 | Update an entry without needing to reindex the DB (or a specific 219 | attribute) 220 | 221 | Usage: 222 | ------ 223 | 224 | >>> DB.update(updated_dict, query_dict_or_Query, query_attrib1=val1,...) 225 | >>> DB.update(updated_dict, query_attrib1=val1,...) 226 | 227 | Inputs: 228 | ------- 229 | 230 | updated_dict : Dictionary with which to update the entry. This is 231 | done using the typical dict().update() construct to 232 | overwrite it 233 | 234 | query_dict_or_Query 235 | : Either the dictionary used in the query or a Query that 236 | defines a more advanced query 237 | 238 | query_attrib1=val1 239 | : Additional (or sole) query attributes 240 | 241 | Notes: 242 | ------ 243 | * Updating an item requires a deletion in a list that has length n 244 | equal to the number of items matching an attribute. This is O(n). 245 | However changing the entry directly and reindexing is O(N) where 246 | N is the size of the DB. If many items are changing and you do not 247 | need to query them in between, it *may* be faster to directly 248 | update the item and reindex 249 | """ 250 | 251 | if len(args) == 1: 252 | updated_dict = args[0] 253 | query = {} 254 | elif len(args) == 2: 255 | updated_dict,query = args 256 | else: 257 | raise ValueError('Incorrect number of inputs. See documentation') 258 | 259 | if not isinstance(updated_dict,dict): 260 | raise ValueError('Must specify updated values as a dictionary') 261 | 262 | if isinstance(query,Query): 263 | ixs = self._ixs(query,**queryKWs) 264 | elif isinstance(query,dict): 265 | queryKWs.update(query) 266 | ixs = self._ixs(**queryKWs) 267 | else: 268 | raise ValueError('Unrecognized query {:s}. Must be a dict or Query',format(type(query))) 269 | 270 | if len(ixs) == 0: 271 | raise ValueError('Query did not match any results') 272 | 273 | for ix in ixs: 274 | # Get original item 275 | item = self._list[ix] 276 | 277 | # Allow the update to also include non DB attributes. 
278 | # The intersection will eliminate any exclude_attributes 279 | attributes = set(updated_dict.keys()).intersection(self.attributes) 280 | 281 | for attrib in attributes: # Only loop over the updated attribs 282 | value = item[attrib] # get old value 283 | self._remove(attrib,value,ix) # Remove any ix matching it 284 | value = updated_dict[attrib] # Get new value 285 | self._append(attrib,value,ix) # Add ix to any new value 286 | 287 | item.update(updated_dict) # Update the item 288 | 289 | def add_fixed_attribute(self,attrib,force=False): 290 | """ 291 | Adds a fixed attribute. If there are NO fixed attributes (i.e. it is 292 | dynamic attributes), do *NOT* add them unless force. 293 | 294 | Will reindex either way 295 | """ 296 | if attrib in self.exclude_attributes: 297 | raise ExcludedAttributeError("'{}' is excludes".format(attrib)) 298 | 299 | if self.fixed_attributes or force and attrib not in self.fixed_attributes: # Must already be filled or forced 300 | self.fixed_attributes.append(attrib) 301 | 302 | self.reindex(attrib) 303 | 304 | def remove(self,*args,**kwargs): 305 | """ 306 | Remove item that matches a given attribute or dict. See query() for 307 | input specification 308 | ----------- 309 | """ 310 | ixs = list(self._ixs(*args,**kwargs)) 311 | 312 | if len(ixs) == 0: 313 | raise ValueError('No matching items') 314 | 315 | items = [] 316 | 317 | for ix in ixs[:]: # Must remove it from everything. 318 | # not sure what is happening, but it seems that I need to make a copy 319 | # since Python is doing something strange here... 320 | 321 | item = self._list[ix] 322 | for attrib in self.attributes: 323 | if attrib in item: 324 | self._remove(attrib,item[attrib],ix) 325 | 326 | # Remove it from the list by setting to None. Do not reshuffle 327 | # the indices. A None check will be performed elsewhere 328 | self._list[ix] = None 329 | self._ix.difference_update([ix]) 330 | self.N -= 1 331 | 332 | def copy(self): 333 | return DictTable(self, 334 | exclude_attributes=copy.copy(self.exclude_attributes), 335 | fixed_attributes=copy.copy(self.fixed_attributes)) 336 | __copy__ = copy 337 | 338 | @property 339 | def Query(self): 340 | """ 341 | Query object already loaded with the DB 342 | 343 | DB.Query <==> DB.Q 344 | """ 345 | return Query(self) 346 | Q = Query 347 | 348 | def _ixs(self,*args,**kwargs): 349 | """ 350 | Get the inde(x/ies) of matching information 351 | """ 352 | if not hasattr(self,'_lookup') or self.N==0: # It may be empty 353 | return [] 354 | 355 | # Make the entire kwargs be lists with default of []. Edge case of 356 | # multiple items 357 | for key,val in kwargs.items(): 358 | if not isinstance(val,list): 359 | kwargs[key] = [val] 360 | kwargs = defaultdict(list,kwargs) 361 | 362 | Q = Query(self) # Empty object 363 | for arg in args: 364 | if isinstance(arg,Query): 365 | if arg._id != self._id: 366 | raise ValueError("Cannot use another DictTable's Query object") 367 | 368 | Q = Q & arg # Will add these conditions. 
If Q is empty, will just be arg 369 | continue 370 | if isinstance(arg,dict): 371 | for key,val in arg.items(): # Add it rather than update in case it is already specified 372 | kwargs[key].append(val) 373 | else: 374 | raise ValueError('unrecognized input of type {:s}'.format(str(type(arg)))) 375 | 376 | # Construct a query for kwargs 377 | for key,value in kwargs.items(): 378 | if isinstance(value,list) and len(value) == 0: 379 | value = [self._empty] 380 | for val in _makelist(value): 381 | Qtmp = Query(self) 382 | Qtmp._attr = key 383 | Q = Q & (Qtmp == val) 384 | 385 | ixs = Q._ixs 386 | return list(ixs) 387 | 388 | def _index(self,ix): 389 | """ 390 | Return ix if it hasn't been deleted 391 | """ 392 | try: 393 | item = self._list[ix] 394 | except IndexError: 395 | return [] 396 | 397 | if item is None: 398 | return [] 399 | return [ix] 400 | 401 | def _append(self,attrib,value,ix): 402 | """ 403 | Add to the lookup and update the modify time 404 | """ 405 | # Final check but we should be guarded from this 406 | if attrib in self.exclude_attributes: 407 | #print('BAD! Should guard against this in public methods!') 408 | raise ValueError('Cannot reindex an excluded attribute') 409 | 410 | valueL = _makelist(value) 411 | for val in valueL: 412 | self._lookup[attrib][val].append(ix) 413 | if len(valueL) == 0: 414 | self._lookup[attrib][self._empty].append(ix) # empty list 415 | 416 | self._c += 1 417 | 418 | def _remove(self,attrib,value,ix): 419 | """ 420 | Remove from the lookup and update the modify time 421 | """ 422 | valueL = _makelist(value) 423 | for val in valueL: 424 | try: 425 | self._lookup[attrib][val].remove(ix) 426 | except ValueError: 427 | raise ValueError('Item not found in internal lookup. May need to first call reindex()') 428 | if len(valueL) == 0: 429 | self._lookup[attrib][self._empty].remove(ix) # empty list 430 | 431 | self._c += 1 432 | 433 | def __contains__(self,check_diff): 434 | if not ( isinstance(check_diff,dict) or isinstance(check_diff,Query)): 435 | raise ValueError('Python `in` queries should be a of {attribute:value} or Query') 436 | return self.isin(check_diff) 437 | 438 | def __len__(self): 439 | return self.N 440 | 441 | def __getitem__(self,item): 442 | if isinstance(item,dict) or isinstance(item,Query): 443 | return self.query_one(item) 444 | elif isinstance(item,int): # numbered item 445 | if self._list[item] is not None: 446 | return self._list[item] 447 | else: 448 | raise ValueError("Index has been deleted") 449 | else: 450 | raise ValueError("Must specify DB[{'attribute':val}] or DB[index]'") 451 | __call__ = query 452 | 453 | def __iter__(self): 454 | return (item for item in self._list if item is not None) 455 | items = __iter__ 456 | 457 | @property 458 | def attributes(self): 459 | # The attributes are the keys of _lookup but _lookup is a defaultdict 460 | # of a defaultdict(list) so need to check that it is also empy 461 | if self.fixed_attributes: 462 | return self.fixed_attributes 463 | 464 | attribs = [] 465 | # This seems slow but the second for-loop will break at the first non-empty 466 | # item (likely the first one) 467 | for attrib,val in self._lookup.items(): 468 | if not val: # Empty 469 | continue 470 | for v in val.values(): 471 | if v: break 472 | else: 473 | continue 474 | 475 | attribs.append(attrib) 476 | attribs.sort() 477 | return attribs 478 | 479 | def _makelist(input): 480 | if isinstance(input,list): 481 | return input 482 | return [input] 483 | 484 | class _emptyList(object): 485 | def __init__(self): 486 | pass 
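    # _emptyList is a hashable sentinel stored in the lookup for rows whose
    # attribute is an empty list (lists themselves cannot be dict keys).
    # The fixed hash below keeps the sentinel usable as a key, and __eq__
    # returns True only when compared against an empty list.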
487 | def __hash__(self): 488 | return 9999999999999 489 | def __eq__(self,other): 490 | return isinstance(other,list) and len(other)==0 491 | 492 | def _new_defaultdict_list(): 493 | return defaultdict(list) 494 | 495 | class Query(object): 496 | """ 497 | Query objects. This works by returning an updated *copy* of the object 498 | whenever it is acted upon 499 | 500 | Calling 501 | * Q.attribute sets attribute and returns a copy 502 | * Q.attribute == val (or any other comparison) set the index of elements 503 | * Q1 & Q1 or other boolean perform set operations 504 | 505 | Useful Methods: 506 | _filter : (or just `filter` if not an attribute): Apply a filter 507 | to the DB 508 | """ 509 | def __init__(self,DB): 510 | self._DB = DB 511 | self._ixs = DB._ix # Everything. Do *NOT* copy but also never modify in place 512 | self._attr = None 513 | 514 | self._c = DB._c 515 | self._id = DB._id 516 | 517 | def _valid(self): 518 | if self._c != self._DB._c: 519 | raise ValueError('This query object is out of date from the DB. Create a new one') 520 | 521 | def _filter(self,filter_func): 522 | """ 523 | If 'filter' is NOT an attribute of the DB, this can be called 524 | with 'filter' instead of '_filter' 525 | 526 | Apply a filter to the data that returns True if it matches and False 527 | otherwise 528 | 529 | Note that filters are O(N) 530 | """ 531 | self._valid() # Actually, these would still work but still check 532 | ixs = set() 533 | for ix,item in enumerate(self._DB._list): # loop all 534 | if item is None: 535 | continue 536 | if filter_func(item): 537 | ixs.add(ix) 538 | self._ixs = ixs # reset it 539 | return self 540 | 541 | # Comparisons 542 | def __eq__(self,value): 543 | self._valid() 544 | 545 | if not self._ixs: 546 | return self 547 | 548 | # Account for '_index' attribute (May be deprecated in the future...) 549 | if self._attr == '_index': 550 | self._ixs = self._ixs.intersection({value}) # replace, don't update 551 | return self 552 | for val in _makelist(value): 553 | self._ixs = self._ixs.intersection(self._DB._lookup[self._attr][val]) # Will return [] if _attr or val not there . 
Replace, don't update
554 |         return self
555 | 
556 |     def __ne__(self,value):
557 |         self._ixs = self._DB._ix - (self == value)._ixs
558 |         return self
559 | 
560 |     def __lt__(self,value):
561 |         self._valid() # Strictly, these would still work, but check anyway
562 |         ixs = set()
563 |         for ix,item in enumerate(self._DB._list): # loop all
564 |             if item is None or self._attr not in item:
565 |                 continue
566 |             for ival in _makelist(item[self._attr]):
567 |                 if ival < value:
568 |                     ixs.add(ix)
569 |         self._ixs = ixs
570 |         return self
571 | 
572 |     def __le__(self,value):
573 |         self._valid() # Strictly, these would still work, but check anyway
574 |         ixs = set()
575 |         for ix,item in enumerate(self._DB._list): # loop all
576 |             if item is None:
577 |                 continue
578 |             if self._attr in item and item[self._attr] <= value:
579 |                 ixs.add(ix)
580 |         self._ixs = ixs
581 |         return self
582 | 
583 |     def __gt__(self,value):
584 |         self._valid() # Strictly, these would still work, but check anyway
585 |         ixs = set()
586 |         for ix,item in enumerate(self._DB._list): # loop all
587 |             if item is None:
588 |                 continue
589 |             if self._attr in item and item[self._attr] > value:
590 |                 ixs.add(ix)
591 |         self._ixs = ixs
592 |         return self
593 | 
594 |     def __ge__(self,value):
595 |         self._valid() # Strictly, these would still work, but check anyway
596 |         ixs = set()
597 |         for ix,item in enumerate(self._DB._list): # loop all
598 |             if item is None:
599 |                 continue
600 |             if self._attr in item and item[self._attr] >= value:
601 |                 ixs.add(ix)
602 |         self._ixs = ixs
603 |         return self
604 | 
605 |     # Logic
606 |     def __and__(self,Q2):
607 |         self._ixs = self._ixs.intersection(Q2._ixs)
608 |         return self
609 | 
610 |     def __or__(self,Q2):
611 |         self._ixs = self._ixs.union(Q2._ixs)
612 |         return self
613 | 
614 |     def __invert__(self):
615 |         self._ixs = self._DB._ix - self._ixs
616 |         return self
617 | 
618 |     # Attributes
619 |     def __getattr__(self,attr):
620 |         if self._attr is not None:
621 |             raise ValueError('Already set attribute')
622 |         if attr == 'filter' and 'filter' not in self._DB.attributes:
623 |             return self._filter
624 | 
625 |         self._attr = attr
626 |         if attr == '_index':
627 |             return self
628 | 
629 |         ixs = set()
630 |         for vals in self._DB._lookup[attr].values():
631 |             ixs.update(vals)
632 |         self._ixs = ixs
633 |         return self
634 | 
635 | 
636 | 
637 | 
638 | 
639 | 
640 | 
641 | 
642 | 
643 | 
644 | 
645 | 
646 | 
647 | 
648 | 
649 | 
650 | 
651 | 
652 | 
653 | 
--------------------------------------------------------------------------------
/PyFiSync/ldtable.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from __future__ import unicode_literals
3 | 
4 | __version__ = "20191123"
5 | __author__ = "Justin Winokur"
6 | 
7 | import copy
8 | from collections import defaultdict
9 | import time
10 | import types
11 | 
12 | 
13 | class ldtable(object):
14 |     def __init__(self, items=None, attributes=None, default_attribute=None,
15 |                  exclude_attributes=None, indexObjects=False):
16 |         """
17 |         ldtable:
18 |         Create an in-memory single-table DB from a list of dictionaries that
19 |         may be queried by any specified attribute.
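
        A short usage sketch (attribute names here are made up for
        illustration):

            >>> DB = ldtable([{'name':'a','size':1},
            ...               {'name':'b','size':2}])
            >>> DB.query_one(name='b')['size']
            2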
20 | 
21 |         This is useful since, once created, lookup/query/"in" checks are
22 |         O(1); creation is still O(N)
23 | 
24 |         Note: entries changed by any means other than update() require a
25 |         reindex()
26 | 
27 |         Inputs:
28 |         --------
29 |         items [ *empty* ] (list)
30 |             List of dictionaries with each attribute
31 | 
32 |         attributes [None] (list, None)
33 |             Either a list of attributes to index, or if specified as None
34 |             (default) it will add every attribute of every item and assign
35 |             missing ones to be `default_attribute` (unless excluded)
36 | 
37 |             NOTE: If attributes is set, you may still add items with extra
38 |                   attributes. They just won't be indexed.
39 |                   Or use add_attribute()
40 | 
41 |         exclude_attributes [ *empty* ] (list)
42 |             Attributes that shouldn't ever be added, even if attributes=None
43 |             for dynamic addition of attributes.
44 | 
45 |         default_attribute [None] (*any*)
46 |             Default attribute to assign if `attributes=None` and it is missing.
47 |             If the specified object is callable, it will call it (for example,
48 |             `default_attribute=list` will make it an empty list). Otherwise,
49 |             it will set it to whatever value is specified.
50 | 
51 |         Options: (These may be changed later too)
52 |         --------
53 |         indexObjects: [False]
54 |             If True, will automatically take any object and use its
55 |             __dict__ as the dict.
56 | 
57 |             Note
58 |             * Changing to False after adding an object will cause issues.
59 |             * Does not support __slots__ since they are immutable
60 | 
61 |         Multiple Values per attribute
62 |         -----------------------------
63 |         A "row" can have multiple values per attribute as follows:
64 | 
65 |             {'attribute':[val1,val2,val3]}
66 | 
67 |         and can be queried for any (or all) values.
68 | 
69 |         Additional Operations:
70 |         ----------------------
71 |         This supports index lookup with a dictionary as well as a Python
72 |         `in` check by dictionary
73 | 
74 |         The code will allow you to edit/delete/update multiple items at once
75 |         (just like a standard database). Use with caution.
76 | 
77 |         Tips:
78 |         ------
79 |         * You can simply dump the DB to JSON using DB.items()
80 |           and then reload it with a new DB
81 | 
82 |         * There is also an attribute called `_index` which can be used to
83 |           query by index.
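
          For example, a small illustrative lookup of the row stored at
          internal index 0:

              >>> DB.query_one(DB.Q._index == 0)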
83 | 84 | """ 85 | 86 | # Handle inputs 87 | if items is None: 88 | items = list() 89 | 90 | if exclude_attributes is None: 91 | exclude_attributes = list() 92 | self.indexObjects = indexObjects 93 | 94 | 95 | self.attributes = attributes # Will be reset in first add 96 | self._is_attr_None = attributes is None 97 | self.default_attribute = default_attribute 98 | self.exclude_attributes = exclude_attributes 99 | 100 | self.N = 0 # Will keep track 101 | self._list = [] 102 | 103 | self._empty = _emptyList() 104 | self._ix = set() 105 | 106 | # Add the items 107 | for item in items: 108 | self.add(item) 109 | 110 | self._time = time.time() 111 | 112 | # Edge case: No items 113 | if self.attributes is None: 114 | self.attributes = [] 115 | 116 | def add(self,item): 117 | """ 118 | Add an item or items to the DB 119 | """ 120 | if isinstance(item,(list,tuple,types.GeneratorType)): 121 | for it in item: 122 | self.add(it) 123 | return 124 | 125 | # handle other object types 126 | item0 = item 127 | item = self._convert2dict(item) 128 | 129 | if self.N == 0: 130 | attributes = self.attributes 131 | if attributes is None: 132 | attributes = list(item.keys()) 133 | 134 | self.attributes = [attrib for attrib in attributes \ 135 | if attrib not in self.exclude_attributes] # Make a copy 136 | 137 | 138 | # Set up the lookup 139 | self._lookup = {attribute:defaultdict(list) for attribute in self.attributes} 140 | 141 | ix = len(self._list) # The length will be 1+ the last ix so do not change this 142 | 143 | if self._is_attr_None: # Set to None which means we add all 144 | for attrib in item.keys(): 145 | if attrib in self.exclude_attributes: 146 | continue 147 | if attrib not in self.attributes: 148 | self.add_attribute(attrib,self.default_attribute) 149 | # Add built in ones 150 | for attrib in self.attributes: 151 | if attrib not in item: 152 | if hasattr(self.default_attribute, '__call__'): 153 | item[attrib] = self.default_attribute() 154 | else: 155 | item[attrib] = self.default_attribute 156 | 157 | value = item[attrib] 158 | self._append(attrib,value,ix) 159 | 160 | # Finally add it 161 | self._list.append(item0) 162 | self.N += 1 163 | self._ix.add(ix) 164 | 165 | def query(self,*A,**K): 166 | """ 167 | Query the value for attribute. Will always an iterator. Use 168 | `list(DB.query())` to return a list 169 | 170 | Usage 171 | ----- 172 | 173 | Any combination of the following will works 174 | 175 | Keywords: Only check equality 176 | 177 | >>> DB.query(attrib=val) 178 | >>> DB.query(attrib1=val1,attrib2=val2) # Match both 179 | 180 | >>> DB.query({'attrib':val}) 181 | >>> DB.query({'attrib1':val1,'attrib2':val2}) # Match Both 182 | 183 | Query Objects (DB.Q, DB.Qobj) 184 | 185 | >>> DB.query(DB.Q.attrib == val) 186 | >>> DB.query( (DB.Q.attrib1 == val1) & (DB.Q.attrib1 == val2) ) # Parentheses are important! 187 | >>> DB.query( (DB.Q.attrib1 == val1) & (DB.Q.attrib1 != val2)) 188 | 189 | """ 190 | ixs = self._ixs(*A,**K) 191 | for ix in ixs: 192 | yield self._list[ix] 193 | 194 | def query_one(self,*A,**K): 195 | """ 196 | Return a single item from a query. See "query" for more details. 197 | 198 | Returns None if nothing matches 199 | """ 200 | try: 201 | return next(self.query(*A,**K)) 202 | except StopIteration: 203 | return None 204 | 205 | def count(self,*A,**K): 206 | """ 207 | Return the number of matched rows for a given query. 
See "query" for 208 | details on query construction 209 | """ 210 | return len(self._ixs(*A,**K)) 211 | 212 | def isin(self,*A,**K): 213 | """ 214 | Check if there is at least one item that matches the given query 215 | 216 | see query() for usage 217 | """ 218 | 219 | return len(self._ixs(*A,**K))>0 220 | 221 | def reindex(self,*args): 222 | """ 223 | Reindex the dictionary for specified attributes (or all) 224 | 225 | Usage 226 | ----- 227 | 228 | >>> DB.reindex() # All 229 | >>> DB.reindex('attrib') # Reindex 'attrib' 230 | >>> DB.reindex('attrib1','attrib2') # Multiple 231 | 232 | See Also 233 | -------- 234 | update() method which does not require reindexing 235 | """ 236 | if len(args) == 0: 237 | attributes = self.attributes 238 | 239 | # Just an extra check (and makes a copy) 240 | attributes = [attr for attr in attributes \ 241 | if attr not in self.exclude_attributes] 242 | else: 243 | attributes = args 244 | if any(a in self.exclude_attributes for a in args): 245 | raise ValueError('Cannot reindex an excluded attribute') 246 | 247 | for attribute in attributes: 248 | self._lookup[attribute] = defaultdict(list) # Reset 249 | 250 | for ix,item in enumerate(self._list): 251 | if item is None: continue 252 | item = self._convert2dict(item) 253 | for attrib in attributes: 254 | value = item[attrib] 255 | self._append(attrib,value,ix) 256 | 257 | def update(self,*args,**queryKWs): 258 | """ 259 | Update an entry without needing to reindex the DB (or a specific 260 | attribute) 261 | 262 | Usage: 263 | ------ 264 | 265 | >>> DB.update(updated_dict, query_dict_or_Qobj, query_attrib1=val1,...) 266 | >>> DB.update(updated_dict, query_attrib1=val1,...) 267 | 268 | Inputs: 269 | ------- 270 | 271 | updated_dict : Dictionary with which to update the entry. This is 272 | done using the typical dict().update() construct to 273 | overwrite it 274 | 275 | query_dict_or_Qobj 276 | : Either the dictionary used in the query or a Qobj that 277 | defines a more advanced query 278 | 279 | query_attrib1=val1 280 | : Additional (or sole) query attributes 281 | 282 | Notes: 283 | ------ 284 | * Updating an item requires a deletion in a list that has length n 285 | equal to the number of items matching an attribute. This is O(n). 286 | However changing the entry directly and reindexing is O(N) where 287 | N is the size of the DB. If many items are changing and you do not 288 | need to query them in between, it *may* be faster to directly 289 | update the item and reindex 290 | """ 291 | 292 | if len(args) == 1: 293 | updated_dict = args[0] 294 | query = {} 295 | elif len(args) == 2: 296 | updated_dict,query = args 297 | else: 298 | raise ValueError('Incorrect number of inputs. See documentation') 299 | 300 | updated_dict = self._convert2dict(updated_dict) 301 | if not isinstance(updated_dict,dict): 302 | raise ValueError('Must specify updated values as a dictionary') 303 | 304 | query = self._convert2dict(query) 305 | if isinstance(query,Qobj): 306 | ixs = self._ixs(query,**queryKWs) 307 | elif isinstance(query,dict): 308 | queryKWs.update(query) 309 | ixs = self._ixs(**queryKWs) 310 | else: 311 | raise ValueError('Unrecognized query {:s}. Must be a dict or Qobj',format(type(query))) 312 | 313 | if len(ixs) == 0: 314 | raise ValueError('Query did not match any results') 315 | 316 | for ix in ixs: 317 | # Get original item 318 | item = self._list[ix] 319 | item = self._convert2dict(item) 320 | 321 | # Allow the update to also include non DB attributes. 
322 | # The intersection will eliminate any exclude_attributes 323 | attributes = set(updated_dict.keys()).intersection(self.attributes) 324 | 325 | for attrib in attributes: # Only loop over the updated attribs 326 | # get old value 327 | value = item[attrib] 328 | 329 | # Remove any ix matching it 330 | self._remove(attrib,value,ix) 331 | 332 | # Get new value 333 | value = updated_dict[attrib] 334 | 335 | # Add ix to any new value 336 | self._append(attrib,value,ix) 337 | 338 | # Update the item 339 | item.update(updated_dict) 340 | 341 | return 342 | 343 | def add_attribute(self,attribute,*default): 344 | """ 345 | Add an attribute to the index attributes. 346 | 347 | Usage 348 | ----- 349 | >>> DB.add_attribute('new_attrib') # Will raise an error if *any* 350 | # items don't have 'new_attrib' 351 | >>> DB.add_attribute('new_attrib',default) 352 | # Set any missing to the default 353 | 354 | If the `default` is callable, it will call it instead. (such as `list` 355 | to add an empty list) 356 | 357 | """ 358 | if attribute in self.exclude_attributes: 359 | raise ValueError("Can't add exclude_attributes") 360 | 361 | attrib = attribute 362 | if not hasattr(self,'_lookup'): 363 | self._lookup = {} 364 | self._lookup[attribute] = defaultdict(list) 365 | 366 | set_default = False 367 | if len(default) >0: 368 | set_default = True 369 | default = default[0] 370 | 371 | for ix,item in enumerate(self._list): 372 | if item is None: continue 373 | item = self._convert2dict(item) 374 | try: 375 | value = item[attribute] 376 | self._append(attrib,value,ix) 377 | except KeyError as KE: 378 | if set_default: 379 | if hasattr(default, '__call__'): 380 | item[attribute] = default() 381 | else: 382 | item[attribute] = default 383 | else: 384 | raise KeyError("Attribute {:s} not found".format(attrib)) 385 | 386 | value = item[attribute] 387 | self._append(attrib,value,ix) 388 | 389 | self.attributes.append(attribute) 390 | 391 | def remove(self,*A,**K): 392 | """ 393 | Remove item that matches a given attribute or dict. See query() for 394 | input specification 395 | 396 | 397 | DB Options: 398 | This is set at instantiation but can be changed directly 399 | ----------- 400 | 401 | """ 402 | ixs = list(self._ixs(*A,**K)) 403 | 404 | if len(ixs) == 0: 405 | raise ValueError('No matching items') 406 | 407 | items = [] 408 | 409 | for ix in ixs[:]: # Must remove it from everything. 410 | # not sure what is happening, but it seems that I need to make a copy 411 | # since Python is doing something strange here... 412 | 413 | item = self._list[ix] 414 | item = self._convert2dict(item) 415 | 416 | for attrib in self.attributes: 417 | value = item[attrib] 418 | self._remove(attrib,value,ix) 419 | 420 | # Remove it from the list by setting to None. Do not reshuffle 421 | # the indices. 
A None check will be performed elsewhere 422 | self._list[ix] = None 423 | self._ix.difference_update([ix]) 424 | self.N -= 1 425 | 426 | 427 | @property 428 | def Qobj(self): 429 | """ 430 | Query object already loaded with the DB 431 | 432 | DB.Qobj <==> DB.Q <==> Qobj(DB) 433 | """ 434 | return Qobj(self) 435 | Q = Qobj 436 | 437 | def _convert2dict(self,obj): 438 | """ 439 | Convert objects to a regular dictionary for the sake of indexing 440 | 441 | Also return Qobjs untouched since they may be used in queries too 442 | 443 | If it not an Qobj or dict, it will try to get it's __dict__ and if 444 | that doesn't work, will just return it 445 | """ 446 | if isinstance(obj,Qobj): 447 | return obj 448 | 449 | if isinstance(obj,dict): # Also accounts for OrderedDicts or ... 450 | return obj # ... anything that inherits dict 451 | 452 | if self.indexObjects and hasattr(obj,'__dict__'): 453 | return obj.__dict__ 454 | 455 | return obj 456 | 457 | 458 | def _ixs(self,*args,**kwords): 459 | """ 460 | Get the inde(x/ies) of matching information 461 | """ 462 | if not hasattr(self,'_lookup') or self.N==0: # It may be empty 463 | return [] 464 | 465 | # Make the entire kwords be lists with default of []. Edge case of 466 | # multiple items 467 | for key,val in kwords.items(): 468 | if not isinstance(val,list): 469 | kwords[key] = [val] 470 | kwords = defaultdict(list,kwords) 471 | 472 | Q = Qobj(self) # Empty object 473 | for arg in args: 474 | arg = self._convert2dict(arg) # handle other object types 475 | if isinstance(arg,Qobj): 476 | Q = Q & arg # Will add these conditions. If Q is empty, will just be arg 477 | continue 478 | if isinstance(arg,dict): 479 | for key,val in arg.items(): # Add it rather than update in case it is already specified 480 | kwords[key].append(val) 481 | else: 482 | raise ValueError('unrecognized input of type {:s}'.format(str(type(arg)))) 483 | 484 | # Construct a query for kwords 485 | for key,value in kwords.items(): 486 | if isinstance(value,list) and len(value) == 0: 487 | value = [self._empty] 488 | for val in _makelist(value): 489 | Qtmp = Qobj(self) 490 | Qtmp._attr = key 491 | Q = Q & (Qtmp == val) 492 | 493 | ixs = Q._ixs 494 | # Ensure one match 495 | if ixs is None: 496 | ixs = [] 497 | return list(ixs) 498 | 499 | 500 | def _index(self,ix): 501 | """ 502 | Return ix if it hasn't been deleted 503 | """ 504 | try: 505 | item = self._list[ix] 506 | except IndexError: 507 | return [] 508 | 509 | if item is None: 510 | return [] 511 | 512 | return [ix] 513 | 514 | def _append(self,attrib,value,ix): 515 | """ 516 | Add to the lookup and update the modify time 517 | """ 518 | # Final check but we should be guarded from this 519 | if attrib in self.exclude_attributes: 520 | print('BAD! Should guard against this in public methods!') 521 | raise ValueError('Cannot reindex an excluded attribute') 522 | 523 | valueL = _makelist(value) 524 | for val in valueL: 525 | self._lookup[attrib][val].append(ix) 526 | if len(valueL) == 0: 527 | self._lookup[attrib][self._empty].append(ix) # empty list 528 | self._time = time.time() 529 | 530 | def _remove(self,attrib,value,ix): 531 | """ 532 | Remove from the lookup and update the modify time 533 | """ 534 | valueL = _makelist(value) 535 | for val in valueL: 536 | try: 537 | self._lookup[attrib][val].remove(ix) 538 | except ValueError: 539 | raise ValueError('Item not found in internal lookup. 
May need to first call reindex()') 540 | if len(valueL) == 0: 541 | self._lookup[attrib][self._empty].remove(ix) # empty list 542 | 543 | self._time = time.time() 544 | 545 | def __contains__(self,check_diff): 546 | check_diff = self._convert2dict(check_diff) 547 | if not ( isinstance(check_diff,dict) or isinstance(check_diff,Qobj)): 548 | raise ValueError('Python `in` queries should be a of {attribute:value} or Qobj') 549 | return self.isin(check_diff) 550 | 551 | def __len__(self): 552 | return self.N 553 | 554 | def __getitem__(self,item): 555 | item = self._convert2dict(item) 556 | if isinstance(item,dict) or isinstance(item,Qobj): 557 | return self.query_one(item) 558 | elif isinstance(item,int): # numbered item 559 | if self._list[item] is not None: 560 | return self._list[item] 561 | else: 562 | raise ValueError("Index has been deleted") 563 | else: 564 | raise ValueError("Must specify DB[{'attribute':val}] or DB[index]'") 565 | __call__ = query 566 | 567 | def __iter__(self): 568 | return (item for item in self._list if item is not None) 569 | items = __iter__ 570 | 571 | def _makelist(input): 572 | if isinstance(input,list): 573 | return input 574 | return [input] 575 | 576 | class _emptyList(object): 577 | def __init__(self): 578 | pass 579 | def __hash__(self): 580 | return 9999999999999 581 | def __eq__(self,other): 582 | return isinstance(other,list) and len(other)==0 583 | 584 | class Qobj(object): 585 | """ 586 | Query objects. This works by returning an updated *copy* of the object 587 | whenever it is acted upon 588 | 589 | Calling 590 | * Q.attribute sets attribute and returns a copy 591 | * Q.attribute == val (or any other comparison) set the index of elements 592 | * Q1 & Q1 or other boolean perform set operations 593 | 594 | Useful Methods: 595 | _filter : (or just `filter` if not an attribute): Apply a filter 596 | to the DB 597 | """ 598 | def __init__(self,DB,ixs=None,attr=None): 599 | self._DB = DB 600 | self._ixs = ixs 601 | self._attr = attr 602 | 603 | self._time = time.time() 604 | 605 | 606 | def _valid(self): 607 | if self._time < self._DB._time: 608 | raise ValueError('This query object is out of date from the DB. 
Create a new one') 609 | 610 | def _filter(self,filter_func): 611 | """ 612 | 613 | If 'filter' is NOT an attribute of the DB, this can be called 614 | with 'filter' instead of '_filter' 615 | 616 | Apply a filter to the data that returns True if it matches and False 617 | otherwise 618 | 619 | Note that filters are O(N) 620 | """ 621 | self._valid() # Actually, these would still work but still check 622 | ixs = set() 623 | for ix,item in enumerate(self._DB._list): # loop all 624 | item = self._DB._convert2dict(item) 625 | if item is None: 626 | continue 627 | if filter_func(item): 628 | ixs.add(ix) 629 | self._ixs = ixs 630 | return self.copy() 631 | 632 | 633 | # Comparisons 634 | def __eq__(self,value): 635 | self._valid() 636 | 637 | if self._DB.N == 0: 638 | self._ixs = set() 639 | return self.copy() 640 | 641 | first_set = True 642 | for val in _makelist(value): # Account for list inputs 643 | if self._attr == '_index': 644 | if first_set: 645 | ixs = set(self._DB._index(val)) 646 | first_set = False 647 | else: 648 | ixs = ixs.intersection(self._DB._index(val)) 649 | continue 650 | if self._attr not in self._DB.attributes: 651 | raise KeyError("'{:s}' is not an attribute".format(self._attr)) 652 | 653 | ixs_at = self._DB._lookup[self._attr][val] 654 | if first_set: 655 | ixs = set(ixs_at) 656 | first_set = False 657 | else: 658 | ixs = ixs.intersection(ixs_at) 659 | 660 | self._ixs = ixs 661 | return self.copy() 662 | 663 | def __ne__(self,value): 664 | self._ixs = self._DB._ix - (self == value)._ixs 665 | return self.copy() 666 | 667 | def __lt__(self,value): 668 | self._valid() # Actually, these would still work but still check 669 | ixs = set() 670 | for ix,item in enumerate(self._DB._list): # loop all 671 | item = self._DB._convert2dict(item) 672 | if item is None: 673 | continue 674 | for ival in _makelist(item[self._attr]): 675 | if ival < value: 676 | ixs.add(ix) 677 | self._ixs = ixs 678 | return self.copy() 679 | 680 | def __le__(self,value): 681 | self._valid() # Actually, these would still work but still check 682 | ixs = set() 683 | for ix,item in enumerate(self._DB._list): # loop all 684 | item = self._DB._convert2dict(item) 685 | if item is None: 686 | continue 687 | if item[self._attr] <= value: 688 | ixs.add(ix) 689 | self._ixs = ixs 690 | return self.copy() 691 | 692 | def __gt__(self,value): 693 | self._valid() # Actually, these would still work but still check 694 | ixs = set() 695 | for ix,item in enumerate(self._DB._list): # loop all 696 | item = self._DB._convert2dict(item) 697 | if item is None: 698 | continue 699 | if item[self._attr] > value: 700 | ixs.add(ix) 701 | self._ixs = ixs 702 | return self.copy() 703 | 704 | def __ge__(self,value): 705 | self._valid() # Actually, these would still work but still check 706 | ixs = set() 707 | for ix,item in enumerate(self._DB._list): # loop all 708 | item = self._DB._convert2dict(item) 709 | if item is None: 710 | continue 711 | if item[self._attr] >= value: 712 | ixs.add(ix) 713 | self._ixs = ixs 714 | return self.copy() 715 | 716 | # Logic 717 | def __and__(self,Q2): 718 | if self._ixs == None: # An empty object and another will just return other 719 | return Q2 720 | self._ixs.intersection_update(Q2._ixs) 721 | return self.copy() 722 | def __or__(self,Q2): 723 | self._ixs.update(Q2._ixs) 724 | return self.copy() 725 | def __invert__(self): 726 | self._ixs = self._DB._ix - self._ixs 727 | return self.copy() 728 | 729 | def __getattr__(self,attr): 730 | if attr == 'filter' and 'filter' not in self._DB.attributes: 
731 | return self._filter 732 | self._attr = attr 733 | return self.copy() 734 | 735 | def copy(self): 736 | new = Qobj(self._DB,ixs=self._ixs,attr=self._attr) 737 | # Reset the time 738 | new._time = self._time 739 | return new 740 | 741 | 742 | 743 | 744 | 745 | 746 | 747 | 748 | 749 | 750 | 751 | 752 | 753 | 754 | 755 | 756 | 757 | 758 | 759 | 760 | 761 | 762 | -------------------------------------------------------------------------------- /PyFiSync/remote_interfaces.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Remote interfaces 5 | 6 | For now, it is *just* SSH + rsync to work within the subprocess on Unix/Linux 7 | (macOS is Unix). It is separated to more easily make other backends 8 | 9 | An interface must have the following methods and behaviors from 10 | 'remote_interface_base'. Note that optional methods have a pass but the 11 | required ones will raise a NotImplementedError 12 | """ 13 | from __future__ import division, print_function, unicode_literals 14 | 15 | import subprocess 16 | import re 17 | import sys 18 | import os 19 | import random 20 | import string 21 | import json 22 | import zlib 23 | import shlex 24 | import tempfile 25 | import datetime 26 | 27 | from io import open 28 | 29 | if sys.version_info[0] > 2: 30 | xrange = range 31 | unicode = str 32 | 33 | from . import utils 34 | 35 | REMOTES = ['rsync','rclone'] 36 | 37 | class remote_interface_base(object): 38 | def __init__(self,config,log=None): 39 | """ 40 | * Just pass it the configuration file 41 | * Optionally, pass it the log object to modify 42 | """ 43 | raise NotImplementedError() 44 | def file_list(self,attributes,empty): 45 | """ 46 | * Attributes are a list of requested attributes but generally, more 47 | should be returned in case attributes change. 48 | * follow the `empty` settings of PFSwalk -- if applicable 49 | 'store': stores a list of empty directories 50 | 'remove': Deletes all empty directories if (and only if) they 51 | were *not* empty before. Also removes stored list 52 | 'reset': Removes stored list 53 | """ 54 | raise NotImplementedError() 55 | 56 | def apply_queue(self,queue,force): 57 | """ 58 | * queue is the action queue that takes the following form 59 | * {'backup':[file_path]} # Make a copy to the backup 60 | * {'move': [src,dest]} # Move the file 61 | * {'delete': [file_path]} # Move the file into the backup. Essentially a backup 62 | * Force tells it to allow a file to be moved into another 63 | 64 | Notes: 65 | * overwriting moves have already been removed 66 | * Delete should backup first if set config.backup == True 67 | * Backup should NOT happen if config.backup == False 68 | """ 69 | raise NotImplementedError() 70 | 71 | def transfer(self,tqA2B,tqB2A): 72 | """ 73 | * Apply the trasnfer from B to A and from A to B 74 | * MUST maintain modification times upon transfer 75 | """ 76 | raise NotImplementedError() 77 | 78 | def close(self): 79 | """ 80 | * If it has this function, it will try to call it at the very end 81 | """ 82 | pass 83 | 84 | @staticmethod 85 | def cli(argv): 86 | """ 87 | should be decorated with @staticmethod 88 | All of the commands will be passed. 
Can use this to communicate remotely 89 | if needed 90 | 91 | For example 92 | ./PyFiSync.py _api file_list --flag1 val1 --flag2 93 | 94 | will pass argv = ['file_list', '--flag1', 'val1', '--flag2'] 95 | """ 96 | pass 97 | 98 | class ssh_rsync(remote_interface_base): 99 | def __init__(self,config,log=None): 100 | self.config = config 101 | if log is None: 102 | log = utils.logger(silent=False,path=None) 103 | self.log = log 104 | self._debug = getattr(config,'_debug',False) 105 | 106 | if config.persistant: 107 | # Set up master connection for 600 seconds 108 | self.sm = '-S /tmp/' + _randstr(5) 109 | cmd = 'ssh -N -M {sm:s} -p {ssh_port:d} -q {userhost:s}'.\ 110 | format(sm=self.sm,**config.__dict__) 111 | 112 | self.persistant_proc = subprocess.Popen(shlex.split(cmd)) 113 | 114 | else: 115 | self.sm = '' # Do nothings 116 | 117 | def file_list(self,attributes,empty=None): 118 | """ 119 | Get the file list in B (remote) 120 | """ 121 | attributes = list(set(attributes)) 122 | config = self.config 123 | log = self.log 124 | 125 | # Construct the command 126 | cmd = 'ssh {sm} -p {ssh_port:d} -q {userhost:s} "'.format(sm=self.sm,**config.__dict__) 127 | 128 | if hasattr(config,'PyFiSync_path') and hasattr(config,'remote_program'): 129 | log.add("DEPRECATION WARNING: 'PyFiSync_path' and 'remote_program' are deprecated. Use 'remote_exe'") 130 | # construct the call cmd 131 | if len(config.PyFiSync_path) == 0: 132 | cmd += 'PyFiSync _api file_list"' 133 | else: 134 | cmd += config.remote_program + ' ' 135 | if any(config.PyFiSync_path.endswith('PyFiSync'+ext) for ext in ['','.py']): 136 | cmd += config.PyFiSync_path + ' _api file_list"' 137 | else: 138 | cmd += os.path.join(config.PyFiSync_path,'PyFiSync.py _api file_list"') 139 | else: 140 | cmd += '{} _api file_list"'.format(config.remote_exe) 141 | 142 | remote_config = dict() 143 | 144 | remote_config['path'] = config.pathB 145 | remote_config['excludes'] = list(set(config.excludes)) 146 | remote_config['empty'] = empty 147 | remote_config['attributes'] = list(set(attributes)) 148 | remote_config['copy_symlinks_as_links'] = config.copy_symlinks_as_links 149 | remote_config['use_hash_db'] = config.use_hash_db 150 | 151 | log.add('Calling for remote file list') 152 | 153 | # Encode the config. 
Just in case there is any additional cruft, add 154 | # a starting sentinel 155 | sentinel = _randstr(N=10).encode('ascii') 156 | cmd = shlex.split(cmd) 157 | cmd[-1] += ' ' + sentinel.decode('ascii') # Add the sentinel to the final command 158 | 159 | json_config = sentinel+json.dumps(remote_config,ensure_ascii=False).encode('utf8') 160 | 161 | # Use a tempfile to prevent a buffering issue 162 | outfile = tempfile.NamedTemporaryFile(mode='wb',delete=False) 163 | 164 | proc = subprocess.Popen(cmd,stdin=subprocess.PIPE, 165 | stdout=outfile, 166 | stderr=subprocess.PIPE, 167 | shell=False) 168 | _,err = proc.communicate(json_config) 169 | 170 | if len(err)>0: 171 | err = utils.to_unicode(err) 172 | log.add('Remote Call returned warnings:') 173 | log.space = 4 174 | log.add(err) 175 | log.space = 0 176 | 177 | # Read back the output, find the sentinel, decompress and return the output 178 | with open(outfile.name,'rb') as F: 179 | out = F.read() 180 | out = out[out.find(sentinel) + len(sentinel):] 181 | 182 | try: 183 | out = zlib.decompress(out) 184 | except: 185 | return 186 | 187 | return json.loads(out) 188 | 189 | def apply_queue(self,queue,force=False): 190 | """ 191 | Remote call to apply queue assumeing B is remote 192 | """ 193 | log = self.log 194 | config = self.config 195 | 196 | if len(queue) == 0: 197 | log.add(' >> No remote actions <<') 198 | return 199 | 200 | sentinel = _randstr(N=10).encode('ascii') 201 | 202 | queue_bytes = json.dumps(queue,ensure_ascii=False).encode('utf8') 203 | 204 | # Construct the command 205 | cmd = 'ssh {sm} -p {ssh_port:d} -q {userhost:s} "'.format( 206 | sm=self.sm,**config.__dict__) 207 | 208 | # construct the call cmd 209 | if hasattr(config,'PyFiSync_path') and hasattr(config,'remote_program'): 210 | log.add("DEPRECATION WARNING: 'PyFiSync_path' and 'remote_program' are deprecated. 
Use 'remote_exe'") 211 | if len(config.PyFiSync_path) == 0: 212 | cmd += 'PyFiSync _api apply_queue' 213 | else: 214 | cmd += config.remote_program + ' ' 215 | if any(config.PyFiSync_path.endswith('PyFiSync'+ext) for ext in ['','.py']): 216 | cmd += config.PyFiSync_path + ' _api apply_queue' 217 | else: 218 | cmd += os.path.join(config.PyFiSync_path,'PyFiSync.py _api apply_queue') 219 | else: 220 | cmd += '{} _api apply_queue'.format(config.remote_exe) 221 | 222 | if force: 223 | cmd += ' --force ' 224 | 225 | if not config.backup: 226 | cmd += ' --no-backup ' 227 | 228 | cmd += ' ' + config.pathB + ' {}"'.format(sentinel.decode('ascii')) 229 | 230 | out = '' 231 | err = '' 232 | 233 | log.space=0 234 | log.add('\nApplying queue on remote') 235 | log.prepend = '> ' 236 | 237 | started = False 238 | cmd = shlex.split(cmd) 239 | proc = subprocess.Popen(cmd,stdin=subprocess.PIPE, 240 | stdout=subprocess.PIPE, 241 | stderr=subprocess.PIPE, 242 | shell=False) 243 | 244 | proc.stdin.write(sentinel + queue_bytes) 245 | proc.stdin.close() 246 | 247 | with proc.stdout as stdout: 248 | for line in iter(stdout.readline, b''): 249 | line = utils.to_unicode(line) 250 | if not started and line.find('START>>>>>>>')>=0: 251 | started = True 252 | continue 253 | 254 | if line.find('<<<<<<=0: 255 | started = False 256 | 257 | if started: 258 | log.add(line.rstrip()) 259 | 260 | 261 | with proc.stderr as stderr: 262 | for line in iter(stderr.readline, b''): 263 | err += utils.to_unicode(line) 264 | proc.wait() 265 | log.prepend = '' 266 | if len(err)>0: 267 | log.add('Remote Call returned warnings:') 268 | log.space = 4 269 | log.add(err) 270 | 271 | def transfer(self,tqA2B,tqB2A): 272 | config = self.config 273 | log = self.log 274 | 275 | pwd0 = os.getcwd() 276 | os.chdir(config.pathA) 277 | 278 | # Build the command 279 | cmd = 'rsync -azvi -hh ' \ 280 | + '--keep-dirlinks --copy-dirlinks ' # make directory links behave like they were folders 281 | 282 | if not config.copy_symlinks_as_links: 283 | cmd += '--copy-links ' 284 | 285 | if len(config.userhost) >0: 286 | cmd += '-e "ssh -q -p {p:d} {sm}" '.format(p=config.ssh_port,sm=self.sm) 287 | B = '{userhost:s}:{pathB:s}'.format(**config.__dict__) 288 | else: 289 | B = '{pathB:s}'.format(**config.__dict__) 290 | 291 | cmd += ' --files-from={files:s} {src:s}/ {dest:s}/' 292 | 293 | log.add('(using rsync)') 294 | 295 | if len(tqA2B) > 0: 296 | 297 | # A2B 298 | tmp_file = '/tmp/tqA2B' + _randstr() 299 | 300 | for ix,item in enumerate(tqA2B): # Opperate on the list IN PLACE 301 | item = item.encode('utf-8') 302 | tqA2B[ix] = item 303 | 304 | with open(tmp_file,'wb') as F: 305 | F.write('\n'.encode('utf-8').join(tqA2B)) 306 | 307 | cmdA2B = cmd.format(files=tmp_file,src=config.pathA,dest=B) 308 | 309 | log.space=1 310 | log.add('Running rsync A >>> B') 311 | log.add(' cmd = ' + cmdA2B) 312 | log.space=4 313 | 314 | 315 | proc = subprocess.Popen(cmdA2B, stdout=subprocess.PIPE,shell=True) 316 | with proc.stdout: 317 | for line in iter(proc.stdout.readline, b''): 318 | line = self._proc_final_log(line) 319 | log.add(line) 320 | 321 | proc.wait() 322 | else: 323 | log.space=1 324 | log.add('\nNo A >>> B transfers') 325 | 326 | ######### 327 | 328 | if len(tqB2A) > 0: 329 | # B2A 330 | tmp_file = '/tmp/tqB2A' + _randstr() 331 | for ix,item in enumerate(tqB2A): # Opperate on the list IN PLACE 332 | item = item.encode('utf-8') 333 | tqB2A[ix] = item 334 | 335 | with open(tmp_file,'wb') as F: 336 | F.write('\n'.encode('utf-8').join(tqB2A)) 337 | 338 | cmdB2A = 
cmd.format(files=tmp_file,dest=config.pathA,src=B) 339 | 340 | log.space=1 341 | log.add('\nRunning rsync A <<< B') 342 | log.add(' cmd = ' + cmdB2A) 343 | log.space=4 344 | 345 | proc = subprocess.Popen(cmdB2A, stdout=subprocess.PIPE,shell=True) 346 | with proc.stdout: 347 | for line in iter(proc.stdout.readline, b''): 348 | line = self._proc_final_log(line) 349 | log.add(line) 350 | 351 | proc.wait() 352 | else: 353 | log.space=1 354 | log.add('\nNo A <<< B transfers') 355 | 356 | os.chdir(pwd0) 357 | 358 | 359 | def _proc_final_log(self,line): 360 | line = line.strip() 361 | if len(line) == 0: return None 362 | try: 363 | line = utils.to_unicode(line) 364 | except: 365 | return None 366 | try: 367 | action_path = [i.strip() for i in line.split(' ',1)] 368 | except UnicodeDecodeError: # A bit of a hack but this works to make py2 happy 369 | action_path = [utils.to_unicode(a) for a in line.decode('utf8').split(' ')] 370 | 371 | if len(action_path) != 2: 372 | return 'could not parse action: {:s}'.format(line) 373 | 374 | action = action_path[0] 375 | path = action_path[1] 376 | 377 | action = action.replace('<','>') 378 | 379 | if action.startswith('sent'): 380 | return '\n' + line 381 | if action.startswith('total'): 382 | return line 383 | 384 | if any([action.startswith(d) for d in ['receiving','building']]): 385 | return None 386 | 387 | if action.startswith('>'): return 'Transfer ' + path 388 | if action.startswith('cd'): return 'mkdir ' + path 389 | if action.startswith('cL'): return 'link ' + path 390 | if action.startswith('.'): return None 391 | 392 | return line 393 | 394 | @staticmethod 395 | def cli(argv): 396 | from . import PFSwalk 397 | from . import main 398 | 399 | mode = argv[0] 400 | argv = argv[1:] 401 | if mode == 'file_list': 402 | # Get the sentinel 403 | sentinel = argv[0].encode('ascii') 404 | 405 | # For python3 to read bytes 406 | stdin = sys.stdin 407 | if hasattr(stdin,'buffer'): 408 | stdin = stdin .buffer 409 | stdout = sys.stdout 410 | if hasattr(stdout,'buffer'): 411 | stdout = stdout.buffer 412 | 413 | # Read the config, find and cut up to the sentinel, convert to 414 | # unicode and json load 415 | 416 | 417 | remote_config_bytes = stdin.read() 418 | remote_config_bytes = remote_config_bytes[remote_config_bytes.find(sentinel)+len(sentinel):] 419 | remote_config_bytes = remote_config_bytes.decode('utf8') 420 | remote_config = json.loads(remote_config_bytes) 421 | 422 | # Process the input 423 | path = remote_config['path'] 424 | config = utils.configparser() 425 | config.pathA = path 426 | 427 | empty = remote_config['empty'] 428 | config.copy_symlinks_as_links = remote_config['copy_symlinks_as_links'] 429 | config.excludes = list(set(remote_config['excludes'])) # do *not* use default excludes 430 | config.use_hash_db = remote_config['use_hash_db'] 431 | 432 | # Generate the list. 
This may raise errors so do not start 433 | # capture until later 434 | log = utils.logger(silent=True,path=None) 435 | _tmp = PFSwalk.file_list(path,config,log, 436 | attributes=remote_config['attributes'], 437 | empty=empty, 438 | use_hash_db=config.use_hash_db) 439 | flist = _tmp.files() 440 | 441 | out = json.dumps(flist,ensure_ascii=False) 442 | out = zlib.compress(out.encode('utf8'),9) # Compress it 443 | 444 | stdout.write(sentinel + out) # write the bytes 445 | 446 | elif mode == 'apply_queue': 447 | import getopt # Even though it is "old school" use getopt here 448 | # since it is easier and this interface is never 449 | # exposed to the user 450 | # For python3 to read bytes 451 | stdin = sys.stdin 452 | if hasattr(stdin,'buffer'): 453 | stdin = stdin.buffer 454 | stdout = sys.stdout 455 | if hasattr(stdout,'buffer'): 456 | stdout = stdout.buffer 457 | 458 | try: 459 | opts, args = getopt.getopt(argv, "",['force','no-backup']) 460 | except getopt.GetoptError as err: 461 | print(str(err)) #print error 462 | sys.exit(2) 463 | 464 | path,sentinel = args 465 | 466 | config = utils.configparser() 467 | config.pathA = path 468 | 469 | # Place the config into PyFiSync 470 | main.config = config 471 | 472 | force = False 473 | for opt,val in opts: 474 | if opt == '--force': 475 | force = True 476 | if opt == '--no-backup': 477 | config.backup = False 478 | 479 | log = utils.logger(path=path,silent=False) 480 | 481 | sys.stdout.write('START>>>>>>>\n') 482 | 483 | # Get the queue from stdin 484 | sentinel = sentinel.encode('ascii') 485 | queue = stdin.read() 486 | queue = queue[queue.find(sentinel)+len(sentinel):] 487 | queue = queue.decode('utf8') 488 | 489 | try: 490 | queue = json.loads(queue) 491 | except Exception as E: 492 | sys.stderr.write('could not parse input. 
Error: "{}"'.format(E)) 493 | sys.exit(2) 494 | 495 | print('Successfully loading action queue of {:d} items'.format(len(queue))) 496 | 497 | main.apply_action_queue(path,queue) 498 | 499 | sys.stdout.write('\n<<<<<< 0: 653 | 654 | # A2B 655 | tmp_file = '/tmp/tqA2B' + _randstr() 656 | 657 | with open(tmp_file,'wt') as file: 658 | file.write('\n'.join('/' + t for t in tqA2B)) # Must start with / to be full path for root 659 | 660 | newargs = args[:] 661 | newargs.extend(['-v','--stats-one-line']) 662 | newargs.extend(['copy','--files-from','{}'.format(tmp_file)]) 663 | newargs.extend([config.pathA,config.pathB]) 664 | 665 | log.space=1 666 | log.add('Running rclone A >>> B') 667 | log.space = 4 668 | out = self.call(newargs,echo=True) 669 | 670 | 671 | else: 672 | log.space=1 673 | log.add('\nNo A >>> B transfers') 674 | 675 | log.add('') 676 | 677 | if len(tqB2A) > 0: 678 | 679 | # B2A 680 | tmp_file = '/tmp/tqB2A' + _randstr() 681 | 682 | with open(tmp_file,'wt') as file: 683 | file.write('\n'.join('/' + t for t in tqB2A)) # Must start with / to be full path for root 684 | 685 | newargs = args[:] 686 | newargs.extend(['-v','--stats-one-line']) 687 | newargs.extend(['copy','--files-from','{}'.format(tmp_file)]) 688 | newargs.extend([config.pathB,config.pathA]) 689 | 690 | log.space=1 691 | log.add('Running rclone A <<< B ') 692 | log.space = 4 693 | out = self.call(newargs,echo=True) 694 | #log.add(out) 695 | 696 | else: 697 | log.space=1 698 | log.add('\nNo A <<< B transfers') 699 | 700 | 701 | def call(self,args,echo=False): 702 | """ 703 | Call rclone with the appropriate flags already set 704 | """ 705 | if isinstance(args,(str,unicode)): 706 | args = shlex.split(args) 707 | args = list(args) 708 | env = dict(os.environ) 709 | if self.config.rclone_pw: 710 | args.append('--ask-password=false') 711 | env['RCLONE_CONFIG_PASS'] = self.config.rclone_pw 712 | 713 | cmd = list() 714 | cmd.append(self.config.rclone_executable) 715 | cmd.extend(self.flags) 716 | cmd.extend(args) 717 | 718 | # Use two different methods depending on whether we need to stream 719 | # the result. 
This is to hopefully prevent issues with large 720 | # buffered responses 721 | if self._debug: 722 | txt = ['DEBUG MODE',''] 723 | txt.append('rclone call') 724 | txt.append(' '.join(cmd)) 725 | txt.append(' ') 726 | self.log.add_err('\n'.join(txt)) 727 | 728 | if echo: 729 | stdout = subprocess.PIPE 730 | self.log.add('rclone\n $ ' + ' '.join(cmd) + '\n') 731 | else: 732 | stdout = tempfile.NamedTemporaryFile(mode='wb',delete=False) 733 | 734 | proc = subprocess.Popen(cmd, 735 | stdout=stdout, 736 | stderr=subprocess.STDOUT if not self._debug else subprocess.PIPE, 737 | shell=False, 738 | env=env, 739 | cwd=self.config.pathA) 740 | if echo: 741 | out = [] 742 | with proc.stdout: 743 | for line in iter(proc.stdout.readline, b''): 744 | line = utils.to_unicode(line) 745 | self.log.add(line.rstrip()) 746 | out.append(line) 747 | if self._debug: 748 | err = proc.stderr.read() 749 | else: 750 | _,err = proc.communicate() # Since we are not streaming the output 751 | with open(stdout.name,'rb') as F: 752 | out = utils.to_unicode(F.read()) 753 | proc.wait() 754 | if proc.returncode >0: 755 | self.log.add_err('rclone returned a non-zero exit code') 756 | 757 | if self._debug: 758 | txt = [] 759 | txt.append('OUT:') 760 | txt.append(''.join(out)) 761 | txt.append('ERR:') 762 | txt.append(utils.to_unicode(err)) 763 | txt = '\n'.join(txt) 764 | txt = [''] + txt.split('\n') 765 | txt = '\nDEBUG: '.join(txt) 766 | self.log.add_err(txt) 767 | 768 | 769 | return ''.join(out) 770 | 771 | 772 | def get_remote_interface(config=None,name=None): 773 | if config is None == name is None: 774 | raise ValueError('Must specify config OR name') 775 | 776 | if config is not None: 777 | name = config.remote 778 | 779 | if name == 'rsync': 780 | if len(getattr(config,'userhost','')) == 0: 781 | return None 782 | return ssh_rsync 783 | elif name == 'rclone': 784 | return Rclone 785 | else: 786 | raise ValueError() 787 | 788 | 789 | def _randstr(N=10): 790 | random.seed() 791 | return ''.join(random.choice('abcdefghijklmnopqrstuvwxyz') for _ in xrange(N)) 792 | 793 | 794 | 795 | 796 | 797 | 798 | --------------------------------------------------------------------------------