├── tests
    ├── __init__.py
    ├── readme.md
    ├── run_test.sh
    ├── test_pushpull.py
    ├── test_file_tracking.py
    └── testutils.py
├── PyFiSync
    ├── __init__.py
    ├── dry_run.py
    ├── config_template.py
    ├── PFSwalk.py
    ├── utils.py
    ├── dicttable.py
    ├── ldtable.py
    └── remote_interfaces.py
├── PyFiSync.py
├── setup.py
├── license
├── rsync.md
├── changelog.md
├── rclone.md
├── FAQs.md
├── rclone_b2.md
└── README.md
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/PyFiSync/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 | sys.dont_write_bytecode = True
5 | 
6 | from .main import __version__,__author__
7 | from .main import cli
8 | 
--------------------------------------------------------------------------------
/PyFiSync.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | from __future__ import division, print_function, unicode_literals, absolute_import
4 | 
5 | import sys
6 | sys.dont_write_bytecode = True
7 | 
8 | from PyFiSync import cli
9 | 
10 | if __name__ == '__main__':
11 |     argv = sys.argv[1:] # Arguments besides the function name
12 |     cli(argv)
13 | 
--------------------------------------------------------------------------------
/tests/readme.md:
--------------------------------------------------------------------------------
1 | # Tests
2 | 
3 | These tests are driven by `py.test`. They are not all formal unit tests but rather test the overall behavior.
4 | 
5 | Note that the tests use the default configuration. If that is changed, tests should be carefully updated!
6 | 
7 | ## Failed Tests
8 | 
9 | When running `test_pre_post_bash` with coverage and python3, it throws errors. The tests work fine but not with coverage!
10 | 
--------------------------------------------------------------------------------
/tests/run_test.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Test local being python2 and python3. Inside the tests, it tests:
3 | # local --> local
4 | # local --> remote py2
5 | # local --> remote py3
6 | 
7 | # pip install pytest-cov
8 | 
9 | # Assuming you have py.test installed for both python2 and 3
10 | P0=$(pwd)
11 | cd "$(dirname "$0")"
12 | 
13 | p2dir=$(dirname "$(command which python2)")
14 | ${p2dir}/py.test --cov=PyFiSync --cov-report html test_*.py
15 | 
16 | p3dir=$(dirname "$(command which python3)")
17 | ${p3dir}/py.test --cov=PyFiSync --cov-report html test_*.py
18 | 
19 | cd "$P0"
20 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | import PyFiSync
5 | 
6 | from setuptools import setup
7 | 
8 | setup(
9 |     name='PyFiSync',
10 |     packages=['PyFiSync'],
11 |     long_description=open('README.md').read(),
12 |     entry_points = {
13 |         'console_scripts': ['PyFiSync=PyFiSync.main:cli'],
14 |     },
15 |     version=PyFiSync.__version__,
16 |     description='Python based intelligent file sync with automatic backups and file move/delete tracking.',
17 |     url='https://github.com/Jwink3101/PyFiSync',
18 |     author=PyFiSync.__author__,
19 |     author_email='Jwink3101@users.noreply.github.com',
20 |     license='MIT',
21 | )
22 | 
23 | 
--------------------------------------------------------------------------------
/license:
--------------------------------------------------------------------------------
1 | Copyright 2019 Justin Winokur
2 | 
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4 | 
5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6 | 
7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
8 | 
--------------------------------------------------------------------------------
/rsync.md:
--------------------------------------------------------------------------------
1 | # Rsync Setup
2 | 
3 | First set up ssh keys on your *local* machine:
4 | 
5 |     $ cd
6 |     $ ssh-keygen -t rsa
7 | 
8 |     # It is highly suggested you use a password but you can hit enter
9 |     # twice to skip it
10 | 
11 |     $ cat ~/.ssh/id_rsa.pub | ssh user@remote-system "mkdir -p ~/.ssh && cat >> ~/.ssh/authorized_keys"
12 | 
13 | I will assume that `PyFiSync` has been installed on **both** machines. See [the FAQs](FAQs.md) if there is an issue with paths on the remote machines. It is likely due to `.bashrc` not being loaded properly.
14 | 
15 | Then create the config file (with `PyFiSync init`) and modify it. All options are commented.
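For the ssh+rsync remote, the key settings look something like the following (a minimal sketch; the values are illustrative, but the option names are the ones from the generated config template):

```python
nameB = 'machineB'                # name of the remote machine
pathB = '/full/path/to/sync/dir'  # full path to the sync directory on B
userhost = 'user@remote-system'   # leave as '' if "B" is just a local path
ssh_port = 22
remote_exe = 'PyFiSync'           # or, e.g., '/path/to/python /path/to/PyFiSync.py'
```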
16 | 
17 |     $ PyFiSync reset --force path/to/sync_dir
18 | 
19 | Then sync.
20 | 
21 |     $ PyFiSync sync path/to/syncdir
22 | 
23 | Essentially, this will create a union of the two sides.
24 | 
25 | (The `--all` flag is optional but suggested for the first sync. If using `--all`, it is *highly* suggested to add `--no-backup` since everything would be copied.)
26 | 
27 | Or (`PyFiSync` assumes a `sync .` if not given other options):
28 | 
29 |     $ cd path/to/syncdir
30 |     $ PyFiSync
31 | 
--------------------------------------------------------------------------------
/PyFiSync/dry_run.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | Dry Run equiv
4 | """
5 | from __future__ import division, print_function, unicode_literals, absolute_import
6 | 
7 | from . import utils
8 | 
9 | import sys
10 | if sys.version_info >= (3,):
11 |     unicode = str
12 |     xrange = range
13 | 
14 | def apply_action_queue(queue,log,name,config):
15 |     log.add('\n(DRY-RUN) Applying queue on {}'.format(name))
16 |     for action_dict in queue:
17 |         action,path = list(action_dict.items())[0]
18 |         if action == 'move':
19 |             log.add('(DRY-RUN) move: ' + utils.move_txt(path[0],path[1]))
20 |         elif action in ['backup','delete']:
21 |             if action == 'backup' and config.backup:
22 |                 log.add('(DRY-RUN) backup: ' + path)
23 |             elif action=='delete' and config.backup:
24 |                 log.add('(DRY-RUN) delete (w/ backup): ' + path)
25 |             elif action=='delete' and not config.backup:
26 |                 log.add('(DRY-RUN) delete (w/o backup): ' + path)
27 |         else:
28 |             pass # Do nothing for now
29 | 
30 | def transfer(tqA2B,tqB2A,log,filesA,filesB):
31 |     if len(tqA2B) > 0:
32 |         totA = 0.0
33 |         log.space = 1
34 |         log.add('(DRY-RUN) A >>> B')
35 |         log.space = 4
36 |         for item in tqA2B:
37 |             file = filesA.query_one(path=item)
38 |             if file is not None:
39 |                 totA += file['size']
40 |             log.add('(DRY-RUN) ' + item)
41 |         log.add('(DRY-RUN) Total Size: %0.2f %s' % utils.bytes2human(totA,short=False))
42 | 
43 |     else:
44 |         log.space=1
45 |         log.add('\nNo A >>> B transfers')
46 | 
47 |     if len(tqB2A) > 0:
48 |         totB = 0.0
49 |         log.space = 1
50 |         log.add('(DRY-RUN) A <<< B')
51 |         log.space = 4
52 |         for item in tqB2A:
53 |             file = filesB.query_one(path=item)
54 |             if file is not None:
55 |                 totB += file['size']
56 |             log.add('(DRY-RUN) ' + item)
57 |         log.add('(DRY-RUN) Total Size: %0.2f %s' % utils.bytes2human(totB,short=False))
58 |     else:
59 |         log.space=1
60 |         log.add('\nNo A <<< B transfers')
61 | 
--------------------------------------------------------------------------------
/changelog.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 | 
3 | This is for *major* changes only; especially ones that break or change functionality
4 | 
5 | ## 20210626.0:
6 | 
7 | - Changes the conflict "tag" names from, for example, `file.ext.nameA` to `file.nameA.ext`.
8 | - Tests updated to reflect that change
9 | 
10 | ## 20200916.0
11 | 
12 | * Add `exclude_if_present` option to exclude certain directories based on the existence of a file. This is implemented as a post-listing filter so the directory is still traversed but then filtered later.
While less efficient, it is both simpler to code and allows excludes to work on both sides, making it *much* safer
13 | * Updates `ldtable` to `DictTable` (aa019ec800)
14 | * Adds note that rclone is still supported but users should migrate to [syncrclone](https://github.com/Jwink3101/syncrclone) as it is better for rclone remotes
15 | * *WARNING*: This is likely the last or nearly last version to support python2
16 | 
17 | ## 20200814.0
18 | 
19 | * Fixed buffsize warnings with python 3.8
20 | 
21 | ## 20200423.0:
22 | 
23 | The remote specification of the PyFiSync path has been changed to just specifying `remote_exe`. The prior settings `remote_program` and `PyFiSync_path` will still work but will throw a deprecation warning and may break in future releases.
24 | 
25 | No warnings will be issued but this is likely the last (or nearly last) release that will support Python 2.
26 | 
27 | Other minor changes:
28 | 
29 | * Added stats on transfers and file lists
30 | * Fixed a bug (and added a test) wherein the config file was not executed from the sync directory as expected.
31 | 
32 | ## 20191119:
33 | 
34 | Minor bug fix for the ssh+rsync backend where the default excludes (e.g. `.git`) were being applied even when they were explicitly *not* intended to be excluded.
35 | 
36 | ## 20191115:
37 | 
38 | This change is all about using file hashes.
39 | 
40 | * Added the ability to tell PyFiSync to compare hashes instead of, or in addition to, `mtime`. This is *much* more robust, though more expensive
41 | * `mtime` is still used to resolve conflicts but if two files have differing `mtime` but the same hash (and name), they are not transferred.
42 | * Added the ability to specify any `hashlib.algorithms_guaranteed` for local and rsync remotes.
43 | * Also added dbhash
44 | * Changed adler to return a hex value (and actually made adler an option)
45 | * Improved the hashdb test to ensure it is *actually* being used (it was! It just wasn't tested well)
46 | 
47 | Plus minor bug fixes, typo fixes, and other improvements
48 | 
49 | ## 20190509:
50 | 
51 | This is a **major** change! Assuming PyFiSync has been updated on both sides, it is a good practice to copy your old config file, make a new one, and then manually update as needed.
52 | 
53 | Some (but not all) changes are:
54 | 
55 | * Added an rclone backend. The config file for rclone is very different but the rsync config only has a few minor changes
56 | * Removed `git_exclude` completely. It was a nice feature but really could be accomplished by *only* allowing certain files in git and then excluding them from PyFiSync
57 | * Removed push/pull modes. They were not as robust and didn't add to the tool
58 | 
--------------------------------------------------------------------------------
/tests/test_pushpull.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | from __future__ import unicode_literals,print_function
4 | 
5 | try:
6 |     from . import testutils
7 | except (ValueError,ImportError):
8 |     import testutils
9 | testutils.add_module() # make sure the tests are importing the NON-installed version
10 | import PyFiSync
11 | 
12 | import os
13 | import sys
14 | import shutil
15 | import itertools
16 | from glob import glob
17 | import re
18 | from pprint import pprint
19 | 
20 | import pytest
21 | 
22 | ## Specify whether to test remotely or locally...or both
23 | # remotes = [False] # Just test locally
24 | # remotes = [False,True]
25 | remotes = [False,'python2','python3']
26 | 
27 | 
28 | @pytest.mark.parametrize("remote,AB,all_", list(itertools.product(remotes,['A','B'],[True,False])))
29 | def test_mod_new_different(remote,AB,all_):
30 |     """ Different file modified on each side. Only one changes. Then add 'all'
31 |     to make sure both have uploaded everything, even what is not modified """
32 |     testpath = os.path.join(os.path.abspath(os.path.split(__file__)[0]),
33 |                             'test_dirs','pp','test_mod_new_different')
34 |     try:
35 |         shutil.rmtree(testpath)
36 |     except:
37 |         pass
38 |     os.makedirs(testpath)
39 |     testutil = testutils.Testutils(testpath=testpath)
40 | 
41 |     # Init
42 |     testutil.write('A/fileA',text='fileA')
43 |     testutil.write('A/fileB',text='fileB')
44 | 
45 |     # copy over
46 |     testutil.copy_tree()
47 | 
48 |     # Start it
49 |     config = testutil.get_config(remote=remote)
50 |     testutil.init(config)
51 | 
52 |     # Apply actions
53 |     testutil.write('A/fileA',text='Aaa',mode='a') # append it
54 |     testutil.write('B/fileB',text='B',mode='a')
55 | 
56 |     testutil.write('A/fileA_new',text='fileA_new')
57 |     testutil.write('B/fileB_new',text='fileB_new')
58 | 
59 |     # Sync
60 |     if AB == 'A':
61 |         mode = 'push'
62 |     else:
63 |         mode = 'pull'
64 | 
65 |     if all_:
66 |         mode += '_all'
67 | 
68 |     testutil.run(config,mode=mode)
69 |     # Check it -- Only need to check A
70 |     diff = testutil.compare_tree()
71 | 
72 |     if all_:
73 |         assert len(diff) == 1
74 |     else:
75 |         assert len(diff) == 2
76 | 
77 |     if AB == 'A':
78 |         assert (u'missing_inA', u'fileB_new') in diff
79 |         if not all_: # This change should be overwritten
80 |             assert ('disagree', 'fileB') in diff
81 |     else:
82 |         assert (u'missing_inB', u'fileA_new') in diff
83 |         if not all_: # This change should be overwritten
84 |             assert ('disagree', 'fileA') in diff
85 | 
86 | 
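# Note: testutil.compare_tree() (see testutils.py) returns a list of
# (status, relative_path) tuples where status is one of 'missing_inA',
# 'missing_inB', or 'disagree'; an empty list means the two trees match.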
87 | @pytest.mark.parametrize("remote,AB,all_", list(itertools.product(remotes,['A','B'],[True,False])))
88 | def test_move_overwrite(remote,AB,all_):
89 |     """ A file move that will overwrite on the receiving end. Check backups """
90 |     testpath = os.path.join(os.path.abspath(os.path.split(__file__)[0]),
91 |                             'test_dirs','pp','test_move_overwrite')
92 |     try:
93 |         shutil.rmtree(testpath)
94 |     except:
95 |         pass
96 |     os.makedirs(testpath)
97 |     testutil = testutils.Testutils(testpath=testpath)
98 | 
99 |     # Init
100 |     testutil.write('A/fileA0',text='fileA0')
101 |     testutil.write('A/fileB0',text='fileB0')
102 | 
103 |     # copy over
104 |     testutil.copy_tree()
105 | 
106 |     # Start it
107 |     config = testutil.get_config(remote=remote)
108 |     testutil.init(config)
109 | 
110 |     # Apply actions
111 |     testutil.write('A/fileA1',text='fileA1')
112 |     testutil.move('A/fileA0','A/fileB1')
113 | 
114 |     testutil.write('B/fileB1',text='fileB1')
115 |     testutil.move('B/fileB0','B/fileA1')
116 | 
117 |     # Sync
118 |     if AB == 'A':
119 |         mode = 'push'
120 |     else:
121 |         mode = 'pull'
122 | 
123 |     if all_:
124 |         mode += '_all'
125 | 
126 |     testutil.run(config,mode=mode)
127 | 
128 |     # Check it -- Only need to check A
129 |     diff = testutil.compare_tree()
130 | 
131 |     if all_:
132 |         assert len(diff) == 0
133 |         # In the end, all files are either moved or overwritten. We do not
134 |         # expect there to be any differences
135 |     elif AB == 'A': # Check backups in B
136 |         assert diff == [('missing_inB', 'fileB0')] # Never gets pushed
137 | 
138 |     elif AB == 'B': # Check backups in B
139 |         assert diff == [('missing_inA', 'fileA0')] # Never gets pulled
140 | 
141 | 
142 | 
143 | 
144 | 
145 | if __name__=='__main__':
146 |     pass
147 | 
148 | 
--------------------------------------------------------------------------------
/rclone.md:
--------------------------------------------------------------------------------
1 | # Rclone
2 | 
3 | ---
4 | **DEPRECATION**
5 | rclone support exists and works but is not optimal. PyFiSync was designed around rsync and is therefore less efficient with rclone.
6 | 
7 | It is *much* better to use [syncrclone](https://github.com/Jwink3101/syncrclone) instead
8 | 
9 | ---
10 | 
11 | (beta)
12 | 
13 | [rclone](https://rclone.org/) is now a supported backend with PyFiSync but there are some important details.
14 | 
15 | First and foremost, **this is a beta feature**. Some aspects have been formally and informally tested but there are a lot more edge cases to consider. And there are a fair number of options that may or may not be needed depending on settings.
16 | 
17 | ## Setup
18 | 
19 | Set up your remote as you want. You can put it in its own config file but be sure to then add the `--config PATH` flag.
20 | 
21 | See the examples.
22 | 
23 | ## Password protected config
24 | 
25 | If you password protect your config, you will need to either store your password in the PyFiSync config (*not* recommended) or you will have to enter it each time. To enter it each time, specify:
26 | 
27 |     # Specify False if config is not encrypted. True means it will prompt
28 |     # for your password and a string specifies the password (not recommended)
29 |     rclone_config_encrypted = True
30 | 
31 | The password is stored in memory only and passed as an environment variable to rclone. This *should* be secure enough but, presumably, an offender can do a memory dump.
32 | 
33 | ## Flags
34 | 
35 | With this backend, you can (and usually *need* to) set some flags. Some of them are listed in the config file and they are often rclone remote dependent. Some of them have been tested but not all.
36 | 
37 | Some common ones:
38 | 
39 | * `--transfers NN`: How many transfers should be done at once
40 | * `--fast-list`: This should be used on all bucket (S3, B2, Swift) backends.
41 | * `--config PATH`: Specify a different path to the config file. This is very useful if you want to keep the config file somewhere else (including with the files synced)
42 | 
43 | These are to be specified as a list!
44 | 
45 | **WARNING**: There is no validation performed on the specified flags. That means that you could specify some options that interfere with the expected behavior of rclone, including links.
46 | 
47 | ## Symlinks
48 | 
49 | The `copy_symlinks_as_links` setting does not work for some remotes. rclone claims to have a workaround but it is inconsistent. See [this github issue](https://github.com/ncw/rclone/issues/3163). This may be fixed in the future.
50 | 
51 | ## Attributes
52 | 
53 | In addition to the default `mtime`, the acceptable attributes are `size` and hashes as described below.
54 | 
55 | 
56 | ### Hashes
57 | 
58 | Some, but not all, rclone remotes support hashes. While on the local side, PyFiSync only supports sha1, the rclone backend will support whatever hash it can. This depends on the [remote]. To specify a hash as a `move_attribute` for rclone, specify it as `hash.SHA-1` (where it must be `SHA-1` since that is what `lsjson` returns). If the remote does not support the specified hash, expect a key error!
59 | 
60 | ## Mod Time
61 | 
62 | rclone remotes *must* support `ModTime` as per the [remote docs][remote]. If it does not, PyFiSync will likely fail and/or cause issues. There is no check to make sure this is the case! It is up to the user.
63 | 
64 | ## Backups
65 | 
66 | Some [remotes][remote] do not natively support moves or even server-side copy. Rclone presents a unified interface to all of these systems so it replicates moves with either download + upload + delete or, if it can, server-side copy + delete. As such, for files that should be backed up (before overwrite or delete) you can instead just download and store the backup locally.
67 | 
68 | ## Tests
69 | 
70 | Many of the sync tests are also tested with rclone. Some, however, are not because they are not expected to pass or because they require some custom change.
71 | 
72 | See Known Issues for a discussion of situations (and failing tests) that rclone cannot handle.
73 | 
74 | ## Other situations
75 | 
76 | ### Missing Hashes
77 | 
78 | In general, a remote supports a certain type of hash and that can be specified. For example B2 supports SHA-1 (attribute `hash.SHA-1`) and S3 supports MD5 (attribute `hash.MD5`). Some remotes (e.g. crypt) do not support any hashes.
79 | 
80 | According to the [rclone docs](https://rclone.org/s3/) not all files have a hash even if the system otherwise supports it.
81 | 
82 | As such, if `imitate_missing_hash = True` then a warning will be printed about the file but the code will imitate a hash by looking at the other metadata (which means it cannot be used for move tracking). Using `imitate_missing_hash = True` with an incorrectly-specified hash (e.g. `hash.SHA1` instead of `hash.SHA-1`) will cause a lot of errors.
83 | 
84 | ## Known Issues
85 | 
86 | * When using rclone mode, folders are essentially ignored. Empty directories will remain. This is not an issue for remotes that do not explicitly view directories
87 | * If a file is deleted and then another is moved into its place, it will view it as a delete and then a new file (which will likely conflict). This is due to only specifying the path as a previous attribute so there is no way to know a file was moved vs deleted. This is tested with `test_file_deleted_replaced_with_move` (which the rclone version would fail) but the issue is replicated in `test_file_replaced_deleted_other_prev_attr`
88 | * Since directories are not a concept in rclone or some remotes, tests dealing with empty directories will fail. See:
89 |     * `test_delete_file_in_folder`, `test_delete_folder`
90 | * Symlinks do not work on some remotes when `copy_symlinks_as_links` is True. See [this github issue](https://github.com/ncw/rclone/issues/3163). A workaround may be included in the future
91 | 
92 | 
93 | 
94 | [remote]:https://rclone.org/overview/
95 | 
--------------------------------------------------------------------------------
/tests/test_file_tracking.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from __future__ import unicode_literals,print_function
3 | 
4 | import pytest #with pytest.raises(ValueError):...
5 | 
6 | try:
7 |     from . import testutils
8 | except (ValueError,ImportError):
9 |     import testutils
10 | testutils.add_module()
11 | 
12 | from PyFiSync import utils,PFSwalk
13 | from PyFiSync import main as PyFiSync # Need to fix
14 | DictTable = PyFiSync.DictTable
15 | 
16 | import os
17 | import sys
18 | import shutil
19 | 
20 | 
21 | def _file_list(path,config=None):
22 |     if config is None:
23 |         config = utils.configparser()
24 |     log = utils.logger(silent=True,path=None)
25 |     _tmp = PFSwalk.file_list(path,config,log)
26 |     return _tmp.files()
27 | 
28 | 
29 | 
30 | def test_untouched():
31 |     """ File is not touched """
32 |     name = 'test_untouched'
33 |     testpath = os.path.join(os.path.abspath(os.path.split(__file__)[0]),
34 |                             'test_dirs','move_tests',name)
35 |     try:
36 |         shutil.rmtree(testpath)
37 |     except:
38 |         pass
39 |     os.makedirs(testpath)
40 |     testutil = testutils.Testutils(testpath=testpath)
41 | 
42 | 
43 |     # Init
44 |     testutil.write('file1.txt',text='test1')
45 | 
46 |     prev_attr = ['ino','path']
47 |     move_attr = ['ino','birthtime']
48 | 
49 |     # Get and inject configs
50 |     config = testutil.get_config()
51 |     PyFiSync.config = config
52 | 
53 |     # old list
54 |     files_old = DictTable(_file_list(testpath,config))
55 | 
56 |     # Apply actions
57 | 
58 | 
59 |     # new list and track
60 |     files_new = DictTable(_file_list(testpath,config))
61 |     PyFiSync.file_track(files_old,files_new,prev_attr,move_attr)
62 | 
63 |     # Check
64 |     assert {'path':'file1.txt','untouched':True,'prev_path':'file1.txt'} in files_new
65 | 
66 | def test_move(): # This used to be test 01
67 |     """ File Moved.
inode and birthtime tracking """ 68 | name = 'test_move' 69 | testpath = os.path.join(os.path.abspath(os.path.split(__file__)[0]), 70 | 'test_dirs','move_tests',name) 71 | try: 72 | shutil.rmtree(testpath) 73 | except: 74 | pass 75 | os.makedirs(testpath) 76 | testutil = testutils.Testutils(testpath=testpath) 77 | 78 | 79 | # Init 80 | testutil.write('file1.txt',text='test1') 81 | 82 | prev_attr = ['ino','path'] 83 | move_attr = ['ino','birthtime'] 84 | 85 | # Get and inject configs 86 | config = testutil.get_config() 87 | PyFiSync.config = config 88 | 89 | # old list 90 | files_old = DictTable(_file_list(testpath,config)) 91 | 92 | # Apply actions 93 | testutil.move('file1.txt','file2.txt') 94 | 95 | # new list and track 96 | files_new = DictTable(_file_list(testpath,config)) 97 | PyFiSync.file_track(files_old,files_new,prev_attr,move_attr) 98 | 99 | # Check 100 | assert {'path':'file2.txt','moved':True,'prev_path':'file1.txt'} in files_new 101 | 102 | @pytest.mark.parametrize("mode", ['birthtime','size']) 103 | def test_move_mod(mode): 104 | """ test modification after move with different modes""" 105 | name = 'test_move_mod_' + mode 106 | testpath = os.path.join(os.path.abspath(os.path.split(__file__)[0]), 107 | 'test_dirs','move_tests',name) 108 | try: 109 | shutil.rmtree(testpath) 110 | except: 111 | pass 112 | os.makedirs(testpath) 113 | testutil = testutils.Testutils(testpath=testpath) 114 | 115 | 116 | # Init 117 | testutil.write('file1.txt',text='test1') 118 | 119 | prev_attr = ['ino','path'] 120 | move_attr = ['ino'] + [mode] 121 | 122 | # Get and inject configs 123 | config = testutil.get_config() 124 | PyFiSync.config = config 125 | 126 | # old list 127 | files_old = DictTable(_file_list(testpath,config)) 128 | 129 | # Apply actions 130 | testutil.move('file1.txt','file2.txt') 131 | testutil.write('file2.txt',text='mod',mode='a') 132 | 133 | # new list and track 134 | files_new = DictTable(_file_list(testpath,config)) 135 | PyFiSync.file_track(files_old,files_new,prev_attr,move_attr) 136 | 137 | # Check 138 | if mode == 'birthtime': 139 | assert {'path':'file2.txt','moved':True,'prev_path':'file1.txt'} in files_new 140 | elif mode == 'size': 141 | assert not {'path':'file2.txt','moved':True,'prev_path':'file1.txt'} in files_new 142 | assert {'path':'file2.txt','new':True,'prev_path':None} in files_new 143 | else: 144 | assert False 145 | 146 | 147 | def test_no_moves(): 148 | """ tests using name as the only attribute""" 149 | name = 'pathonly' 150 | testpath = os.path.join(os.path.abspath(os.path.split(__file__)[0]), 151 | 'test_dirs','move_tests',name) 152 | try: 153 | shutil.rmtree(testpath) 154 | except: 155 | pass 156 | os.makedirs(testpath) 157 | testutil = testutils.Testutils(testpath=testpath) 158 | 159 | 160 | # Init 161 | testutil.write('file1.txt',text='test1') 162 | testutil.write('file2.txt',text='test2') 163 | testutil.write('file3.txt',text='test3') 164 | testutil.write('file4.txt',text='test4') 165 | 166 | prev_attr = ['path'] 167 | move_attr = ['path'] 168 | 169 | # Get and inject configs 170 | config = testutil.get_config() 171 | PyFiSync.config = config 172 | 173 | # old list 174 | files_old = DictTable(_file_list(testpath,config)) 175 | 176 | # Apply actions 177 | testutil.move('file2.txt','file22.txt') 178 | testutil.move('file3.txt','file33.txt') 179 | testutil.write('file3.txt',text='testnew',mode='w') 180 | testutil.remove('file4.txt') 181 | testutil.write('file5.txt',text='test5',mode='w') 182 | 183 | 184 | # new list and track 185 | files_new = 
DictTable(_file_list(testpath,config)) 186 | PyFiSync.file_track(files_old,files_new,prev_attr,move_attr) 187 | 188 | files_old.alwaysReturnList = True 189 | files_new.alwaysReturnList = True 190 | 191 | ## Check 192 | 193 | # Even though 22 and 33 were moves they should show as new 194 | # File5 should also be new (since it really is) 195 | t1db = DictTable( files_new(new=True)) 196 | assert len(t1db) == 3 197 | assert {'path':'file22.txt'} in t1db 198 | assert {'path':'file33.txt'} in t1db 199 | assert {'path':'file5.txt'} in t1db 200 | assert len(list(t1db(new=True))) == 3 201 | 202 | # file 3 should show as being modified and not new 203 | f3 = files_new.query_one(path='file3.txt') 204 | assert not f3['new'] 205 | assert testutil.read('file3.txt') == 'testnew' 206 | 207 | # file2 should be deleted even though moved 208 | assert files_old.query_one(path='file2.txt')['deleted'] 209 | 210 | if __name__=='__main__': 211 | test_no_moves()# 212 | # test_move() 213 | # test_move_mod() 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | -------------------------------------------------------------------------------- /tests/testutils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import unicode_literals,print_function 3 | #from io import open 4 | 5 | import os 6 | import sys 7 | import random 8 | import string 9 | import shutil 10 | from pprint import pprint 11 | from glob import glob 12 | 13 | 14 | def add_module(): 15 | 16 | path = os.path.abspath(__file__) 17 | path = os.path.split(os.path.split(path)[0])[0] # Move up one 18 | sys.path.insert(0,path) 19 | 20 | add_module() 21 | import PyFiSync 22 | import PyFiSync.utils 23 | utils = PyFiSync.utils 24 | 25 | MAX_TIME_MOD = 500; 26 | 27 | class Testutils(object): 28 | def __init__(self,testpath=None): 29 | self.testpath = testpath 30 | 31 | def modtime_all(self): 32 | """ modified all of the times """ 33 | random.seed(4474) 34 | pathA = os.path.join(self.testpath,'A') 35 | pathB = os.path.join(self.testpath,'B') 36 | 37 | for dirpath, dirnames, filenames in os.walk(pathA): 38 | for f in filenames: 39 | change_time(os.path.join(dirpath,f),random.randint(-100*MAX_TIME_MOD,-(MAX_TIME_MOD+2))) 40 | try: 41 | os.makedirs(pathB) 42 | except: 43 | pass 44 | 45 | 46 | def write(self,path,time_adj=None,mode='w',text=None): 47 | """Write or append a file""" 48 | path = os.path.join(self.testpath,path) 49 | directory = os.path.split(path)[0] 50 | try: 51 | os.makedirs(directory) 52 | except OSError: 53 | pass 54 | 55 | if text is None: 56 | text = randstr() 57 | 58 | text += '\n' 59 | 60 | if mode == 'a' and time_adj == 0: 61 | time_adj = 1 62 | 63 | with open(path,mode) as F: 64 | F.write(text) 65 | 66 | if time_adj is None: 67 | change_time(path,random.randint(5,MAX_TIME_MOD)) 68 | elif time_adj != 0: 69 | change_time(path,time_adj) 70 | 71 | 72 | def exists(self,path): 73 | path = os.path.join(self.testpath,path) 74 | return os.path.exists(path) 75 | def move(self,src,dst): 76 | """Move and makedirs if needed""" 77 | src = os.path.join(self.testpath,src) 78 | dst = os.path.join(self.testpath,dst) 79 | directory = os.path.split(dst)[0] 80 | try: 81 | os.makedirs(directory) 82 | except OSError: 83 | pass 84 | 85 | shutil.move(src,dst) 86 | 87 | def remove(self,path): 88 | """remove and makedirs if needed""" 89 | path = os.path.join(self.testpath,path) 90 | if os.path.isfile(path): 91 | os.remove(path) 92 | if os.path.isdir(path): 93 | 
shutil.rmtree(path) 94 | 95 | def read(self,item): 96 | path = os.path.join(self.testpath,item) 97 | 98 | assert os.path.exists(path), "file doesn't exist '%s'" % item 99 | 100 | with open(path) as F: 101 | return F.read().strip() 102 | 103 | def tree(self,path): 104 | files = [] 105 | for dirpath, dirnames, filenames in os.walk(path,followlinks=True): 106 | for d in ['.PyFiSync','.git']: 107 | try: 108 | dirnames.remove(d) 109 | except ValueError: 110 | pass 111 | 112 | files.extend(os.path.join(dirpath,filename) for filename in filenames) 113 | if len(dirnames) == len(filenames) == 0: 114 | files.append(os.path.join(dirpath,'>>EMPTY<<')) 115 | return files 116 | 117 | def compare_tree(self): 118 | """ All file systems are identical""" 119 | result = [] 120 | 121 | pathA = os.path.join(self.testpath,'A') 122 | pathB = os.path.join(self.testpath,'B') 123 | 124 | filesA = [os.path.relpath(f,pathA) for f in self.tree(pathA)] 125 | filesB = [os.path.relpath(f,pathB) for f in self.tree(pathB)] 126 | 127 | filesAB = set(filesA).union(filesB) 128 | for fileAB in sorted(list(filesAB)): 129 | 130 | fileA = os.path.join(self.testpath,'A',fileAB) 131 | fileB = os.path.join(self.testpath,'B',fileAB) 132 | try: 133 | fileAtxt = open(fileA).read() 134 | except IOError: 135 | result.append( ('missing_inA',fileAB) ) 136 | continue 137 | 138 | try: 139 | fileBtxt = open(fileB).read() 140 | except IOError: 141 | result.append( ('missing_inB',fileAB) ) 142 | continue 143 | 144 | if not fileAtxt == fileBtxt: 145 | result.append( ('disagree',fileAB)) 146 | 147 | return result 148 | 149 | def get_config(self,remote=False): 150 | if remote == 'rclone': 151 | config = utils.configparser(remote='rclone') 152 | config.move_attributesB = ['hash.SHA-1'] 153 | else: 154 | config = utils.configparser(remote='rsync') 155 | if remote: 156 | config.userhost = os.environ['USER'] + '@localhost' 157 | 158 | # Specify the full executable to make sure not using an installed copy 159 | if remote == 'python2': 160 | exe = 'python2' 161 | elif remote == 'python3': 162 | exe = 'python3' 163 | pfs_path = os.path.normpath(os.path.join(os.path.dirname(__file__),'..','PyFiSync.py')) 164 | config.remote_exe = '{} {}'.format(exe,pfs_path) 165 | else: 166 | config.userhost = '' 167 | 168 | 169 | config.excludes += ['.DS_Store','.git/','Thumbs.db'] 170 | config.pathA = os.path.join(self.testpath,'A') 171 | config.pathB = os.path.join(self.testpath,'B') 172 | 173 | return config 174 | 175 | def write_config(self,config): 176 | if self.testpath is None: 177 | return 178 | config_path = os.path.join(self.testpath,'A','.PyFiSync','config') 179 | config_file = open(config_path,'w') 180 | 181 | for key,val in config.__dict__.items(): 182 | if key.startswith('_') or key == 'pwprompt': 183 | continue 184 | config_file.write(key + ' = ' ) 185 | pprint(val,stream=config_file) 186 | 187 | config_file.close() 188 | 189 | def init(self,config): 190 | pathA = os.path.join(self.testpath,'A') 191 | pathB = os.path.join(self.testpath,'B') 192 | 193 | PyFiSync.cli(['init',pathA]) 194 | self.write_config(config) 195 | PyFiSync.cli(['reset','--force',pathA]) 196 | PyFiSync.cli(['sync',pathA]) 197 | # At init, every file's mod time was changed to be at least -(MAX_TIME_MOD+2) 198 | # so we do not need to modify the last_run 199 | 200 | 201 | def run(self,config,mode='sync',silent=False,flags=tuple()): 202 | pathA = os.path.join(self.testpath,'A') 203 | pathB = os.path.join(self.testpath,'B') 204 | 205 | self.write_config(config) 206 | if mode == 'sync': 207 
|             cmd = ['sync'] + list(flags) + [pathA]
208 |             PyFiSync.cli(cmd)
209 | 
210 |     def get_log_txt(self,AB='A'):
211 |         log_path = glob(os.path.join(self.testpath,AB,'.PyFiSync','logs','20*.log'))
212 |         log_path.sort()
213 |         with open(log_path[-1]) as l: # latest one
214 |             return l.read()
215 | 
216 | def randstr(N=10):
217 |     return ''.join(random.choice(string.ascii_lowercase+'0123456789') for _ in range(N))
218 | 
219 | def change_time(path,time_adj):
220 |     """ Change the time on a file path"""
221 |     try:
222 |         stat = os.stat(path)
223 |     except OSError:
224 |         print('path {:s} does not exist'.format(path))
225 |         return
226 |     os.utime(path,(stat.st_atime+time_adj,stat.st_mtime+time_adj))
227 | 
228 | 
229 | 
230 | 
231 | 
232 | 
233 | 
234 | 
235 | 
236 | 
237 | 
--------------------------------------------------------------------------------
/PyFiSync/config_template.py:
--------------------------------------------------------------------------------
1 | # PyFiSync Configuration
2 | # This will be evaluated as Python code so indentation matters
3 | #
4 | # Note: 'A' always refers to the local machine and 'B' is the remote;
5 | #       even if the "remote" is a local path
6 | #
7 | # Specify strings with ' ' and lists with [ ... ]
8 | 
9 | # Local Machine
10 | nameA = 'machineA'
11 | 
12 | #
13 | # These settings are for the ssh+rsync remote
14 | remote = 'rsync'
15 | 
16 | # Remote Machine
17 | nameB = 'machineB'
18 | pathB = '/full/path/to/sync/dir'
19 | 
20 | # SSH settings
21 | # Specify the user@host for a remote machine. Leave empty for a local
22 | userhost = ''
23 | ssh_port = 22
24 | 
25 | # Create a persistent master SSH tunnel and multiplex over the connection.
26 | # This works in practice but has not been as thoroughly tested.
27 | persistant = True
28 | 
29 | # Specify the remote executable. If it is installed, it is just 'PyFiSync'.
30 | # Otherwise it may be something like '/path/to/python /path/to/PyFiSync.py'.
31 | # Make sure the paths work via SSH. See the FAQs for details
32 | remote_exe = 'PyFiSync'
33 | #
34 | 
35 | #
36 | 
37 | # These settings are specific to rclone.
38 | remote = 'rclone'
39 | 
40 | # Remote Machine name
41 | nameB = 'machineB'
42 | 
43 | # Specify the path as you would a remote in rclone.
44 | pathB = 'myremote:bucket'
45 | 
46 | # Set the executable. If rclone is installed, should just be the default
47 | rclone_executable = 'rclone'
48 | 
49 | # Specify an rclone config password if one is set. Or specify `pwprompt()` to
50 | # be prompted to enter one on each run. Specify False to ignore.
51 | #
52 | # Alternatively, you can do something like the following: Write the password in
53 | # something like ".PyFiSync/PW.txt" where, by putting it in the .PyFiSync
54 | # directory, it will not be synchronized. Then:
55 | #
56 | #     with open(".PyFiSync/PW.txt",'rt') as file:
57 | #         rclone_pw = file.read().strip()
58 | #
59 | # WARNINGS:
60 | # - Specifying the password in plain text may not be secure if this config file
61 | #   is compromised
62 | # - The password is passed to rclone via environment variables. Alternatively,
63 | #   use --password-command manually with flags. An improved method may be
64 | #   implemented in the future.
65 | rclone_pw = False
66 | 
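# For example (assumption: your installed rclone version supports its
# --password-command flag), the config password can instead be supplied by
# rclone itself rather than through rclone_pw:
#
#     rclone_flags = ['--password-command', 'cat .PyFiSync/PW.txt']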
67 | # Specify some flags to include in all rclone calls. Must be specified as a
68 | # list/tuple/set. These can be used to tune transfers and improve performance.
69 | # Some of them have been tested but not all.
70 | #
71 | # WARNING: There is no validation performed on the specified flags. That means
72 | # that you could specify some options that interfere with the expected
73 | # behavior of rclone, including links.
74 | # Other Examples:
75 | #   `--transfers NN`: How many transfers should be done at once
76 | #   `--fast-list`: This should be used on all bucket (S3, B2, Swift) backends.
77 | #   `--config PATH`: Specify a different path to the config file. This is very
78 | #     useful if you want to keep the config file somewhere else (including with
79 | #     the files synced). Rclone is always evaluated in the root sync directory
80 | #     so the path can be relative to that.
81 | rclone_flags = ['--transfers', '15',
82 |                 '--fast-list',
83 |                 '--checkers', '10']
84 | 
85 | # Some remotes (e.g. Backblaze B2) do not support any server-side move/copy
86 | # operations. As such, moving files is very inefficient as they must
87 | # be downloaded and then re-uploaded. For backups, this is a waste of effort
88 | # so instead, we can *just* backup via a local copy
89 | rclone_backup_local = False
90 | 
91 | # Some remotes (e.g. S3) do not provide hashes for all files (such as those
92 | # uploaded with multi-part). As such PyFiSync can imitate a hash when missing
93 | # based on the other metadata (so it cannot track remote moves). Warning: if
94 | # this is set with an incorrectly specified hash, (a) the screen will fill with
95 | # warnings and (b) no moves will be tracked
96 | imitate_missing_hash = False
97 | 
98 | #
99 | 
100 | # File Settings:
101 | # move_attributes specify which attributes are used to determine a move or previous file.
102 | # Options for local and rsync remote:
103 | #   'path','ino','size','birthtime','mtime','adler','dbhash', PLUS any
104 | #   `hashlib.algorithms_guaranteed`
105 | #
106 | # Options for rclone remotes: 'path','size','mtime', and hashes as noted in the
107 | # readme
108 | #
109 | 
110 | #
111 | # Prev File Suggestions (to determine if a file is new or only its mod time changed):
112 | #   ['ino','path']
113 | # Move Suggestions: (see readme for discussion)
114 | #   macOS: ['ino','birthtime']
115 | #   Linux: ['ino','size'] --OR-- ['ino'] (birthtime isn't available and inodes
116 | #   get reused)
117 | # MUST specify as a list
118 | move_attributesA = ['ino','birthtime']
119 | prev_attributesA = ['ino','path']
120 | 
121 | move_attributesB = ['ino','birthtime'] # ['ino','size'] --OR-- ['ino','sha1']
122 | prev_attributesB = ['ino','path']
123 | #
124 | #
125 | # Prev File Suggestions:
126 | #   ['path']
127 | # Move Suggestions: Note with rclone, there is no advantage to moving an
128 | # also-modified file
129 | #   move_attributesA
130 | #     ['ino','mtime']
131 | #   move_attributesB
132 | #     If hashes are supported: ['hash.SHA-1'] or whatever hash
133 | #     If hashes are not supported: ['path'] # This essentially doesn't track moves
134 | #
135 | # MUST specify as a list
136 | move_attributesA = ['ino','mtime']
137 | prev_attributesA = ['ino','path']
138 | 
139 | move_attributesB = ['path'] # --OR-- ['hash.SHA-1']
140 | prev_attributesB = ['path']
141 | #
142 | 
143 | ## Conflict Settings
144 | move_conflict = 'A' # 'A' or 'B': When moves are conflicting
145 | 
146 | # Modification date conflicts can be resolved as follows:
147 | #   'A','B'     -- Always accept either A or B's copy regardless
148 | #   'both'      -- Tag BOTH files with the respective computer name
149 | #   'newer'     -- Always keep the newer version
150 | #   'newer_tag' -- Keep the newer version untagged and tag the older
151 | mod_conflict = 'both'
152 | mod_resolution = 2.5 # (s) How much time difference is allowed between files
153 | 
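# For example (illustrative names): with mod_conflict = 'both' and machines
# named 'machineA' and 'machineB', a conflicting 'notes.txt' is kept on both
# sides as 'notes.machineA.txt' and 'notes.machineB.txt' -- the tag goes
# before the extension (see the 20210626.0 changelog entry).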
Must specify as a list of 155 | # (attribA,attribB) tuple, See the examples. Note that `mtime` will use the 156 | # mod_resolution time 157 | mod_attributes = [('mtime','mtime')] 158 | 159 | # examples: 160 | # mod_attributes = [('sha1','sha1')] # for rsync remote 161 | # mod_attributes = [('sha1','hash.SHA-1')] # for rclone 162 | # mod_attributes = [('dbhash','hash.DropboxHash')] # dropbox rclone 163 | 164 | 165 | 166 | # Symlinked directories are ALWAYS follow unless excluded. However, if 167 | # copy_symlinks_as_links=False, symlinked files sync their referent (and 168 | # rsync uses `-L`) If True (default), symlinks copy the link itself (a la git) 169 | # 170 | # WARNING1: setting to True with links to files inside the sync root will cause 171 | # issues with tracking 172 | # WARNING2: Recursive symlinks will NOT be caught. 173 | copy_symlinks_as_links = True 174 | 175 | ## Other settings 176 | backup = True # Backup before deletion or overwriting 177 | 178 | # If a file is deleted but a new one is in the same place, do not treat it as 179 | # a delete. Useful when programs overwrite rather than update files. Final 180 | # sync will look the same but this will optimize using rsync on that file 181 | check_new_on_delete = True 182 | 183 | # Set whether or not to create a database of hash values if (and only if) using 184 | # sha1 or adler32 as an attribute. If True (default), the code will not re-hash 185 | # a file unless the path, size, and mtime has changed. This leaves an edge 186 | # case, though rare 187 | use_hash_db = True 188 | 189 | ## Exclusions. 190 | # * If an item ends in `/` it is a folder exclusion 191 | # * If an item starts with `/` it is a full path relative to the root 192 | # * Wildcards and other patterns are accepted 193 | # 194 | # | Pattern | Meaning | 195 | # |----------|------------------------------------| 196 | # | `*` | matches everything | 197 | # | `?` | matches any single character | 198 | # | `[seq]` | matches any character in `seq` | 199 | # | `[!seq]` | matches any character not in `seq` | 200 | # 201 | # Specify as a single list. 202 | # These are suggestions. They can be included if desired 203 | 204 | # excludes = ['.DS_Store','.git/','Thumbs.db'] # Suggested 205 | excludes = [] 206 | 207 | # This sets a specified filename (such as '.PyFiSync_skip') wherein if PyFiSync 208 | # sees this file in a directory, it will exclude it. If the file is found on 209 | # either side, it is applied to *both* sides. 210 | exclude_if_present = '' 211 | 212 | # The following can be used to perform certain tasks pre and post sync. 213 | # Called the root of the syn direcotory (i.e. they start with 214 | # $ cd $PyFiSync_root 215 | # Example uses include cleanup, git push,pull, sync. 216 | pre_sync_bash = '' 217 | post_sync_bash = '' 218 | 219 | 220 | 221 | 222 | -------------------------------------------------------------------------------- /FAQs.md: -------------------------------------------------------------------------------- 1 | (work in progress) 2 | 3 | ## Why is this better than Unison 4 | 5 | Well, it may not be! I have only dabbled in Unison. Unison seems like a great tool but I wanted my own that I could develop, test, and design (within my abilities). I wanted backups-before-overwrite/delete to be a baseline feature and I also wanted to be able to track file moves. 6 | 7 | Plus, this was a great learning tool for python. Developing this was a lot of fun. 
And I am also really happy with [ldtable](https://github.com/Jwink3101/ldtable) which I developed in concert with this. 8 | 9 | ## Are files encrypted? 10 | 11 | Using the rsync mode, the files are encrypted **in transit** via SSH. However, since this is not inherently a server-client model, the files are unencrypted at rest. 12 | 13 | I suggest [Cryptomator](https://cryptomator.org/) for encrypted files as it is cross-platform, doesn't introduce much overhead, and is efficient. It encrypts on a file-by-file basis (with obfuscated names) so changing a file will only require syncing that file (and some ancillary data). Speedups from rsync will not be realized. 14 | 15 | If using only macOS, encrypted disk images can also work well. If using encrypted disk images, I recommend using *sparse* disk image. Sparse images create bands (16mb if I recall correctly) so, while not file-by-file, they are more efficient but less than purely file-by-file. Regular encrypted disk images will, of course, work but *any* change will require syncing the entire thing. These are not recommended. 16 | 17 | Also, if using the rclone remote, you can use a crypt remote. Details are in the [rclone_b2](rclone_b2.md) guide. 18 | 19 | ## I set up SSH keys. Why is it asking me for my key password each time? 20 | 21 | If you set up SSH keys with a password, you still need to unlock the key. If you're in an interactive session (e.g. directly in macOS terminal or the same for Linux), it does this for you (usually). If you're also SSHed in, you may need to start the `ssh-agent`: 22 | 23 | $ ssh-agent 24 | $ ssh-add 25 | 26 | Or, you can put the following in your `.bashrc` and call: 27 | 28 | $ start-ssh-agent 29 | 30 | code: 31 | 32 | ```bash 33 | start-ssh-agent () { 34 | if [ -z "$SSH_AUTH_SOCK" ]; then 35 | eval $(ssh-agent); 36 | ssh-add; 37 | else 38 | echo "SSH agent already running"; 39 | fi 40 | } 41 | ``` 42 | 43 | ## Why am I getting Unicode errors? Did you mess up? 44 | 45 | PyFiSync can handle unicode filenames without a problem. It is likely your terminal encoding. 46 | 47 | On some `openssh` installations on macOS (anecdotally, from `brew`), there seems to be a problem with sending the wrong encoding to the remote terminal which makes it *seem* like there is a unicode error in PyFiSync. This is actually related to sending terminal encoding. See [this](https://askubuntu.com/a/874765) where they had the *opposite* problem. 48 | 49 | The fix is to add the following to `/etc/ssh/ssh_config` or `~/.ssh/config`: 50 | 51 | Host * 52 | SendEnv LANG LC_* # Send locale to remote 53 | 54 | ## How does PyFiSync handle interruptions 55 | 56 | The short answer is: If you run it again, everything *should* be fine. 57 | 58 | The slightly longer answer is that all actions are (more-or-less) safe since backups are made before anything is overwritten and moves are logged. 59 | 60 | The more detailed answer: The code does a few steps when you run it. First, it makes a file list of the current state of each side. Then, from that list and stored list of the last run, it compares all files to see what is a) new (called `prev_attributes` in the config); b) moved (called `move_attributes`); or c) deleted. This is independent of each side and does *not* determine if files are modified\*. The reason this is useful is purely to propagate moves on each side so rsync is more efficient. 
The determination of *what* to transfer happens *after* this independent step and is only based on comparing the mod times (if the file exists on both sides)
61 | 
62 | So, to better answer this question, consider the times an interruption can happen:
63 | 
64 | * **Initial listing and comparison**: No problem. No file has been touched
65 |     * This is also when the file-transfers are determined!
66 | * **Propagate moves, perform deletions and/or backups**: If a file is moved, when PyFiSync is rerun, even without transfer, the system will think the file was moved on both sides regardless of failure. It won't need to do anything. If it is interrupted during "deletion" (which is really just a move to a backup dir), then the file will have been moved and all is good. If it happens during a backup, then you may have extra backup copies. No problem
67 | * **During file transfer**: If a file was successfully transferred, then when it is rerun, they will match mod-time and nothing happens. Otherwise, they will have the same outcome as before. An additional backup may be made, but that doesn't hurt.
68 | * **After transfer, before last file listings**: The last listing is only needed to get up-to-date `inode` numbers, etc. It is stored for the next run for moves and tracking. Therefore, rerunning it will not have any moves or deletions to propagate so nothing bad will happen.
69 | 
70 | One additional case is if you didn't realize it failed and run again later. In this case, the following would be the worst outcomes:
71 | 
72 | * A file delete will not propagate and may be restored from the other side. No real harm here!
73 | * A file that *could* have been a move will actually end up being treated as a delete + new file. Some extra bandwidth but otherwise harmless.
74 | 
75 | While I think I thought through all of these conditions, I may have missed something. I personally run extensive backups (since, even though this backs up prior to delete or overwrite, it is a *sync* code, not a *backup*). I cannot recall a time I had to dig into them because my code failed. And, except in the early days, I do not remember having to manually unwind anything. Even then, it performs safe operations but I have since figured out the missed edge case, handled it, and wrote a test!
76 | 
77 | I hope this clears it up a bit!
78 | 
79 | \* The original plan had it determine transfers from its own previous state but I moved to comparing the two sides since it was robust to a) failure and b) deletion of the stored file lists
80 | 
81 | ## Does this support Windows
82 | 
83 | No. The remote file listing is handled via an SSH call and the transfers are via rsync. It *may* work in the Windows Subsystem for Linux but it has not been tested.
84 | 
85 | Also, I suspect that file tracking will be less robust since there are no inode numbers. SHA1 should work to track moves but that adds a lot of overhead
86 | 
87 | ## Why can't I sync two rclone remotes
88 | 
89 | This tool was built with rsync in mind and, in particular, syncing your local file system to a remote machine. The infrastructure was designed to be flexible for the remote only. With that said, I suspect I *could* make it work to handle two rclone remotes but I don't have the need. If there is interest, I may re-evaluate.
90 | 
91 | ## Why use `ldtable` instead of something like SQLite
92 | 
93 | The simple answer is, at the time, I didn't know much SQL. And building out `ldtable` was a lot of fun. It is very useful for in-memory data queries. The more complex answer is that `ldtable` is much easier.
Since I do not know all attributes until PyFiSync is instantiated, I would need a variable schema. And since I may or may not query on different combinations of attributes, I would need many table indices.
94 | 
95 | Also, `ldtable` has proven to be sufficiently performant, even on my 60,000 item (~200GB) photo collection. The database is well within memory constraints. I may consider SQLite in the future though.
96 | 
97 | ## When should I use rsync+SSH vs rclone?
98 | 
99 | First of all, if you want *anything* other than ssh or local, you *need* to use rclone. But if you were planning to use SFTP in rclone, use the ssh+rsync mode instead!
100 | 
101 | If you are interested in SSH-based remotes, you are almost universally better off using the rsync+SSH mode. First and foremost, rclone does not support any kind of transfer deduplication while rsync is built around it! With rsync, if only a small part of the file changes, only those changes (plus some overhead) are transferred.
102 | 
103 | Furthermore, in ssh+rsync mode, you can have things like a hash database (if set and using hashes) to greatly speed things up. And all operations are done over a persistent tunnel.
104 | 
105 | ## I installed PyFiSync on the remote computer but I am getting an error.
106 | 
107 | If you are getting a `bash: PyFiSync: command not found` error, it is likely because the remote machine either doesn't have PyFiSync installed or there is an issue with your `.bashrc` and/or `.bash_profile`.
108 | 
109 | If your paths are set up in `.bash_profile`, move them to `.bashrc` and add the following to `.bash_profile`:
110 | ```bash
111 | if [ -f ~/.bashrc ]; then
112 |     . ~/.bashrc
113 | fi
114 | ```
115 | In addition, some versions of linux add the following line(s) to the `.bashrc`:
116 | ```bash
117 | # If not running interactively, don't do anything
118 | [ -z "$PS1" ] && return
119 | ```
120 | You need to comment out that second line or still set up paths.
121 | 
122 | Finally, you can instead specify the full path in the config file with `remote_exe`
123 | 
124 | ## Why do you still support Python 2?
125 | 
126 | When I first wrote PyFiSync, I didn't use Python 3! Part of that was because at work, I was limited to Python 2 and part of it was just that I learned on Python 2. However, when I made the transition to Python 3 for my own stuff, I made PyFiSync compatible with both.
127 | 
128 | At this point, it is pretty easy to maintain that. If in the future I want to use a Python 3 only feature, I will probably just drop Python 2.
129 | 
130 | 
131 | 
132 | 
133 | 
134 | 
135 | 
136 | 
137 | 
138 | 
139 | 
140 | 
141 | 
142 | 
--------------------------------------------------------------------------------
/rclone_b2.md:
--------------------------------------------------------------------------------
1 | # Rclone with B2 (and S3)
2 | 
3 | (See the bottom for notes on using S3 instead of B2)
4 | 
5 | B2 is a very inexpensive storage platform that is great for syncing. It does have a few limitations, notably the lack of server-side copy. As such, any "moves" are actually download+upload+delete. (This is no longer the case for newer rclone versions with the latest B2 API. Therefore, some of these steps may not be necessary).
6 | 
7 | As such, we make two major changes to our configuration:
8 | 
9 | 1. Download backups locally rather than move them to the backup dir
10 |     * Any remote backups aren't strictly needed since the buckets can be set to back up, but we will keep this here for consistency
11 | 2. Do not do any file moves!
We *could* let rclone handle the moves but it is less transparent. See below when using S3 because that is not the best setting 12 | 13 | Finally, another limitation as noted in the readmes relates to deleting a file and then renaming another in its place. 14 | 15 | See notes at the end for the differences with S3 setup 16 | 17 | ## Plan 18 | 19 | We will use the *same* bucket to set up two different PyFiSync repositores. The first will be unencrypted and the second will be encrypted. We will have to adjust the settings accordingly. 20 | 21 | It is a good idea to use encryption if you do not trust the remote or want extra protection. In this example, we also password protect the config file but that is really *not* needed on your local machine. 22 | 23 | ## Rclone Setup 24 | 25 | ### Preliminaries 26 | 27 | Assume your local machine directory is `/path/to/PFS/` 28 | 29 | $ cd /path/to/PFS/ 30 | 31 | And that you have a bucket called `MYBUCKET` 32 | 33 | ### Set up B2 base version 34 | 35 | Create a local config file 36 | 37 | $ rclone config --config rclone.cfg 38 | 39 | Create a new B2 remote 40 | 41 | ```rclone 42 | No remotes found - make a new one 43 | n) New remote 44 | s) Set configuration password 45 | q) Quit config 46 | n/s/q> n 47 | 48 | name> b2base 49 | 50 | Type of storage to configure. 51 | Storage> b2 52 | 53 | [...] 54 | ``` 55 | 56 | The final config should look something like 57 | 58 | ```rclone 59 | [b2base] 60 | type = b2 61 | account = **ACCOUNT** 62 | key = **KEY** 63 | ``` 64 | 65 | ### Set up rclone encrypted 66 | 67 | We *could* set up the encrypted in the same config file but we will later encrypt it. Since we also want the non-encrypted, it is easier to make a *copy*. Note that you will have to make changes in both if you change a setting 68 | 69 | It is **NOT vital** to encrypt the encrypted B2 config since you probably trust your own computer. If you do not want to encrypt that then there is no reason to make a copy and you can skip the relevant sections 70 | 71 | Follow the following 72 | 73 | $ cp rclone.cfg rcloneCRYPT.cfg 74 | $ rclone config --config rcloneCRYPT.cfg 75 | 76 | Then 77 | 78 | ``` 79 | n) New remote 80 | d) Delete remote 81 | r) Rename remote 82 | c) Copy remote 83 | s) Set configuration password 84 | q) Quit config 85 | e/n/d/r/c/s/q> n 86 | 87 | name> b2crypt 88 | 89 | Storage> crypt 90 | 91 | remote> b2base:MYBUCKET/crypt 92 | 93 | filename_encryption> standard 94 | 95 | directory_name_encryption> true 96 | 97 | Password or pass phrase for encryption. 98 | y) Yes type in my own password 99 | g) Generate random password 100 | n) No leave this optional password blank 101 | y/g/n> g 102 | 103 | Bits> 256 104 | 105 | y/n> y 106 | 107 | Password or pass phrase for salt. Optional but recommended. 108 | Should be different to the previous password. 109 | y) Yes type in my own password 110 | g) Generate random password 111 | n) No leave this optional password blank 112 | y/g/n> g 113 | 114 | Bits> 256 115 | 116 | y/n> y 117 | ``` 118 | 119 | Now your config should look something like the following: 120 | 121 | ``` 122 | [b2base] 123 | type = b2 124 | account = **ACCOUNT** 125 | key = **KEY** 126 | 127 | [b2crypt] 128 | type = crypt 129 | remote = b2base:MYBUCKET/crypt 130 | filename_encryption = standard 131 | directory_name_encryption = true 132 | password = **PW2** 133 | password2 = **PW** 134 | ``` 135 | 136 | But you will want to encrypt that! 
123 |
124 | ## Why do you still support Python 2?
125 |
126 | When I first wrote PyFiSync, I didn't use Python 3! Part of that was because, at work, I was limited to Python 2, and part of it was just that I learned on Python 2. However, when I made the transition to Python 3 for my own stuff, I made PyFiSync compatible with both.
127 |
128 | At this point, it is pretty easy to maintain that. If, in the future, I want to use a Python 3-only feature, I will probably just drop Python 2.
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
--------------------------------------------------------------------------------
/rclone_b2.md:
--------------------------------------------------------------------------------
1 | # Rclone with B2 (and S3)
2 |
3 | (See the bottom for notes on using S3 instead of B2)
4 |
5 | B2 is a very inexpensive storage platform that is great for syncing. It does have a few limitations, notably the lack of server-side copy. As such, any "moves" are actually download+upload+delete. (This is no longer the case for newer rclone versions with the latest B2 API. Therefore, some of these steps may not be necessary).
6 |
7 | As such, we make two major changes to our configuration:
8 |
9 | 1. Download backups locally rather than move them to the backup dir
10 |     * Remote backups aren't strictly needed since the buckets can be set to keep old versions, but we will keep this here for consistency
11 | 2. Do not do any file moves! We *could* let rclone handle the moves but it is less transparent. See the S3 notes below, where this is not the best setting.
12 |
13 | Finally, another limitation, as noted in the readmes, relates to deleting a file and then renaming another in its place.
14 |
15 | See the notes at the end for the differences with the S3 setup.
16 |
17 | ## Plan
18 |
19 | We will use the *same* bucket to set up two different PyFiSync repositories. The first will be unencrypted and the second will be encrypted. We will have to adjust the settings accordingly.
20 |
21 | It is a good idea to use encryption if you do not trust the remote or want extra protection. In this example, we also password protect the config file but that is really *not* needed on your local machine.
22 |
23 | ## Rclone Setup
24 |
25 | ### Preliminaries
26 |
27 | Assume your local machine directory is `/path/to/PFS/`
28 |
29 |     $ cd /path/to/PFS/
30 |
31 | And that you have a bucket called `MYBUCKET`
32 |
33 | ### Set up B2 base version
34 |
35 | Create a local config file
36 |
37 |     $ rclone config --config rclone.cfg
38 |
39 | Create a new B2 remote
40 |
41 | ```rclone
42 | No remotes found - make a new one
43 | n) New remote
44 | s) Set configuration password
45 | q) Quit config
46 | n/s/q> n
47 |
48 | name> b2base
49 |
50 | Type of storage to configure.
51 | Storage> b2
52 |
53 | [...]
54 | ```
55 |
56 | The final config should look something like
57 |
58 | ```rclone
59 | [b2base]
60 | type = b2
61 | account = **ACCOUNT**
62 | key = **KEY**
63 | ```
64 |
65 | ### Set up rclone encrypted
66 |
67 | We *could* set up the encrypted remote in the same config file, but we will later encrypt that file. Since we also want the non-encrypted remote, it is easier to make a *copy*. Note that you will have to make changes in both files if you change a setting.
68 |
69 | It is **NOT vital** to encrypt the rclone config since you probably trust your own computer. If you do not want to encrypt it, then there is no reason to make a copy and you can skip the relevant sections.
70 |
71 | Follow these steps:
72 |
73 |     $ cp rclone.cfg rcloneCRYPT.cfg
74 |     $ rclone config --config rcloneCRYPT.cfg
75 |
76 | Then
77 |
78 | ```
79 | n) New remote
80 | d) Delete remote
81 | r) Rename remote
82 | c) Copy remote
83 | s) Set configuration password
84 | q) Quit config
85 | e/n/d/r/c/s/q> n
86 |
87 | name> b2crypt
88 |
89 | Storage> crypt
90 |
91 | remote> b2base:MYBUCKET/crypt
92 |
93 | filename_encryption> standard
94 |
95 | directory_name_encryption> true
96 |
97 | Password or pass phrase for encryption.
98 | y) Yes type in my own password
99 | g) Generate random password
100 | n) No leave this optional password blank
101 | y/g/n> g
102 |
103 | Bits> 256
104 |
105 | y/n> y
106 |
107 | Password or pass phrase for salt. Optional but recommended.
108 | Should be different to the previous password.
109 | y) Yes type in my own password
110 | g) Generate random password
111 | n) No leave this optional password blank
112 | y/g/n> g
113 |
114 | Bits> 256
115 |
116 | y/n> y
117 | ```
118 |
119 | Now your config should look something like the following:
120 |
121 | ```
122 | [b2base]
123 | type = b2
124 | account = **ACCOUNT**
125 | key = **KEY**
126 |
127 | [b2crypt]
128 | type = crypt
129 | remote = b2base:MYBUCKET/crypt
130 | filename_encryption = standard
131 | directory_name_encryption = true
132 | password = **PW2**
133 | password2 = **PW**
134 | ```
135 |
136 | But you will want to encrypt that!
137 |
138 |     $ rclone config --config rcloneCRYPT.cfg
139 |
140 | ```
141 | Name      Type
142 | ====      ====
143 | b2base    b2
144 | b2crypt   crypt
145 |
146 | e) Edit existing remote
147 | n) New remote
148 | d) Delete remote
149 | r) Rename remote
150 | c) Copy remote
151 | s) Set configuration password
152 | q) Quit config
153 | e/n/d/r/c/s/q> s
154 |
155 | a/q> a
156 |
157 | **enter password**
158 |
159 | Your configuration is encrypted.
160 | c) Change Password
161 | u) Unencrypt configuration
162 | q) Quit to main menu
163 | c/u/q> q
164 |
165 | e/n/d/r/c/s/q> q
166 | ```
167 |
168 | Now you have two configuration files for rclone. Again, you could do this with a single file, but it is nicer to not have to enter your password for the unencrypted remote while still keeping the config encrypted for the other.
169 |
170 | **BACKUP** the config file since, if you lose these machine-generated passwords, you will lose access to your files.
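As a quick sanity check (not part of the original walkthrough, but handy), you can list the remotes with rclone directly; with the encrypted config, rclone will prompt for the configuration password:

    $ rclone --config rclone.cfg lsd b2base:MYBUCKET
    $ rclone --config rcloneCRYPT.cfg ls b2crypt: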
171 |
172 | ## Set up PyFiSync
173 |
174 | ### Unencrypted
175 |
176 | These aren't *all* of the settings, but they should give the general idea
177 |
178 |     $ cd /path/to/PFS/
179 |     $ mkdir reg
180 |     $ cd reg
181 |     $ PyFiSync init --remote rclone
182 |
183 | Now edit the config. Again, this is not *all* of it. The comments are **not** the default comments from the documentation; they are my notes
184 |
185 | ```python
186 | pathB = 'b2base:MYBUCKET/reg'
187 |
188 | rclone_pw = False
189 |
190 | # B2 should use fast-list to reduce API calls. The rclone docs
191 | # (https://rclone.org/b2/) suggest 32 transfers. The config flag is needed to
192 | # specify where the configuration file we created above lives. Note that
193 | # rclone is always executed in the sync directory
194 |
195 | rclone_flags = ['--transfers', '32',
196 |                 '--fast-list',
197 |                 '--checkers','10',
198 |                 '--config','../rclone.cfg']
199 |
200 | # Do backups to the local machine since we can't move files with B2
201 | rclone_backup_local = True
202 |
203 | # One option is to never track a move on B2 (note that for S3 this is NOT the case):
204 | # move_attributesB = ['ino','path']
205 | # prev_attributesB = ['ino','path']
206 |
207 | # Here we instead use B2's SHA-1 support (this assignment is the one that takes effect):
208 | move_attributesB = ['hash.SHA-1']
209 | prev_attributesB = ['path']
210 |
211 | # Rclone does not like symlinks with B2 and their workaround appears broken
212 | # as of writing. See https://github.com/ncw/rclone/issues/3163
213 | copy_symlinks_as_links = False
214 |
215 | ```
216 |
217 | Notes:
218 |
219 | * We didn't specify it, but you may want to add the flag `--b2-hard-delete` since we are doing backups.
220 |
221 | Set up and then add some files
222 |
223 |     $ PyFiSync reset --force
224 |
225 | ...add some files and test
226 |
227 | ### Encrypted
228 |
229 | Most of this is the same but we will reproduce it all just to be sure
230 |
231 |     $ cd /path/to/PFS/
232 |     $ mkdir crypt
233 |     $ cd crypt
234 |     $ PyFiSync init --remote rclone
235 |
236 | Now edit the config. Again, this is not *all* of it. The comments are **not** the default comments from the documentation; they are my notes
237 |
238 | ```python
239 | pathB = 'b2crypt'
240 |
241 | # Make it ask each time. You can also either enter the password
242 | # here or choose to not encrypt the rclone config.
243 | rclone_pw = pwprompt()
244 |
245 | # B2 should use fast-list to reduce API calls. The rclone docs
246 | # (https://rclone.org/b2/) suggest 32 transfers. The config flag is needed to
247 | # specify where the configuration file we created above lives. Note that
248 | # rclone is always executed in the sync directory
249 |
250 | rclone_flags = ['--transfers', '32',
251 |                 '--fast-list',
252 |                 '--checkers','10',
253 |                 '--config','../rcloneCRYPT.cfg']
254 |
255 | # Do backups to the local machine since we can't move files with B2
256 | rclone_backup_local = True
257 |
258 | # As noted above, we only want to do a "move" when the file is unmodified
259 | # since, unlike rsync, rclone cannot make use of existing data
260 | move_attributesA = ['ino','mtime']
261 | prev_attributesA = ['ino','path']
262 |
263 | # Crypt does not support any hashes. No remote move tracking!
264 | # (which means you may redownload when not needed)
265 | move_attributesB = ['path']
266 | prev_attributesB = ['path']
267 |
268 | # Rclone does not like symlinks with B2 and their workaround appears broken
269 | # as of writing. See https://github.com/ncw/rclone/issues/3163
270 | copy_symlinks_as_links = False
271 |
272 | ```
273 |
274 | Notes:
275 |
276 | * We didn't specify it, but you may want to add the flag `--b2-hard-delete` since we are doing backups.
277 |
278 | Set up and then add some files
279 |
280 |     $ PyFiSync reset --force
281 |
282 | Now you should be good to go! You will get some "untracked file" warnings on the first sync for files that are not yet on both sides.
283 |
284 | ## S3 Notes
285 |
286 | The same process can be used for S3-based backends with a few changes.
287 |
288 | * S3 supports server-side copy so you *can* worry less about doing backups on the remote side. However, rclone still cannot make use of existing data, so there is still no reason to "move" a modified file.
289 | * Since S3 supports server-side copy, it also behooves us to track moves. Comment out the restrictive *local* settings from the config above:
290 |
291 |         # As noted above, we only want to do a "move" when the file is unmodified
292 |         # since, unlike rsync, rclone cannot make use of existing data
293 |         # move_attributesA = ['ino','mtime']
294 |         # prev_attributesA = ['ino','path']
295 |
296 | * S3 supports MD5 hashes, not SHA-1, so use `hash.MD5` rather than `hash.SHA-1` in the attributes above.
297 |
298 |
299 |
300 |
301 |
302 |
303 |
304 |
305 |
306 |
307 |
308 |
309 |
310 |
311 |
312 |
313 |
314 |
315 |
316 |
317 |
318 |
319 |
320 |
321 |
322 |
323 |
324 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PyFiSync
2 |
3 | Python (+ rsync or rclone) based intelligent file sync with automatic backups and file move/delete tracking.
4 |
5 | ## Features
6 |
7 | * Robust tracking of file moves
8 |     * Especially powerful on macOS, but works well enough on Linux.
9 | * rsync Mode:
10 |     * Works out of the box with Python (tested on 2.7 and 3.5+) for rsync
11 |     * Works over SSH for secure and easy connections
12 |     * Uses rsync for actual file transfers to save bandwidth and make use of existing file data
13 | * [rclone][rclone] mode: (beta!)
14 |     * Can connect to a wide variety of cloud services and offers encryption
15 |     * Note that rclone is still supported and works but **it is better to use** [syncrclone](https://github.com/Jwink3101/syncrclone) instead.
16 |     * rclone support may be deprecated in the future!
17 | * Extensively tested for a **huge** variety of edge cases
18 |
19 | ## Details
20 |
21 | PyFiSync uses a small database of files from the last sync to track moves and deletions (based on configurable attributes such as inode numbers, SHA-1 hashes, and/or create time). It then compares `mtime` from both sides on all files to decide on transfers.
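As a conceptual sketch (not the actual implementation), detecting a move amounts to matching an entry from the previous file list against a new entry on the configured move attributes while the path differs:

```python
# Hypothetical entries from the previous and current file lists
prev = {'path': 'photos/old_name.jpg', 'ino': 101, 'birthtime': 1.5e9}
curr = {'path': 'photos/new_name.jpg', 'ino': 101, 'birthtime': 1.5e9}

move_attributes = ['ino', 'birthtime']  # e.g. the macOS suggestion below

if (all(prev[a] == curr[a] for a in move_attributes)
        and prev['path'] != curr['path']):
    print('move detected:', prev['path'], '->', curr['path'])
```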
22 |
23 | ### Backups
24 |
25 | By default, any time a file is to be overwritten or deleted, it is backed up on that machine first. No distinction is made in the backup for overwrite vs delete.
26 |
27 | ### Attributes
28 |
29 | Moves and deletions are tracked via the attributes described below.
30 |
31 | Move attributes are used to track whether a file has moved, while the `prev_attributes` are used to determine whether a file is the same as before.
32 |
33 | Note: On HFS+ (and maybe APFS?), macOS's file system, inodes are not reused quickly. On ext3 (Linux) they are recycled rapidly, leading to issues when files are deleted and new ones are made. Do not use inodes alone on such systems.
34 |
35 | #### Common attributes
36 |
37 | * `path` -- This essentially means that moves are not tracked. If a file has the same name, it is considered the same file
38 | * `size` -- File size. Do not use alone. Also, this attribute means that the file may not change between moves. See examples below
39 | * `mtime` -- When the file was modified. Use with `ino` to track files
40 |
41 | #### rsync and local attributes
42 |
43 | Attributes for the local machine and an rsync remote
44 |
45 | * `ino` (inode number) -- Track the filesystem inode number. May be safely used alone on HFS+ but not on ext3 since it reuses inodes. In that case, use with another attribute
46 | * hashes -- Very robust for tracking file moves but, like `size`, requires that the file not change. Also, slow to calculate (though, by default, they are not recalculated on every sync). Options:
47 |     * `adler` -- Fast but less secure
48 |     * `dbhash` -- Used for Dropbox. Useful if comparing on hash
49 |     * any `hashlib.algorithms_guaranteed`: `sha384`,`sha3_224`,`sha3_512`,`md5`,`sha512`,`sha3_256`,`blake2b`,`sha3_384`,`shake_128`,`blake2s`,`sha256`,`shake_256`,`sha1`,`sha224`
50 | * `birthtime` -- Use the file create time. This does not exist on some Linux machines or some Python implementations (PyPy), and/or is unreliable
51 |
52 | #### rclone attributes
53 |
54 | * `hash.HASH` -- Use a hash from rclone. Depends on which hashes are available.
55 |
56 | #### Suggested move Attribute Combinations
57 |
58 | For rsync:
59 |
60 | * On macOS, the following is suggested: `[ino,birthtime]`
61 | * On Linux, the following is suggested: `[ino,mtime]`
62 |     * This means that **moved files should not be modified** on that side of the sync.
63 |
64 | #### Hashes
65 |
66 | As noted, any `hashlib.algorithms_guaranteed` is supported for rsync mode and the local machine. In order to save time, a database of previously computed hashes is kept. This can be turned off in the config, forcing all of the files to be read and hashed again.
67 |
68 |
69 | ### Empty Directories
70 |
71 | PyFiSync syncs files and therefore will *not* sync empty directories from one machine to the other. However, if, and only if, a directory is *made* empty by the sync, it will be deleted. That includes nested directories. In rclone mode, empty directories are not handled at all by PyFiSync.
72 |
73 | ## Install
74 |
75 | There are *no dependencies!* (for rsync). Everything is included in the package (though `ldtable`, now `DictTable`, is also separately developed [here](https://github.com/Jwink3101/ldtable))
76 |
77 | To install:
78 |
79 |     $ python -m pip install git+https://github.com/Jwink3101/PyFiSync
80 |
81 | Or download the zip file and run
82 |
83 |     $ python setup.py install
84 |
85 | If using the rclone remote (see setup below), you must also install rclone.
86 |
87 | Note: On the remote machine, the path to PyFiSync must be findable via SSH. For example, if your Python is from (Ana/Mini)conda, then it places the paths into the `.bash_profile`. Move the paths to `.bashrc` so that PyFiSync can be found.
88 |
89 | Alternatively, specify `remote_exe`.
90 |
91 | ## Setup
92 |
93 | See [rsync](rsync.md) for setup of the default mode. PyFiSync must be installed on both machines (or the Python scripts must be there and configured).
94 |
95 | Setting up rclone is a bit more involved since you must set up an appropriate rclone remote. See the [rclone readme](rclone.md) for general details and [rclone\_b2](rclone_b2.md) for a detailed walkthrough of setting up with B2 (and S3 with small noted changes).
96 |
97 | To initiate an rclone-based repo, do
98 |
99 |     $ PyFiSync init --remote rclone
100 |
101 | ## Settings
102 |
103 | There are many settings, all documented in the config file written after an `init`. Here are a few:
104 |
105 | ### Exclusions
106 |
107 | Exclusion naming is done in such a way that it replicates a *subset* of `rsync` exclusions. That is, the following patterns are what **this** code follows. `rsync` has its own exclusion engine which is more advanced but should behave similarly.
108 |
109 | * If an item ends in `/` it is a folder exclusion
110 | * If an item starts with `/` it is a full path relative to the root
111 | * Wildcards and other patterns are accepted
112 |
113 | | Pattern  | Meaning                            |
114 | |----------|------------------------------------|
115 | | `*`      | matches everything                 |
116 | | `?`      | matches any single character       |
117 | | `[seq]`  | matches any character in `seq`     |
118 | | `[!seq]` | matches any character not in `seq` |
119 |
120 | Examples:
121 |
122 | * Exclude **all** git directories: `.git/`
123 | * Exclude a specific folder: `/path/to/folder/` (where `/` is the start of the sync directory)
124 | * Exclude all files that start with `file`: `file*`
125 | * Exclude all files that start with `file` in a specific directory: `/path/to/file*`
126 |
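Since the matching is implemented with Python's `fnmatch` (see `PFSwalk.py`), you can test a pattern directly; a small illustration:

```python
import fnmatch

fnmatch.fnmatch('file_v2.txt', 'file*')  # True
fnmatch.fnmatch('myfile.txt', 'file*')   # False: must match from the start
fnmatch.fnmatch('a.txt', '[!b]*')        # True: first character is not 'b'
```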
127 |
128 | #### Exclude if Present
129 |
130 | PyFiSync allows for the exclusion of a directory due to the presence of a specified file name (the contents of the file do not matter, only its presence).
131 |
132 | Unlike regular exclusions, which halt traversing deeper into an excluded directory tree, `exclude_if_present` is a filter applied after the fact. This approach is safer as adding an exclusion file on one side will not cause a delete to be incorrectly propagated. It does come at a small performance penalty as the excluded directory is still initially traversed.
133 |
134 | ### Symlinks
135 |
136 | First note that **all directory links are followed** regardless of setting. Use exclusions to avoid syncing a linked directory.
137 |
138 | If `copy_symlinks_as_links=False`, symlinked files sync their referent (and rsync uses `-L`). If `True` (default), symlinks copy the link itself (a la how git works).
139 |
140 | WARNINGS:
141 |
142 | * If `copy_symlinks_as_links = False` and there are symlinked files pointing to other files IN the sync root, there will be issues with the file tracking. Do not do this!
143 | * As also noted in Python's documentation, there is no **safeguard against recursively symlinked directories**.
144 | * rsync may throw warnings for broken links
145 | * rclone's support of symlinks is unreliable at the moment.
146 |
147 |
148 | ### Pre and Post Bash
149 |
150 | There is also the option to add bash scripts to run pre and post sync. These may be useful if you wish to do a git push, pull, etc., either remotely or locally.
151 |
152 | They are ALWAYS executed from the sync root (a `cd /path/to/syncroot` is inserted at the top of the script).
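For instance, a hypothetical post-sync script (adapt to your own workflow) could commit whatever the sync changed:

```bash
# Runs from the sync root; `|| true` keeps an empty commit from failing the script
git add -A
git commit -m "post-sync auto-commit" || true
git push
```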
153 |
154 | ## Running Tests
155 |
156 | To run the tests, in bash, do:
157 |
158 |     $ source run_test.sh
159 |
160 | In addition to testing a whole slew of edge cases, it will also test all actions on a local sync, and remote syncs to both Python 2 and Python 3 (via `ssh localhost`). The run script will try to call `py.test` for both versions of Python locally.
161 |
162 | ## Known Issues and Limitations
163 |
164 | The test suite is **extremely** extensive so as to cover tons of different and difficult scenarios. See the tests for further exploration of how the code handles these cases. Please note that unless explicitly disabled in the config or with a command-line flag, all deletions and (future) overwrites first perform a backup. Moves are not backed up but may likely be unwound from the logs.
165 |
166 | A few notable limitations are as follows:
167 |
168 | * Symlinks are followed (optionally) but if the file they are linking to is also in the sync folder, it may confuse the move tracking
169 | * File move tracking:
170 |     * A file moved to a new name that is excluded will propagate as deleted. This is expected since the code no longer has a way to "see" the file on the one side.
171 |     * A file that is moved on one side and deleted on the other will NOT have the deletion propagated, regardless of modification
172 | * Sync is based on modification-time metadata. This is fairly robust but could still have issues. In rsync mode, even if PyFiSync decides to sync the files, rsync may end up just updating the metadata. In that case, you may just want to disable backups. With rclone, it depends on the remote and care should be taken.
173 |
174 | There is also a potential issue with the test suite. In order to ensure that the files are noted as changed (since they are all modified so quickly), the times are often adjusted by some random amount. There is a *small* chance some tests could fail due to a small number not changing. Running the tests again should pass.
175 |
176 | See the [rclone readme](rclone.md) for some rclone-related known issues
177 |
178 | ## Other Questions
179 |
180 | See the (growing) [FAQ](FAQs.md) for some more details and/or troubleshooting
181 |
182 | [rclone]:https://rclone.org/
183 |
184 |
185 |
--------------------------------------------------------------------------------
/PyFiSync/PFSwalk.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | """
4 | Main tool for walking a directory that is tuned to PyFiSync's needs and
5 | uses scandir if it can!
6 | """
7 |
8 | from __future__ import division, print_function, unicode_literals
9 | from io import open
10 |
11 | import sys
12 | import os
13 | import fnmatch
14 | import subprocess
15 | import json
16 | import time
17 |
18 | try:
19 |     from os import scandir as _scandir
20 | except ImportError:
21 |     try:
22 |         from scandir import scandir as _scandir
23 |     except ImportError:
24 |         _scandir = None
25 |
26 | try:
27 |     from itertools import imap as map
28 |     from itertools import izip as zip
29 | except ImportError: # python 3
30 |     pass
31 |
32 | from itertools import repeat
33 | from functools import partial
34 |
35 | from .
import utils 36 | from .dicttable import DictTable 37 | 38 | def fnmatch_mult(name,patterns): 39 | """ 40 | will return True if name matches any patterns 41 | """ 42 | return any(fnmatch.fnmatch(name,pat) for pat in patterns) 43 | 44 | 45 | class file_list: 46 | def __init__(self,path,config,log, 47 | attributes=(), 48 | empty='store',use_hash_db=True): 49 | """ 50 | Main interface to the walk. the final list will 51 | be in the attribute file_list. 52 | 53 | Empty: 54 | 'store': stores a list of empty directories 55 | 'remove': Deletes all empty directories if (and only if) they 56 | were *not* empty before. Also removes stored list 57 | 'reset': Removes stored list 58 | """ 59 | self.path = path 60 | self.config = config 61 | self.log = log 62 | self.attributes = attributes 63 | self.use_hash_db = use_hash_db 64 | 65 | self.empty = empty 66 | self.empties = set() 67 | 68 | self._set_exclusions() 69 | 70 | def files(self,parallel=False): 71 | """ 72 | Process the files. 73 | if parallel is False or <= 1 will run hashes serially 74 | Otherwise specify True to use all cores or specify a number 75 | """ 76 | 77 | # The hash_db is essentially the same as the filelist but it does 78 | # *not* need to be the latest a certain sync-pair has seen. It should 79 | # just be the latest parse of the files. The general idea is that 80 | # if the ['mtime','path','size'] are identical, no need to recalculate 81 | # the sha1 or adler of the file. 82 | 83 | self.hashes = any(a in utils.HASHFUNS for a in self.attributes) 84 | 85 | if self.hashes: 86 | self.load_hash_db() 87 | 88 | if parallel and self.hashes: 89 | import multiprocessing as mp 90 | if not isinstance(parallel,int): 91 | parallel = mp.cpu_count() 92 | pool = mp.Pool(parallel) 93 | _map = partial(pool.imap,chunksize=10) 94 | else: 95 | pool = None 96 | _map = map 97 | 98 | # Set this up as a chain of generators 99 | items = self._walk(self.path) # Tuples of DirEnty,rootpath 100 | items = map(self._file_info,items) # Dictionaries 101 | 102 | ## Here is where we add hashes 103 | for attribute in self.attributes: 104 | if attribute in utils.HASHFUNS: 105 | items = _map(partial(self.add_hash,hashname=attribute),zip(items,repeat(self.path))) 106 | 107 | # Run it! 108 | result = list(items) 109 | 110 | if pool is not None: 111 | pool.close() 112 | self.process_empty() 113 | 114 | if self.hashes: 115 | self.save_hash_db(result) 116 | 117 | return result 118 | def process_empty(self): 119 | """ 120 | Process empties based on self.empty and self.empties 121 | """ 122 | empty_path = os.path.join(self.path,'.PyFiSync','empty_dirs') 123 | if self.empty == 'reset': 124 | try: 125 | os.remove(empty_path) 126 | except OSError: 127 | pass 128 | elif self.empty == 'store': 129 | try: 130 | os.makedirs(os.path.dirname(empty_path)) 131 | except OSError: 132 | pass 133 | empties = list(self.empties) 134 | with open(empty_path,'wt',encoding='utf8') as fobj: 135 | fobj.write(utils.to_unicode(json.dumps(empties,ensure_ascii=False))) 136 | elif self.empty == 'remove': 137 | with open(empty_path,'rt') as fobj: 138 | prev = set(json.loads(fobj.read())) 139 | # Remove the remaining empty dirs 140 | empties = self.empties - prev # both sets 141 | 142 | # Loop through empty dirs and remove. 
Do it longest to shortest so 143 | # that nested empty dirs are removed 144 | empties = sorted(empties,key=lambda a: (-len(a),a.lower())) 145 | for empty_dir in empties: 146 | try: 147 | os.removedirs(empty_dir) 148 | except OSError: 149 | pass # May just be empty due to exclusion 150 | 151 | def _set_exclusions(self): 152 | """ 153 | Set up and control exclusion 154 | 155 | Note that we separate those with glob patterns from those without 156 | so that those without can be checked via O(1) set while those with 157 | will go through fnmatch. If there is a false positive for glob, 158 | it won't change the final outcome; it will just be slower on that 159 | single run. 160 | """ 161 | GLOBS = '*?[]!' 162 | self.all_excludes = set(self.config.excludes) 163 | 164 | self.exclude_file_full = set() 165 | self.exclude_file = set() 166 | 167 | # non-glob patters are done separately since the exclude can be checked faster 168 | # using a set `in` O(1) instead of fnmatch O(n) (or worse) 169 | self.exclude_file_full_no_glob = set() 170 | self.exclude_file_no_glob = set() 171 | 172 | 173 | self.exclude_dirs_full = set() 174 | self.exclude_dirs = set() 175 | 176 | 177 | for e in self.all_excludes: 178 | e = utils.to_unicode(e) 179 | if e.startswith('/'): # Full 180 | if e.endswith('/'): 181 | self.exclude_dirs_full.add(e) 182 | elif any(g in e for g in GLOBS): 183 | self.exclude_file_full.add(e) 184 | else: 185 | self.exclude_file_full_no_glob.add(e) 186 | else: 187 | if e.endswith('/'): 188 | self.exclude_dirs.add(e) 189 | elif any(g in e for g in GLOBS): 190 | self.exclude_file.add(e) 191 | else: 192 | self.exclude_file_no_glob.add(e) 193 | 194 | def _walk(self,path,_d=0): 195 | """ 196 | Yields tuples of (DirEntry,relpath) since relpath is already computed 197 | and avoids recompute 198 | """ 199 | if path.endswith('/'): 200 | path = path[:-1] # Remove trailing / so we can avoid os.path.join 201 | 202 | 203 | # We only care if anything was returned; directory or file 204 | # Note that based on the nested nature of this, directories are trans- 205 | # versed deep first so if the end is empty (and nothing from them was 206 | # returned) to empty will propagate upwards and will be deleted 207 | # if applicable. 208 | no_returns = True 209 | for item in scandir(path): 210 | 211 | itemname = utils.to_unicode(item.name) 212 | relpath = _relpath(item.path,self.path) 213 | if item.is_dir(follow_symlinks=True): # Always follow directory links 214 | 215 | if fnmatch_mult(itemname +'/',self.exclude_dirs): 216 | continue 217 | 218 | if fnmatch_mult('/'+relpath +'/',self.exclude_dirs_full): 219 | continue 220 | 221 | for subitem in self._walk(item.path,_d=_d+1): 222 | no_returns = False 223 | yield subitem 224 | 225 | elif item.is_file(): 226 | if itemname in self.exclude_file_no_glob: 227 | continue 228 | 229 | if '/'+relpath in self.exclude_file_full_no_glob: 230 | continue 231 | 232 | if fnmatch_mult(itemname,self.exclude_file): 233 | continue 234 | 235 | if fnmatch_mult('/'+relpath,self.exclude_file_full): 236 | continue 237 | 238 | no_returns = False 239 | yield item,relpath 240 | 241 | elif item.is_symlink(): # Must be broken! 242 | self.log.add_err('ERROR: Could not find information on {}\n'.format(relpath) + 243 | ' May be a BROKEN link. Skipping\n') 244 | 245 | 246 | # Was it empty? Note that if there is nothing returned because 247 | # of exclusions, it is still considered empty. 
248 | if no_returns: 249 | self.empties.add(path) 250 | 251 | def _file_info(self,item_relpath): 252 | item,relpath = item_relpath 253 | 254 | stat_attributes = ['ino','size','mtime','birthtime'] 255 | file = {'path':relpath} 256 | 257 | 258 | follow_symlinks = not self.config.copy_symlinks_as_links 259 | 260 | try: 261 | stat = item.stat(follow_symlinks=follow_symlinks) 262 | except OSError as E: 263 | self.log.add_err('\n' + 264 | 'ERROR: Could not find information on {}\n'.format(relpath) + 265 | ' May be a BROKEN link.\n MSG: {}\nskipping...\n'.format(E)) 266 | return 267 | 268 | for attrib in stat_attributes: 269 | try: 270 | file[attrib] = getattr(stat,'st_'+attrib) 271 | except AttributeError: 272 | file[attrib] = 0.0 273 | 274 | # if it cannot get mtime, set to future: 275 | if file['mtime'] == 0: file['mtime'] = time.time()+3600 276 | 277 | return file 278 | 279 | def filter_old_list(self,old_list): 280 | """ 281 | Use the exclusions to filter the old lists 282 | """ 283 | out_list = [] 284 | for ix,file in enumerate(old_list): 285 | dirname,filename = os.path.split(file['path']) 286 | 287 | # file name only -- w/o glob 288 | if filename in self.exclude_file_no_glob: 289 | continue 290 | 291 | # file name only -- w/ glob 292 | if fnmatch_mult(filename,self.exclude_file): 293 | continue 294 | 295 | # Full file 296 | fullfile = '/' + file['path'] 297 | 298 | if fullfile in self.exclude_file_full_no_glob: 299 | continue 300 | 301 | if fnmatch_mult(fullfile,self.exclude_file_full): 302 | continue 303 | 304 | # Dirnames. Need to test the full build up 305 | 306 | #dname = [] 307 | 308 | # dirname only -- test 309 | dname_list = [] 310 | dflag = False 311 | for dname in dirname.split('/'): 312 | dname_list.append(dname) 313 | if fnmatch_mult(dname + '/',self.exclude_dirs): 314 | dflag = True 315 | break 316 | # Full dir 317 | if fnmatch_mult('/'+'/'.join(dname_list)+'/',self.exclude_dirs_full): 318 | dflag = True 319 | break 320 | if dflag: 321 | continue 322 | 323 | out_list.append(file) 324 | return out_list 325 | 326 | 327 | def add_hash(self,file_rootpath,hashname=None): 328 | """ 329 | Add the hash but check the db first. Note that if use_hash_db=False, 330 | the load_hash_db made it empty so we won't find it in the query. 
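The db is keyed on mtime/path/size, so if any of those changed the hash is recomputed from the file contents.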
331 | """ 332 | file,rootpath = file_rootpath 333 | 334 | query = {k:file[k] for k in ['mtime','path','size']} 335 | dbitem = self.hash_db.query_one(**query) 336 | 337 | if dbitem and hashname in dbitem: 338 | file[hashname] = dbitem[hashname] 339 | else: 340 | fullpath = os.path.join(rootpath,file['path']) 341 | file[hashname] = utils.HASHFUNS[hashname](fullpath) 342 | 343 | return file 344 | 345 | 346 | def load_hash_db(self): 347 | hash_db = list() 348 | 349 | hash_path = os.path.join(self.path,'.PyFiSync','hash_db.json') 350 | if self.use_hash_db and os.path.exists(hash_path): 351 | with open(hash_path,'rt',encoding='utf8') as F: 352 | hash_db = json.loads(F.read()) 353 | 354 | self.hash_db = DictTable(hash_db,fixed_attributes=['mtime','path','size']) 355 | 356 | def save_hash_db(self,files): 357 | if not self.use_hash_db: 358 | return 359 | hash_path = os.path.join(self.path,'.PyFiSync','hash_db.json') 360 | try: 361 | os.makedirs(os.path.dirname(hash_path)) 362 | except OSError: 363 | pass 364 | with open(hash_path,'wt',encoding='utf8') as F: 365 | F.write(utils.to_unicode(json.dumps(files))) 366 | 367 | def _relpath(*A,**K): 368 | """ 369 | Return the results of os.relpath but remove leading ./ 370 | """ 371 | res = os.path.relpath(*A,**K) 372 | res = utils.to_unicode(res) 373 | if res.startswith('./'): 374 | return res[2:] 375 | if res == '.': 376 | return '' 377 | return res 378 | 379 | 380 | def exclude_if_present(filesA,filesB,exclude_filename): 381 | """ 382 | Apply a filter to filesA and filesB to exclude any files below 383 | exclude_if_present files 384 | 385 | filesA and filesB are assumed to be DictTables and this happens in place 386 | """ 387 | exclude_dirs = set() 388 | for file in list(filesA) + list(filesB): 389 | path = file['path'] 390 | dirname,filename = os.path.split(path) 391 | if filename == exclude_filename: 392 | exclude_dirs.add(dirname) 393 | 394 | for exclude_dir in exclude_dirs: 395 | try: 396 | filesA.remove(filesA.Q.filter(lambda a:a['path'].startswith(exclude_dir))) 397 | except ValueError: 398 | pass 399 | try: 400 | filesB.remove(filesB.Q.filter(lambda a:a['path'].startswith(exclude_dir))) 401 | except: 402 | pass 403 | 404 | 405 | 406 | def scandir(path,force_listdir=False): 407 | if _scandir is not None and not force_listdir: 408 | for item in _scandir(path): 409 | yield item 410 | 411 | else: 412 | for item in os.listdir(path): 413 | fullpath = os.path.join(path,item) 414 | yield fake_DirEntry(fullpath) 415 | 416 | 417 | class fake_DirEntry(object): 418 | """ 419 | Fake DirEntry object. 420 | 421 | Will be used by backup scandir 422 | """ 423 | # Use __slots__ for better memory 424 | __slots__ = ('path','name','_lstat','_stat','_is_dir','_is_symlink') 425 | 426 | def __init__(self,path): 427 | self.path = path 428 | self.name = os.path.basename(path) 429 | 430 | self._stat = None 431 | self._lstat = None 432 | self._is_dir = None 433 | self._is_symlink = None 434 | 435 | def inode(self,follow_symlinks=True): 436 | """ 437 | The main object doesn't seem to be clear on whether or not 438 | if follows sym links. I added it but call stat first!!! 
439 | """ 440 | if self._stat is None: 441 | self.stat(follow_symlinks=follow_symlinks) 442 | 443 | return self._stat.st_ino 444 | 445 | def is_dir(self,follow_symlinks=True): 446 | if self.is_symlink() and not follow_symlinks: 447 | return False # Symlinks are NEVER dirs when follow_symlinks is False 448 | 449 | if self._is_dir is None: 450 | self._is_dir = os.path.isdir(self.path) 451 | return self._is_dir 452 | 453 | def is_file(self,follow_symlinks=True): 454 | # Make sure it is not a broken link b/c DirEntry will 455 | # tell you both false for file and dir 456 | if self.is_symlink(): 457 | try: 458 | self.stat(follow_symlinks=True) 459 | except OSError: 460 | return False # Broken link 461 | 462 | return not self.is_dir(follow_symlinks=follow_symlinks) 463 | 464 | def stat(self,follow_symlinks=True): 465 | if follow_symlinks: 466 | if self._stat is None: 467 | self._stat = os.stat(self.path) 468 | return self._stat 469 | else: 470 | if self._lstat is None: 471 | self._lstat = os.lstat(self.path) 472 | return self._lstat 473 | 474 | 475 | def is_symlink(self): 476 | if self._is_symlink is None: 477 | self._is_symlink = os.path.islink(self.path) 478 | return self._is_symlink 479 | 480 | def __str__(self): 481 | return '<{0}: {1!r}>'.format(self.__class__.__name__, self.name) 482 | __repr__ = __str__ 483 | 484 | 485 | 486 | 487 | 488 | -------------------------------------------------------------------------------- /PyFiSync/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Collection of utilities I have written along the way that may be useful 4 | 5 | parallel_map -- Improved multiprocessing.map. See references therein 6 | """ 7 | from __future__ import division, print_function, unicode_literals, absolute_import 8 | 9 | import hashlib 10 | import os 11 | import sys 12 | import datetime 13 | import re 14 | import zlib 15 | from io import open 16 | import itertools 17 | import argparse 18 | import copy 19 | from threading import Thread 20 | import getpass 21 | from functools import partial 22 | 23 | try: 24 | from queue import Queue 25 | except ImportError: 26 | from Queue import Queue 27 | 28 | if sys.version_info >= (3,): 29 | unicode = str 30 | xrange = range 31 | 32 | class logger(object): 33 | def __init__(self,path=None,silent=False): 34 | 35 | self.silent = silent 36 | self.path = path 37 | 38 | if path is not None: 39 | filepath = os.path.abspath(os.path.join(path,'.PyFiSync','logs')) 40 | self.path = filepath 41 | try: 42 | os.makedirs(filepath) 43 | except OSError: 44 | pass # Already exists 45 | self.filepath = os.path.join(filepath, 46 | datetime.datetime.now().strftime('%Y-%m-%d_%H%M%S') + '.log') 47 | 48 | # Write the file with nothing but this will overwrite it 49 | with open(self.filepath,'w') as F: 50 | F.write(' ') 51 | 52 | 53 | self.space = 0 54 | self.prepend = '' 55 | def add(self,text,end=u'\n',return_out=False): 56 | 57 | if text is None: 58 | return 59 | text = text.split('\n') 60 | out = [] 61 | for line in text: 62 | out.append(self.prepend + ' '*self.space + line) 63 | 64 | out = '\n'.join(out) 65 | out = to_unicode(out) 66 | 67 | if not self.silent: 68 | try: 69 | print(out,end=end) 70 | except UnicodeEncodeError: # This is a bit hacky but I think there are issues with remove queues and printing 71 | print(out.encode('utf-8'),end=end.encode('utf8')) 72 | 73 | out = out.encode('utf-8') 74 | 75 | if self.path is not None: 76 | with open(self.filepath,'ba') as F: 77 | F.write(out + 
end.encode('utf-8')) 78 | 79 | if return_out: 80 | return out 81 | 82 | def add_err(self,*A,**K): 83 | """ 84 | Same as add except that it will write it to stderr instead of with 'print' 85 | """ 86 | silent0 = self.silent 87 | self.silent = True 88 | 89 | out = self.add(return_out=True,*A,**K) 90 | 91 | self.silent = silent0 92 | 93 | # Now print it to stderr (even if silent!) 94 | end = K.get('end',u'\n') 95 | out = to_unicode('\n') + to_unicode(out) + to_unicode(end) 96 | try: 97 | sys.stderr.write(out) 98 | except UnicodeEncodeError: # This is a bit hacky but I think there are issues with remove queues and printing 99 | sys.stderr.write(out.encode('utf-8')) 100 | 101 | def add_close(self): 102 | if self.path is None: 103 | return 104 | self.line() 105 | self.add('Log saved in {path:s}'.format(path=self.filepath)) 106 | 107 | def line(self): 108 | self.add('='*50,end='\n') 109 | 110 | class configparser(object): 111 | """This will eventually be the configuration""" 112 | default_path = os.path.join(os.path.dirname(__file__),'config_template.py') 113 | def __init__(self,sync_dir=None,remote=None): 114 | 115 | self.sync_dir = sync_dir 116 | 117 | # These must be a lists! 118 | self._listattr = ['move_attributesA','move_attributesB', 119 | 'prev_attributesA','prev_attributesB', 120 | 'excludes'] 121 | 122 | # These must be changed from the defaults (They are not parsed in 123 | # defaults and must be set later) 124 | self._reqattr = ['pathB'] 125 | if sync_dir is not None: 126 | # We parse the input twice since we want to know the remote before 127 | # parsing defaults. But, we do not want to prompt for a password 128 | # twice so we tell it to ignore it here 129 | _tmp = self.parse(getdict=True,pw=False) 130 | self.pathA = os.path.abspath(sync_dir) 131 | if 'remote' not in _tmp: 132 | print('ERROR: Must specify a remote. Must update config file for PyFiSync',file=sys.stderr) 133 | sys.exit(2) 134 | self.parse_defaults(remote=_tmp['remote']) 135 | self.parse() 136 | else: 137 | self.parse_defaults(remote=remote) 138 | 139 | # Some special things 140 | self.excludes = list(set(self.excludes + ['.PBrsync/','.PyFiSync/'])) 141 | 142 | def parse_defaults(self,remote=None): 143 | """ 144 | Parse all defaults from the template except those in self._reqattr 145 | """ 146 | config = dict() 147 | try: 148 | with open(self.default_path,'rt') as F: 149 | txt = F.read() 150 | except: 151 | # This is a hack for when it is in an egg file. I need to figure 152 | # out a better way 153 | import zipfile 154 | _zf = zipfile.ZipFile(self.default_path[:self.default_path.find('/PyFiSync/config_template.py')]) 155 | txt = _zf.read('PyFiSync/config_template.py') 156 | txt = to_unicode(txt) 157 | 158 | if remote is None: 159 | remote = 'rsync' 160 | txt = self._filterconfig(txt,remote=remote) 161 | 162 | exec_(txt,config) 163 | 164 | for key,val in config.items(): 165 | # Unike `parse`, there is no need to check for lists since 166 | # this isn't user code 167 | if key in self._reqattr: 168 | continue 169 | setattr(self,key,val) 170 | 171 | @property 172 | def configpath(self): 173 | for ext in ['','.py']: 174 | config_path = os.path.join(self.sync_dir,'.PyFiSync','config'+ext) 175 | if os.path.exists(config_path): 176 | break 177 | else: 178 | sys.stderr.write('ERROR Could not find config file. 
Did you run `init`?\n') 179 | sys.exit(2) 180 | return config_path 181 | 182 | def parse(self,getdict=False,pw=True): 183 | none = lambda *a,**k:None 184 | config = dict(pwprompt=getpass.getpass if pw else none) 185 | with open(self.configpath,'rt') as F: 186 | txt = F.read() 187 | # Make sure this is executed from the base path 188 | # but then switch back just in case it messes up other stuff 189 | pwd0 = os.getcwd() 190 | os.chdir(self.sync_dir) 191 | exec_(txt,config) 192 | os.chdir(pwd0) 193 | 194 | if getdict: 195 | return config 196 | 197 | for key,val in config.items(): 198 | if key in self._listattr and not isinstance(val,list): 199 | if isinstance(val,(set,tuple)): 200 | val = list(val) 201 | else: 202 | val = [val] 203 | setattr(self,key,val) 204 | 205 | # Some minor adjustments 206 | self.mod_resolution = float(self.mod_resolution) 207 | # Aliases 208 | if hasattr(self,'symlinks'): 209 | self.copy_symlinks_as_links = not self.symlinks 210 | 211 | @classmethod 212 | def config_example(cls,remote='rsync'): 213 | """ Return an example configuration""" 214 | with open(cls.default_path,'rt') as F: 215 | config = F.read() 216 | return configparser._filterconfig(config,remote=remote) 217 | 218 | @staticmethod 219 | def _filterconfig(config,remote='rsync'): 220 | # remove anything that is not part of this remote 221 | from .remote_interfaces import REMOTES 222 | if remote not in REMOTES: 223 | raise ValueError('Not a valid remote') 224 | for rem in REMOTES: 225 | if rem == remote: 226 | repr = r'\1' 227 | else: 228 | repr = '' 229 | regex = r'^#[\ \t]*?\(.*?)#[\ \t]*?\<[\/\\]FLAG\>'.replace('FLAG',rem) 230 | config = re.sub(regex,repr,config,flags=re.MULTILINE|re.DOTALL) 231 | 232 | # Remove more than one empty line from the above replacements 233 | # This can be done with regex but easier with python. 234 | config = '\n'.join(c.rstrip() for c in config.split('\n')) # All empty lines are blank 235 | while '\n\n\n' in config: 236 | config = config.replace('\n\n\n','\n\n') 237 | return config 238 | 239 | def hashlibhash(filepath,BLOCKSIZE=1*1024**2,name='sha1'): 240 | """ 241 | http://pythoncentral.io/hashing-files-with-python/ 242 | 243 | 1024*1024: 1 mb 244 | 4*1024: 4 kb 245 | 246 | """ 247 | hasher = hashlib.new(name) 248 | with open(filepath, 'rb') as afile: 249 | buf = afile.read(BLOCKSIZE) 250 | while len(buf) > 0: 251 | hasher.update(buf) 252 | buf = afile.read(BLOCKSIZE) 253 | if name.startswith('shake'): 254 | return hasher.hexdigest(32) 255 | else: 256 | return hasher.hexdigest() 257 | 258 | def adler(filepath,BLOCKSIZE=1*1024**2): 259 | """ 260 | Create an additive adler32 checksum. Faster than sha1. 261 | 262 | From the documentation: 263 | > Changed in version 3.0: Always returns an unsigned value. 264 | > To generate the same numeric value across all Python versions and 265 | > platforms, use adler32(data) & 0xffffffff. 266 | """ 267 | csum = 1 268 | with open(filepath, 'rb') as afile: 269 | buf = afile.read(BLOCKSIZE) 270 | while len(buf) > 0: 271 | csum = zlib.adler32(buf,csum) 272 | buf = afile.read(BLOCKSIZE) 273 | csum = csum & 0xffffffff 274 | return ('0'*8 + hex(csum)[2:])[-8:] # Preceding 0s 275 | 276 | def dropboxhash(filename): 277 | """ 278 | Compute the dropbox hash of a given file. 
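(The algorithm: SHA-256 of the concatenation of the SHA-256 digests of each 4 MiB block.)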
See [1] for details 279 | This was tested on thier example and against rclone 280 | 281 | [1]: https://www.dropbox.com/developers/reference/content-hash 282 | 283 | """ 284 | subhashes = [] 285 | with open(filename,'rb') as file: 286 | while True: 287 | buf = file.read(4*1024**2) 288 | if len(buf) == 0: 289 | break 290 | subhashes.append(hashlib.sha256(buf).digest()) 291 | return hashlib.sha256(b''.join(subhashes)).hexdigest() 292 | 293 | HASHFUNS = { 294 | 'adler':adler, 295 | 'dbhash':dropboxhash} 296 | for name in hashlib.algorithms_guaranteed: 297 | HASHFUNS[name] = partial(hashlibhash,name=name) 298 | 299 | def to_unicode(txt,verbose=False): 300 | """ 301 | Convert input to unicode if it can! 302 | """ 303 | for objtype in [list,tuple,set]: 304 | if isinstance(txt,objtype): 305 | return objtype(to_unicode(a) for a in txt) 306 | if isinstance(txt,unicode): 307 | return txt 308 | if hasattr(txt,'decode'): 309 | return txt.decode('utf8') 310 | 311 | class RawSortingHelpFormatter(argparse.RawDescriptionHelpFormatter): 312 | """ 313 | argparse help formatter that uses RawDescriptionHelpFormatter but 314 | alphebatizes by the long-form action and lower case 315 | 316 | Based on https://stackoverflow.com/a/12269143/3633154 317 | WARNING: Uses non-documented behavior but it *should* be fine 318 | """ 319 | # override parent 320 | def add_arguments(self, actions): 321 | actions = sorted(actions, key=self._sortkey) 322 | super(RawSortingHelpFormatter, self).add_arguments(actions) 323 | 324 | # new 325 | def _sortkey(self,action): 326 | """ 327 | Sorter for optional strings. Sort by lower case of long 328 | argument otherwise short 329 | """ 330 | options = copy.copy(action.option_strings) 331 | options.sort(key=self._count_leading_dash) 332 | return tuple(opt.lower() for opt in options) 333 | 334 | def _count_leading_dash(self,item): 335 | count = 0 336 | while item.startswith('-'): 337 | count += -1 338 | item = item[1:] 339 | return count 340 | 341 | 342 | def move_txt(src,dst): 343 | """Apply some pretty printing to moves""" 344 | _fjoin = lambda s: '' if len(s) == 0 else (os.sep if s[0] == '' else '') + os.sep.join(s) 345 | 346 | # Split as sep and add it in 347 | srcs = src.split(os.sep) 348 | dsts = dst.split(os.sep) 349 | 350 | comb = [] 351 | 352 | for s,d in zip(srcs,dsts): 353 | if s != d: 354 | break 355 | comb.append(s) 356 | sremain = _fjoin(srcs[len(comb):]) 357 | dremain = _fjoin(dsts[len(comb):]) 358 | comb = _fjoin(comb) 359 | 360 | if len(comb)>2 and len(sremain)>0 and len(dremain)>0: 361 | # Just so that we aren't doing this for nothing 362 | mtxt = comb + os.sep + '{' + sremain + ' --> ' + dremain + '}' 363 | else: 364 | mtxt = '{src:s} --> {dst:s}'.format(src=src,dst=dst) 365 | 366 | while os.sep*2 in mtxt: 367 | mtxt = mtxt.replace(os.sep*2,os.sep) 368 | 369 | return mtxt 370 | 371 | 372 | 373 | class ReturnThread(Thread): 374 | """ 375 | Like a regular thread except when you `join`, it returns the function 376 | result. 
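(The result is passed back through an internal Queue.)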
And it assumes a target is always passed 377 | """ 378 | def __init__(self,**kwargs): 379 | self.target = kwargs.pop('target',False) 380 | if self.target is False: 381 | raise ValueError('Must specify a target') 382 | self.q = Queue() 383 | super(ReturnThread, self).__init__(target=self._target,**kwargs) 384 | 385 | def _target(self,*args,**kwargs): 386 | self.q.put( self.target(*args,**kwargs) ) 387 | 388 | def join(self,**kwargs): 389 | super(ReturnThread, self).join(**kwargs) 390 | res = self.q.get() 391 | self.q.task_done() 392 | self.q.join() 393 | return res 394 | 395 | def RFC3339_to_unix(timestr): 396 | """ 397 | Parses RFC3339 into a unix time 398 | """ 399 | d,t = timestr.split('T') 400 | year,month,day = d.split('-') 401 | 402 | t = t.replace('Z','-00:00') # zulu time 403 | t = t.replace('-',':-').replace('+',':+') # Add a new set 404 | hh,mm,ss,tzhh,tzmm = t.split(':') 405 | 406 | offset = -1 if tzhh.startswith('-') else +1 407 | tzhh = tzhh[1:] 408 | 409 | try: 410 | ss,micro = ss.split('.') 411 | except ValueError: 412 | ss = ss 413 | micro = '00' 414 | micro = micro[:6] # Python doesn't support beyond 999999 415 | 416 | dt = datetime.datetime(int(year),int(month),int(day), 417 | hour=int(hh),minute=int(mm),second=int(ss), 418 | microsecond=int(micro)) 419 | unix = (dt - datetime.datetime(1970,1,1)).total_seconds() 420 | 421 | # Account for timezone which counts backwards so -= 422 | unix -= int(tzhh)*3600*offset 423 | unix -= int(tzmm)*60*offset 424 | return unix 425 | 426 | def imitate_hash(mydict): 427 | """ 428 | Imitate the hash. This is crude and imperfect but fine for replacing 429 | a missing hash 430 | """ 431 | hasher = hashlib.sha1() 432 | hasher.update(repr(mydict).encode('utf8')) 433 | return hasher.hexdigest() 434 | 435 | 436 | def bytes2human(byte_count,base=1024,short=True): 437 | """ 438 | Return a value,label tuple 439 | """ 440 | if base not in (1024,1000): 441 | raise ValueError('base must be 1000 or 1024') 442 | 443 | labels = ['kilo','mega','giga','tera','peta','exa','zetta','yotta'] 444 | name = 'bytes' 445 | if short: 446 | labels = [l[0] for l in labels] 447 | name = name[0] 448 | labels.insert(0,'') 449 | 450 | best = 0 451 | for ii in range(len(labels)): 452 | if (byte_count / (base**ii*1.0)) < 1: 453 | break 454 | best = ii 455 | 456 | return byte_count / (base**best*1.0),labels[best] + name 457 | 458 | def file_summary(files): 459 | N = len(files) 460 | s = sum(f['size'] for f in files if f) 461 | s = bytes2human(s) 462 | return "{:d} files, {:0.2f} {:s}".format(N,s[0],s[1]) 463 | 464 | 465 | ########################### six extracted codes ########################### 466 | # This is pulled from the python six module (see links below) to work 467 | # around some python 2.7.4 issues 468 | # Links: 469 | # https://github.com/benjaminp/six 470 | # https://pypi.python.org/pypi/six 471 | # http://pythonhosted.org/six/ 472 | ############################################################################## 473 | # Copyright (c) 2010-2018 Benjamin Peterson 474 | # 475 | # Permission is hereby granted, free of charge, to any person obtaining a copy 476 | # of this software and associated documentation files (the "Software"), to deal 477 | # in the Software without restriction, including without limitation the rights 478 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 479 | # copies of the Software, and to permit persons to whom the Software is 480 | # furnished to do so, subject to the following conditions: 481 | # 482 | # The 
above copyright notice and this permission notice shall be included in 483 | # all copies or substantial portions of the Software. 484 | # 485 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 486 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 487 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 488 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 489 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 490 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 491 | # THE SOFTWARE. 492 | ############################################################################# 493 | if sys.version_info[0]>2: 494 | exec('exec_ = exec') 495 | else: 496 | def exec_(_code_, _globs_=None, _locs_=None): 497 | """Execute code in a namespace.""" 498 | if _globs_ is None: 499 | frame = sys._getframe(1) 500 | _globs_ = frame.f_globals 501 | if _locs_ is None: 502 | _locs_ = frame.f_locals 503 | del frame 504 | elif _locs_ is None: 505 | _locs_ = _globs_ 506 | exec("""exec _code_ in _globs_, _locs_""") 507 | 508 | 509 | 510 | 511 | 512 | -------------------------------------------------------------------------------- /PyFiSync/dicttable.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import unicode_literals 3 | 4 | __version__ = "20200825" 5 | __author__ = "Justin Winokur" 6 | 7 | import copy 8 | from collections import defaultdict 9 | import uuid 10 | import types 11 | import sys 12 | 13 | if sys.version_info[0] > 2: 14 | unicode = str 15 | 16 | class ExcludedAttributeError(ValueError): 17 | pass 18 | 19 | class DictTable(object): 20 | """ 21 | DictTable: 22 | Create an in-memeory single table DB from a list of dictionaries that may 23 | be queried by any specified attribute. 24 | 25 | This is useful since, once created, lookup/query/"in" checks areO(1), 26 | Creation is still O(N) 27 | 28 | Note: Unless an entry is changed with update(), it must be reindexed 29 | 30 | Inputs: 31 | -------- 32 | items [ *empty* ] (list) 33 | Iterable of dictionaries with each attribute. Can also be a DictTable. 34 | If specified as a DictTable, other options are still settable here. 35 | 36 | fixed_attributes [None] (list, None) 37 | Specify _specific_ attributes to index for each item. Will *only* index 38 | them unless add_fixed_attribute('new_attribute') is called. 39 | 40 | If None, will use _all_ attributed *except* those of exclude_attributes 41 | 42 | exclude_attributes [ *empty* ] (list) 43 | Attributes that shouldn't ever be added even if attributes=None for 44 | dynamic addition of attributes. 45 | 46 | Multiple Values per attribute 47 | ----------------------------- 48 | A "row" can have multiple values per attribute as follows: 49 | 50 | {'attribute':[val1,val2,val3]} 51 | 52 | and can be queried for any (or all) values. 53 | 54 | Additional Opperations: 55 | ---------------------- 56 | This supports index-lookup with a dictionary as well as 57 | a python `in` check and lookup by a dictionary 58 | 59 | The code will allow you to edit/delete/update multiple items at once 60 | (just like a standard database). Use with caution. 61 | 62 | Tips: 63 | ------ 64 | * You can simply dump the DB with JSON using the DB.items() 65 | and then reload it with a new DB 66 | 67 | * There is also an attribute called `_index` which can be used to 68 | query by index. 
69 | 70 | """ 71 | def __init__(self, items=None, 72 | fixed_attributes=None,exclude_attributes=None): 73 | 74 | # These are used to make sure the DB.Query is (a) from this DB and (b) 75 | # the DB hasn't changed. This *should* always be the case 76 | self._id = unicode(uuid.uuid4()) 77 | self._c = 0 78 | 79 | # Handle inputs 80 | if items is None: 81 | items = list() 82 | 83 | if exclude_attributes is None: 84 | exclude_attributes = set() 85 | if isinstance(exclude_attributes,(str,unicode)): 86 | exclude_attributes = [exclude_attributes] 87 | self.exclude_attributes = set(exclude_attributes) 88 | 89 | if fixed_attributes: 90 | if isinstance(fixed_attributes,(str,unicode)): 91 | fixed_attributes = [fixed_attributes] 92 | self.fixed_attributes = list(fixed_attributes) 93 | else: 94 | self.fixed_attributes = list() 95 | 96 | self.N = 0 # Will keep track 97 | self._list = [] 98 | self._lookup = defaultdict(_new_defaultdict_list) 99 | 100 | self._empty = _emptyList() 101 | self._ix = set() 102 | 103 | # Add the items 104 | for item in items: 105 | self.add(item) 106 | 107 | def add(self,item): 108 | """ 109 | Add an item or items to the DB 110 | """ 111 | if isinstance(item,(list,tuple,types.GeneratorType)): 112 | for it in item: 113 | self.add(it) 114 | return 115 | 116 | ix = len(self._list) # The length will be 1+ the last ix so do not change this 117 | 118 | # Add built in ones if it is there 119 | attribs = self.fixed_attributes if self.fixed_attributes else item.keys() 120 | 121 | for attrib in attribs: 122 | if attrib not in item or attrib in self.exclude_attributes: 123 | continue 124 | self._append(attrib,item[attrib],ix) # Add it to the index 125 | 126 | # Finally add it 127 | self._list.append(item) 128 | self.N += 1 129 | self._ix.add(ix) 130 | 131 | def query(self,*args,**kwargs): 132 | """ 133 | Query the value for attribute. Will always an iterator. Use 134 | `list(DB.query())` to return a list 135 | 136 | Usage 137 | ----- 138 | 139 | Any combination of the following will works 140 | 141 | Keywords: Only check equality 142 | 143 | >>> DB.query(attrib=val) 144 | >>> DB.query(attrib1=val1,attrib2=val2) # Match both 145 | 146 | >>> DB.query({'attrib':val}) 147 | >>> DB.query({'attrib1':val1,'attrib2':val2}) # Match Both 148 | 149 | Query Objects (DB.Q, DB.Query) 150 | 151 | >>> DB.query(DB.Q.attrib == val) 152 | >>> DB.query( (DB.Q.attrib1 == val1) & (DB.Q.attrib1 == val2) ) # Parentheses are important! 153 | >>> DB.query( (DB.Q.attrib1 == val1) & (DB.Q.attrib1 != val2) ) 154 | 155 | """ 156 | ixs = self._ixs(*args,**kwargs) 157 | for ix in ixs: 158 | yield self._list[ix] 159 | 160 | def query_one(self,*args,**kwargs): 161 | """ 162 | Return a single item from a query. See "query" for more details. 163 | 164 | Returns None if nothing matches 165 | """ 166 | try: 167 | return next(self.query(*args,**kwargs)) 168 | except StopIteration: 169 | return None 170 | 171 | def count(self,*args,**kwargs): 172 | """ 173 | Return the number of matched rows for a given query. 
See "query" for 174 | details on query construction 175 | """ 176 | return len(self._ixs(*args,**kwargs)) 177 | 178 | def isin(self,*args,**kwargs): 179 | """ 180 | Check if there is at least one item that matches the given query 181 | 182 | see query() for usage 183 | """ 184 | return self.count(*args,**kwargs) > 0 185 | 186 | def reindex(self,*attributes): 187 | """ 188 | Reindex the dictionary for specified attributes (or all) 189 | 190 | Usage 191 | ----- 192 | 193 | >>> DB.reindex() # All 194 | >>> DB.reindex('attrib') # Reindex 'attrib' 195 | >>> DB.reindex('attrib1','attrib2') # Multiple 196 | 197 | See Also 198 | -------- 199 | update() method which does not require reindexing 200 | """ 201 | if len(attributes) == 0: 202 | attributes = self.attributes 203 | 204 | if any(a in self.exclude_attributes for a in attributes): 205 | raise ValueError('Cannot reindex an excluded attribute') 206 | 207 | for attribute in attributes: 208 | self._lookup[attribute] = defaultdict(list) # Reset 209 | 210 | for ix,item in enumerate(self._list): 211 | if item is None: continue 212 | for attrib in attributes: 213 | if attrib in item: 214 | self._append(attrib,item[attrib],ix) 215 | 216 | def update(self,*args,**queryKWs): 217 | """ 218 | Update an entry without needing to reindex the DB (or a specific 219 | attribute) 220 | 221 | Usage: 222 | ------ 223 | 224 | >>> DB.update(updated_dict, query_dict_or_Query, query_attrib1=val1,...) 225 | >>> DB.update(updated_dict, query_attrib1=val1,...) 226 | 227 | Inputs: 228 | ------- 229 | 230 | updated_dict : Dictionary with which to update the entry. This is 231 | done using the typical dict().update() construct to 232 | overwrite it 233 | 234 | query_dict_or_Query 235 | : Either the dictionary used in the query or a Query that 236 | defines a more advanced query 237 | 238 | query_attrib1=val1 239 | : Additional (or sole) query attributes 240 | 241 | Notes: 242 | ------ 243 | * Updating an item requires a deletion in a list that has length n 244 | equal to the number of items matching an attribute. This is O(n). 245 | However changing the entry directly and reindexing is O(N) where 246 | N is the size of the DB. If many items are changing and you do not 247 | need to query them in between, it *may* be faster to directly 248 | update the item and reindex 249 | """ 250 | 251 | if len(args) == 1: 252 | updated_dict = args[0] 253 | query = {} 254 | elif len(args) == 2: 255 | updated_dict,query = args 256 | else: 257 | raise ValueError('Incorrect number of inputs. See documentation') 258 | 259 | if not isinstance(updated_dict,dict): 260 | raise ValueError('Must specify updated values as a dictionary') 261 | 262 | if isinstance(query,Query): 263 | ixs = self._ixs(query,**queryKWs) 264 | elif isinstance(query,dict): 265 | queryKWs.update(query) 266 | ixs = self._ixs(**queryKWs) 267 | else: 268 | raise ValueError('Unrecognized query {:s}. Must be a dict or Query',format(type(query))) 269 | 270 | if len(ixs) == 0: 271 | raise ValueError('Query did not match any results') 272 | 273 | for ix in ixs: 274 | # Get original item 275 | item = self._list[ix] 276 | 277 | # Allow the update to also include non DB attributes. 
278 | # The intersection will eliminate any exclude_attributes 279 | attributes = set(updated_dict.keys()).intersection(self.attributes) 280 | 281 | for attrib in attributes: # Only loop over the updated attribs 282 | value = item[attrib] # get old value 283 | self._remove(attrib,value,ix) # Remove any ix matching it 284 | value = updated_dict[attrib] # Get new value 285 | self._append(attrib,value,ix) # Add ix to any new value 286 | 287 | item.update(updated_dict) # Update the item 288 | 289 | def add_fixed_attribute(self,attrib,force=False): 290 | """ 291 | Adds a fixed attribute. If there are NO fixed attributes (i.e. it is 292 | dynamic attributes), do *NOT* add them unless force. 293 | 294 | Will reindex either way 295 | """ 296 | if attrib in self.exclude_attributes: 297 | raise ExcludedAttributeError("'{}' is excludes".format(attrib)) 298 | 299 | if self.fixed_attributes or force and attrib not in self.fixed_attributes: # Must already be filled or forced 300 | self.fixed_attributes.append(attrib) 301 | 302 | self.reindex(attrib) 303 | 304 | def remove(self,*args,**kwargs): 305 | """ 306 | Remove item that matches a given attribute or dict. See query() for 307 | input specification 308 | ----------- 309 | """ 310 | ixs = list(self._ixs(*args,**kwargs)) 311 | 312 | if len(ixs) == 0: 313 | raise ValueError('No matching items') 314 | 315 | items = [] 316 | 317 | for ix in ixs[:]: # Must remove it from everything. 318 | # not sure what is happening, but it seems that I need to make a copy 319 | # since Python is doing something strange here... 320 | 321 | item = self._list[ix] 322 | for attrib in self.attributes: 323 | if attrib in item: 324 | self._remove(attrib,item[attrib],ix) 325 | 326 | # Remove it from the list by setting to None. Do not reshuffle 327 | # the indices. A None check will be performed elsewhere 328 | self._list[ix] = None 329 | self._ix.difference_update([ix]) 330 | self.N -= 1 331 | 332 | def copy(self): 333 | return DictTable(self, 334 | exclude_attributes=copy.copy(self.exclude_attributes), 335 | fixed_attributes=copy.copy(self.fixed_attributes)) 336 | __copy__ = copy 337 | 338 | @property 339 | def Query(self): 340 | """ 341 | Query object already loaded with the DB 342 | 343 | DB.Query <==> DB.Q 344 | """ 345 | return Query(self) 346 | Q = Query 347 | 348 | def _ixs(self,*args,**kwargs): 349 | """ 350 | Get the inde(x/ies) of matching information 351 | """ 352 | if not hasattr(self,'_lookup') or self.N==0: # It may be empty 353 | return [] 354 | 355 | # Make the entire kwargs be lists with default of []. Edge case of 356 | # multiple items 357 | for key,val in kwargs.items(): 358 | if not isinstance(val,list): 359 | kwargs[key] = [val] 360 | kwargs = defaultdict(list,kwargs) 361 | 362 | Q = Query(self) # Empty object 363 | for arg in args: 364 | if isinstance(arg,Query): 365 | if arg._id != self._id: 366 | raise ValueError("Cannot use another DictTable's Query object") 367 | 368 | Q = Q & arg # Will add these conditions. 
If Q is empty, will just be arg 369 | continue 370 | if isinstance(arg,dict): 371 | for key,val in arg.items(): # Add it rather than update in case it is already specified 372 | kwargs[key].append(val) 373 | else: 374 | raise ValueError('unrecognized input of type {:s}'.format(str(type(arg)))) 375 | 376 | # Construct a query for kwargs 377 | for key,value in kwargs.items(): 378 | if isinstance(value,list) and len(value) == 0: 379 | value = [self._empty] 380 | for val in _makelist(value): 381 | Qtmp = Query(self) 382 | Qtmp._attr = key 383 | Q = Q & (Qtmp == val) 384 | 385 | ixs = Q._ixs 386 | return list(ixs) 387 | 388 | def _index(self,ix): 389 | """ 390 | Return ix if it hasn't been deleted 391 | """ 392 | try: 393 | item = self._list[ix] 394 | except IndexError: 395 | return [] 396 | 397 | if item is None: 398 | return [] 399 | return [ix] 400 | 401 | def _append(self,attrib,value,ix): 402 | """ 403 | Add to the lookup and update the modify time 404 | """ 405 | # Final check but we should be guarded from this 406 | if attrib in self.exclude_attributes: 407 | #print('BAD! Should guard against this in public methods!') 408 | raise ValueError('Cannot reindex an excluded attribute') 409 | 410 | valueL = _makelist(value) 411 | for val in valueL: 412 | self._lookup[attrib][val].append(ix) 413 | if len(valueL) == 0: 414 | self._lookup[attrib][self._empty].append(ix) # empty list 415 | 416 | self._c += 1 417 | 418 | def _remove(self,attrib,value,ix): 419 | """ 420 | Remove from the lookup and update the modify time 421 | """ 422 | valueL = _makelist(value) 423 | for val in valueL: 424 | try: 425 | self._lookup[attrib][val].remove(ix) 426 | except ValueError: 427 | raise ValueError('Item not found in internal lookup. May need to first call reindex()') 428 | if len(valueL) == 0: 429 | self._lookup[attrib][self._empty].remove(ix) # empty list 430 | 431 | self._c += 1 432 | 433 | def __contains__(self,check_diff): 434 | if not ( isinstance(check_diff,dict) or isinstance(check_diff,Query)): 435 | raise ValueError('Python `in` queries should be a of {attribute:value} or Query') 436 | return self.isin(check_diff) 437 | 438 | def __len__(self): 439 | return self.N 440 | 441 | def __getitem__(self,item): 442 | if isinstance(item,dict) or isinstance(item,Query): 443 | return self.query_one(item) 444 | elif isinstance(item,int): # numbered item 445 | if self._list[item] is not None: 446 | return self._list[item] 447 | else: 448 | raise ValueError("Index has been deleted") 449 | else: 450 | raise ValueError("Must specify DB[{'attribute':val}] or DB[index]'") 451 | __call__ = query 452 | 453 | def __iter__(self): 454 | return (item for item in self._list if item is not None) 455 | items = __iter__ 456 | 457 | @property 458 | def attributes(self): 459 | # The attributes are the keys of _lookup but _lookup is a defaultdict 460 | # of a defaultdict(list) so need to check that it is also empy 461 | if self.fixed_attributes: 462 | return self.fixed_attributes 463 | 464 | attribs = [] 465 | # This seems slow but the second for-loop will break at the first non-empty 466 | # item (likely the first one) 467 | for attrib,val in self._lookup.items(): 468 | if not val: # Empty 469 | continue 470 | for v in val.values(): 471 | if v: break 472 | else: 473 | continue 474 | 475 | attribs.append(attrib) 476 | attribs.sort() 477 | return attribs 478 | 479 | def _makelist(input): 480 | if isinstance(input,list): 481 | return input 482 | return [input] 483 | 484 | class _emptyList(object): 485 | def __init__(self): 486 | pass 
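    # _emptyList is a hashable sentinel stored in the lookup for rows whose
    # attribute is an empty list (lists themselves cannot be dict keys).
    # The fixed hash below keeps the sentinel usable as a key, and __eq__
    # returns True only when compared against an empty list.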
487 | def __hash__(self): 488 | return 9999999999999 489 | def __eq__(self,other): 490 | return isinstance(other,list) and len(other)==0 491 | 492 | def _new_defaultdict_list(): 493 | return defaultdict(list) 494 | 495 | class Query(object): 496 | """ 497 | Query objects. This works by returning an updated *copy* of the object 498 | whenever it is acted upon 499 | 500 | Calling 501 | * Q.attribute sets attribute and returns a copy 502 | * Q.attribute == val (or any other comparison) set the index of elements 503 | * Q1 & Q1 or other boolean perform set operations 504 | 505 | Useful Methods: 506 | _filter : (or just `filter` if not an attribute): Apply a filter 507 | to the DB 508 | """ 509 | def __init__(self,DB): 510 | self._DB = DB 511 | self._ixs = DB._ix # Everything. Do *NOT* copy but also never modify in place 512 | self._attr = None 513 | 514 | self._c = DB._c 515 | self._id = DB._id 516 | 517 | def _valid(self): 518 | if self._c != self._DB._c: 519 | raise ValueError('This query object is out of date from the DB. Create a new one') 520 | 521 | def _filter(self,filter_func): 522 | """ 523 | If 'filter' is NOT an attribute of the DB, this can be called 524 | with 'filter' instead of '_filter' 525 | 526 | Apply a filter to the data that returns True if it matches and False 527 | otherwise 528 | 529 | Note that filters are O(N) 530 | """ 531 | self._valid() # Actually, these would still work but still check 532 | ixs = set() 533 | for ix,item in enumerate(self._DB._list): # loop all 534 | if item is None: 535 | continue 536 | if filter_func(item): 537 | ixs.add(ix) 538 | self._ixs = ixs # reset it 539 | return self 540 | 541 | # Comparisons 542 | def __eq__(self,value): 543 | self._valid() 544 | 545 | if not self._ixs: 546 | return self 547 | 548 | # Account for '_index' attribute (May be deprecated in the future...) 549 | if self._attr == '_index': 550 | self._ixs = self._ixs.intersection({value}) # replace, don't update 551 | return self 552 | for val in _makelist(value): 553 | self._ixs = self._ixs.intersection(self._DB._lookup[self._attr][val]) # Will return [] if _attr or val not there . 
Replace, don't update
554 |         return self
555 | 
556 |     def __ne__(self,value):
557 |         self._ixs = self._DB._ix - (self == value)._ixs
558 |         return self
559 | 
560 |     def __lt__(self,value):
561 |         self._valid() # Strictly, these would still work, but check anyway
562 |         ixs = set()
563 |         for ix,item in enumerate(self._DB._list): # loop all
564 |             if item is None or self._attr not in item:
565 |                 continue
566 |             for ival in _makelist(item[self._attr]):
567 |                 if ival < value:
568 |                     ixs.add(ix)
569 |         self._ixs = ixs
570 |         return self
571 | 
572 |     def __le__(self,value):
573 |         self._valid() # Strictly, these would still work, but check anyway
574 |         ixs = set()
575 |         for ix,item in enumerate(self._DB._list): # loop all
576 |             if item is None:
577 |                 continue
578 |             if self._attr in item and item[self._attr] <= value:
579 |                 ixs.add(ix)
580 |         self._ixs = ixs
581 |         return self
582 | 
583 |     def __gt__(self,value):
584 |         self._valid() # Strictly, these would still work, but check anyway
585 |         ixs = set()
586 |         for ix,item in enumerate(self._DB._list): # loop all
587 |             if item is None:
588 |                 continue
589 |             if self._attr in item and item[self._attr] > value:
590 |                 ixs.add(ix)
591 |         self._ixs = ixs
592 |         return self
593 | 
594 |     def __ge__(self,value):
595 |         self._valid() # Strictly, these would still work, but check anyway
596 |         ixs = set()
597 |         for ix,item in enumerate(self._DB._list): # loop all
598 |             if item is None:
599 |                 continue
600 |             if self._attr in item and item[self._attr] >= value:
601 |                 ixs.add(ix)
602 |         self._ixs = ixs
603 |         return self
604 | 
605 |     # Logic
606 |     def __and__(self,Q2):
607 |         self._ixs = self._ixs.intersection(Q2._ixs)
608 |         return self
609 | 
610 |     def __or__(self,Q2):
611 |         self._ixs = self._ixs.union(Q2._ixs)
612 |         return self
613 | 
614 |     def __invert__(self):
615 |         self._ixs = self._DB._ix - self._ixs
616 |         return self
617 | 
618 |     # Attributes
619 |     def __getattr__(self,attr):
620 |         if self._attr is not None:
621 |             raise ValueError('Already set attribute')
622 |         if attr == 'filter' and 'filter' not in self._DB.attributes:
623 |             return self._filter
624 | 
625 |         self._attr = attr
626 |         if attr == '_index':
627 |             return self
628 | 
629 |         ixs = set()
630 |         for vals in self._DB._lookup[attr].values():
631 |             ixs.update(vals)
632 |         self._ixs = ixs
633 |         return self
634 | 
635 | 
636 | 
637 | 
638 | 
639 | 
640 | 
641 | 
642 | 
643 | 
644 | 
645 | 
646 | 
647 | 
648 | 
649 | 
650 | 
651 | 
652 | 
653 | 
--------------------------------------------------------------------------------
/PyFiSync/ldtable.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from __future__ import unicode_literals
3 | 
4 | __version__ = "20191123"
5 | __author__ = "Justin Winokur"
6 | 
7 | import copy
8 | from collections import defaultdict
9 | import time
10 | import types
11 | 
12 | 
13 | class ldtable(object):
14 |     def __init__(self, items=None, attributes=None, default_attribute=None,
15 |                  exclude_attributes=None, indexObjects=False):
16 |         """
17 |         ldtable:
18 |         Create an in-memory single-table DB from a list of dictionaries that
19 |         may be queried by any specified attribute.
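
        A short usage sketch (attribute names here are made up for
        illustration):

            >>> DB = ldtable([{'name':'a','size':1},
            ...               {'name':'b','size':2}])
            >>> DB.query_one(name='b')['size']
            2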
20 | 
21 |         This is useful since, once created, lookup/query/"in" checks are
22 |         O(1); creation is still O(N)
23 | 
24 |         Note: entries changed by any means other than update() require a
25 |         reindex()
26 | 
27 |         Inputs:
28 |         --------
29 |         items [ *empty* ] (list)
30 |             List of dictionaries with each attribute
31 | 
32 |         attributes [None] (list, None)
33 |             Either a list of attributes to index, or if specified as None
34 |             (default) it will add every attribute of every item and assign
35 |             missing ones to be `default_attribute` (unless excluded)
36 | 
37 |             NOTE: If attributes is set, you may still add items with extra
38 |                   attributes. They just won't be indexed.
39 |                   Or use add_attribute()
40 | 
41 |         exclude_attributes [ *empty* ] (list)
42 |             Attributes that shouldn't ever be added, even if attributes=None
43 |             for dynamic addition of attributes.
44 | 
45 |         default_attribute [None] (*any*)
46 |             Default attribute to assign if `attributes=None` and it is missing.
47 |             If the specified object is callable, it will call it (for example,
48 |             `default_attribute=list` will make it an empty list). Otherwise,
49 |             it will set it to whatever value is specified.
50 | 
51 |         Options: (These may be changed later too)
52 |         --------
53 |         indexObjects: [False]
54 |             If True, will automatically take any object and use its
55 |             __dict__ as the dict.
56 | 
57 |             Note
58 |             * Changing to False after adding an object will cause issues.
59 |             * Does not support __slots__ since they are immutable
60 | 
61 |         Multiple Values per attribute
62 |         -----------------------------
63 |         A "row" can have multiple values per attribute as follows:
64 | 
65 |             {'attribute':[val1,val2,val3]}
66 | 
67 |         and can be queried for any (or all) values.
68 | 
69 |         Additional Operations:
70 |         ----------------------
71 |         This supports index lookup with a dictionary as well as a Python
72 |         `in` check by dictionary
73 | 
74 |         The code will allow you to edit/delete/update multiple items at once
75 |         (just like a standard database). Use with caution.
76 | 
77 |         Tips:
78 |         ------
79 |         * You can simply dump the DB to JSON using DB.items()
80 |           and then reload it with a new DB
81 | 
82 |         * There is also an attribute called `_index` which can be used to
83 |           query by index.
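
          For example, a small illustrative lookup of the row stored at
          internal index 0:

              >>> DB.query_one(DB.Q._index == 0)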
83 | 84 | """ 85 | 86 | # Handle inputs 87 | if items is None: 88 | items = list() 89 | 90 | if exclude_attributes is None: 91 | exclude_attributes = list() 92 | self.indexObjects = indexObjects 93 | 94 | 95 | self.attributes = attributes # Will be reset in first add 96 | self._is_attr_None = attributes is None 97 | self.default_attribute = default_attribute 98 | self.exclude_attributes = exclude_attributes 99 | 100 | self.N = 0 # Will keep track 101 | self._list = [] 102 | 103 | self._empty = _emptyList() 104 | self._ix = set() 105 | 106 | # Add the items 107 | for item in items: 108 | self.add(item) 109 | 110 | self._time = time.time() 111 | 112 | # Edge case: No items 113 | if self.attributes is None: 114 | self.attributes = [] 115 | 116 | def add(self,item): 117 | """ 118 | Add an item or items to the DB 119 | """ 120 | if isinstance(item,(list,tuple,types.GeneratorType)): 121 | for it in item: 122 | self.add(it) 123 | return 124 | 125 | # handle other object types 126 | item0 = item 127 | item = self._convert2dict(item) 128 | 129 | if self.N == 0: 130 | attributes = self.attributes 131 | if attributes is None: 132 | attributes = list(item.keys()) 133 | 134 | self.attributes = [attrib for attrib in attributes \ 135 | if attrib not in self.exclude_attributes] # Make a copy 136 | 137 | 138 | # Set up the lookup 139 | self._lookup = {attribute:defaultdict(list) for attribute in self.attributes} 140 | 141 | ix = len(self._list) # The length will be 1+ the last ix so do not change this 142 | 143 | if self._is_attr_None: # Set to None which means we add all 144 | for attrib in item.keys(): 145 | if attrib in self.exclude_attributes: 146 | continue 147 | if attrib not in self.attributes: 148 | self.add_attribute(attrib,self.default_attribute) 149 | # Add built in ones 150 | for attrib in self.attributes: 151 | if attrib not in item: 152 | if hasattr(self.default_attribute, '__call__'): 153 | item[attrib] = self.default_attribute() 154 | else: 155 | item[attrib] = self.default_attribute 156 | 157 | value = item[attrib] 158 | self._append(attrib,value,ix) 159 | 160 | # Finally add it 161 | self._list.append(item0) 162 | self.N += 1 163 | self._ix.add(ix) 164 | 165 | def query(self,*A,**K): 166 | """ 167 | Query the value for attribute. Will always an iterator. Use 168 | `list(DB.query())` to return a list 169 | 170 | Usage 171 | ----- 172 | 173 | Any combination of the following will works 174 | 175 | Keywords: Only check equality 176 | 177 | >>> DB.query(attrib=val) 178 | >>> DB.query(attrib1=val1,attrib2=val2) # Match both 179 | 180 | >>> DB.query({'attrib':val}) 181 | >>> DB.query({'attrib1':val1,'attrib2':val2}) # Match Both 182 | 183 | Query Objects (DB.Q, DB.Qobj) 184 | 185 | >>> DB.query(DB.Q.attrib == val) 186 | >>> DB.query( (DB.Q.attrib1 == val1) & (DB.Q.attrib1 == val2) ) # Parentheses are important! 187 | >>> DB.query( (DB.Q.attrib1 == val1) & (DB.Q.attrib1 != val2)) 188 | 189 | """ 190 | ixs = self._ixs(*A,**K) 191 | for ix in ixs: 192 | yield self._list[ix] 193 | 194 | def query_one(self,*A,**K): 195 | """ 196 | Return a single item from a query. See "query" for more details. 197 | 198 | Returns None if nothing matches 199 | """ 200 | try: 201 | return next(self.query(*A,**K)) 202 | except StopIteration: 203 | return None 204 | 205 | def count(self,*A,**K): 206 | """ 207 | Return the number of matched rows for a given query. 
See "query" for 208 | details on query construction 209 | """ 210 | return len(self._ixs(*A,**K)) 211 | 212 | def isin(self,*A,**K): 213 | """ 214 | Check if there is at least one item that matches the given query 215 | 216 | see query() for usage 217 | """ 218 | 219 | return len(self._ixs(*A,**K))>0 220 | 221 | def reindex(self,*args): 222 | """ 223 | Reindex the dictionary for specified attributes (or all) 224 | 225 | Usage 226 | ----- 227 | 228 | >>> DB.reindex() # All 229 | >>> DB.reindex('attrib') # Reindex 'attrib' 230 | >>> DB.reindex('attrib1','attrib2') # Multiple 231 | 232 | See Also 233 | -------- 234 | update() method which does not require reindexing 235 | """ 236 | if len(args) == 0: 237 | attributes = self.attributes 238 | 239 | # Just an extra check (and makes a copy) 240 | attributes = [attr for attr in attributes \ 241 | if attr not in self.exclude_attributes] 242 | else: 243 | attributes = args 244 | if any(a in self.exclude_attributes for a in args): 245 | raise ValueError('Cannot reindex an excluded attribute') 246 | 247 | for attribute in attributes: 248 | self._lookup[attribute] = defaultdict(list) # Reset 249 | 250 | for ix,item in enumerate(self._list): 251 | if item is None: continue 252 | item = self._convert2dict(item) 253 | for attrib in attributes: 254 | value = item[attrib] 255 | self._append(attrib,value,ix) 256 | 257 | def update(self,*args,**queryKWs): 258 | """ 259 | Update an entry without needing to reindex the DB (or a specific 260 | attribute) 261 | 262 | Usage: 263 | ------ 264 | 265 | >>> DB.update(updated_dict, query_dict_or_Qobj, query_attrib1=val1,...) 266 | >>> DB.update(updated_dict, query_attrib1=val1,...) 267 | 268 | Inputs: 269 | ------- 270 | 271 | updated_dict : Dictionary with which to update the entry. This is 272 | done using the typical dict().update() construct to 273 | overwrite it 274 | 275 | query_dict_or_Qobj 276 | : Either the dictionary used in the query or a Qobj that 277 | defines a more advanced query 278 | 279 | query_attrib1=val1 280 | : Additional (or sole) query attributes 281 | 282 | Notes: 283 | ------ 284 | * Updating an item requires a deletion in a list that has length n 285 | equal to the number of items matching an attribute. This is O(n). 286 | However changing the entry directly and reindexing is O(N) where 287 | N is the size of the DB. If many items are changing and you do not 288 | need to query them in between, it *may* be faster to directly 289 | update the item and reindex 290 | """ 291 | 292 | if len(args) == 1: 293 | updated_dict = args[0] 294 | query = {} 295 | elif len(args) == 2: 296 | updated_dict,query = args 297 | else: 298 | raise ValueError('Incorrect number of inputs. See documentation') 299 | 300 | updated_dict = self._convert2dict(updated_dict) 301 | if not isinstance(updated_dict,dict): 302 | raise ValueError('Must specify updated values as a dictionary') 303 | 304 | query = self._convert2dict(query) 305 | if isinstance(query,Qobj): 306 | ixs = self._ixs(query,**queryKWs) 307 | elif isinstance(query,dict): 308 | queryKWs.update(query) 309 | ixs = self._ixs(**queryKWs) 310 | else: 311 | raise ValueError('Unrecognized query {:s}. Must be a dict or Qobj',format(type(query))) 312 | 313 | if len(ixs) == 0: 314 | raise ValueError('Query did not match any results') 315 | 316 | for ix in ixs: 317 | # Get original item 318 | item = self._list[ix] 319 | item = self._convert2dict(item) 320 | 321 | # Allow the update to also include non DB attributes. 
322 | # The intersection will eliminate any exclude_attributes 323 | attributes = set(updated_dict.keys()).intersection(self.attributes) 324 | 325 | for attrib in attributes: # Only loop over the updated attribs 326 | # get old value 327 | value = item[attrib] 328 | 329 | # Remove any ix matching it 330 | self._remove(attrib,value,ix) 331 | 332 | # Get new value 333 | value = updated_dict[attrib] 334 | 335 | # Add ix to any new value 336 | self._append(attrib,value,ix) 337 | 338 | # Update the item 339 | item.update(updated_dict) 340 | 341 | return 342 | 343 | def add_attribute(self,attribute,*default): 344 | """ 345 | Add an attribute to the index attributes. 346 | 347 | Usage 348 | ----- 349 | >>> DB.add_attribute('new_attrib') # Will raise an error if *any* 350 | # items don't have 'new_attrib' 351 | >>> DB.add_attribute('new_attrib',default) 352 | # Set any missing to the default 353 | 354 | If the `default` is callable, it will call it instead. (such as `list` 355 | to add an empty list) 356 | 357 | """ 358 | if attribute in self.exclude_attributes: 359 | raise ValueError("Can't add exclude_attributes") 360 | 361 | attrib = attribute 362 | if not hasattr(self,'_lookup'): 363 | self._lookup = {} 364 | self._lookup[attribute] = defaultdict(list) 365 | 366 | set_default = False 367 | if len(default) >0: 368 | set_default = True 369 | default = default[0] 370 | 371 | for ix,item in enumerate(self._list): 372 | if item is None: continue 373 | item = self._convert2dict(item) 374 | try: 375 | value = item[attribute] 376 | self._append(attrib,value,ix) 377 | except KeyError as KE: 378 | if set_default: 379 | if hasattr(default, '__call__'): 380 | item[attribute] = default() 381 | else: 382 | item[attribute] = default 383 | else: 384 | raise KeyError("Attribute {:s} not found".format(attrib)) 385 | 386 | value = item[attribute] 387 | self._append(attrib,value,ix) 388 | 389 | self.attributes.append(attribute) 390 | 391 | def remove(self,*A,**K): 392 | """ 393 | Remove item that matches a given attribute or dict. See query() for 394 | input specification 395 | 396 | 397 | DB Options: 398 | This is set at instantiation but can be changed directly 399 | ----------- 400 | 401 | """ 402 | ixs = list(self._ixs(*A,**K)) 403 | 404 | if len(ixs) == 0: 405 | raise ValueError('No matching items') 406 | 407 | items = [] 408 | 409 | for ix in ixs[:]: # Must remove it from everything. 410 | # not sure what is happening, but it seems that I need to make a copy 411 | # since Python is doing something strange here... 412 | 413 | item = self._list[ix] 414 | item = self._convert2dict(item) 415 | 416 | for attrib in self.attributes: 417 | value = item[attrib] 418 | self._remove(attrib,value,ix) 419 | 420 | # Remove it from the list by setting to None. Do not reshuffle 421 | # the indices. 
A None check will be performed elsewhere 422 | self._list[ix] = None 423 | self._ix.difference_update([ix]) 424 | self.N -= 1 425 | 426 | 427 | @property 428 | def Qobj(self): 429 | """ 430 | Query object already loaded with the DB 431 | 432 | DB.Qobj <==> DB.Q <==> Qobj(DB) 433 | """ 434 | return Qobj(self) 435 | Q = Qobj 436 | 437 | def _convert2dict(self,obj): 438 | """ 439 | Convert objects to a regular dictionary for the sake of indexing 440 | 441 | Also return Qobjs untouched since they may be used in queries too 442 | 443 | If it not an Qobj or dict, it will try to get it's __dict__ and if 444 | that doesn't work, will just return it 445 | """ 446 | if isinstance(obj,Qobj): 447 | return obj 448 | 449 | if isinstance(obj,dict): # Also accounts for OrderedDicts or ... 450 | return obj # ... anything that inherits dict 451 | 452 | if self.indexObjects and hasattr(obj,'__dict__'): 453 | return obj.__dict__ 454 | 455 | return obj 456 | 457 | 458 | def _ixs(self,*args,**kwords): 459 | """ 460 | Get the inde(x/ies) of matching information 461 | """ 462 | if not hasattr(self,'_lookup') or self.N==0: # It may be empty 463 | return [] 464 | 465 | # Make the entire kwords be lists with default of []. Edge case of 466 | # multiple items 467 | for key,val in kwords.items(): 468 | if not isinstance(val,list): 469 | kwords[key] = [val] 470 | kwords = defaultdict(list,kwords) 471 | 472 | Q = Qobj(self) # Empty object 473 | for arg in args: 474 | arg = self._convert2dict(arg) # handle other object types 475 | if isinstance(arg,Qobj): 476 | Q = Q & arg # Will add these conditions. If Q is empty, will just be arg 477 | continue 478 | if isinstance(arg,dict): 479 | for key,val in arg.items(): # Add it rather than update in case it is already specified 480 | kwords[key].append(val) 481 | else: 482 | raise ValueError('unrecognized input of type {:s}'.format(str(type(arg)))) 483 | 484 | # Construct a query for kwords 485 | for key,value in kwords.items(): 486 | if isinstance(value,list) and len(value) == 0: 487 | value = [self._empty] 488 | for val in _makelist(value): 489 | Qtmp = Qobj(self) 490 | Qtmp._attr = key 491 | Q = Q & (Qtmp == val) 492 | 493 | ixs = Q._ixs 494 | # Ensure one match 495 | if ixs is None: 496 | ixs = [] 497 | return list(ixs) 498 | 499 | 500 | def _index(self,ix): 501 | """ 502 | Return ix if it hasn't been deleted 503 | """ 504 | try: 505 | item = self._list[ix] 506 | except IndexError: 507 | return [] 508 | 509 | if item is None: 510 | return [] 511 | 512 | return [ix] 513 | 514 | def _append(self,attrib,value,ix): 515 | """ 516 | Add to the lookup and update the modify time 517 | """ 518 | # Final check but we should be guarded from this 519 | if attrib in self.exclude_attributes: 520 | print('BAD! Should guard against this in public methods!') 521 | raise ValueError('Cannot reindex an excluded attribute') 522 | 523 | valueL = _makelist(value) 524 | for val in valueL: 525 | self._lookup[attrib][val].append(ix) 526 | if len(valueL) == 0: 527 | self._lookup[attrib][self._empty].append(ix) # empty list 528 | self._time = time.time() 529 | 530 | def _remove(self,attrib,value,ix): 531 | """ 532 | Remove from the lookup and update the modify time 533 | """ 534 | valueL = _makelist(value) 535 | for val in valueL: 536 | try: 537 | self._lookup[attrib][val].remove(ix) 538 | except ValueError: 539 | raise ValueError('Item not found in internal lookup. 
May need to first call reindex()') 540 | if len(valueL) == 0: 541 | self._lookup[attrib][self._empty].remove(ix) # empty list 542 | 543 | self._time = time.time() 544 | 545 | def __contains__(self,check_diff): 546 | check_diff = self._convert2dict(check_diff) 547 | if not ( isinstance(check_diff,dict) or isinstance(check_diff,Qobj)): 548 | raise ValueError('Python `in` queries should be a of {attribute:value} or Qobj') 549 | return self.isin(check_diff) 550 | 551 | def __len__(self): 552 | return self.N 553 | 554 | def __getitem__(self,item): 555 | item = self._convert2dict(item) 556 | if isinstance(item,dict) or isinstance(item,Qobj): 557 | return self.query_one(item) 558 | elif isinstance(item,int): # numbered item 559 | if self._list[item] is not None: 560 | return self._list[item] 561 | else: 562 | raise ValueError("Index has been deleted") 563 | else: 564 | raise ValueError("Must specify DB[{'attribute':val}] or DB[index]'") 565 | __call__ = query 566 | 567 | def __iter__(self): 568 | return (item for item in self._list if item is not None) 569 | items = __iter__ 570 | 571 | def _makelist(input): 572 | if isinstance(input,list): 573 | return input 574 | return [input] 575 | 576 | class _emptyList(object): 577 | def __init__(self): 578 | pass 579 | def __hash__(self): 580 | return 9999999999999 581 | def __eq__(self,other): 582 | return isinstance(other,list) and len(other)==0 583 | 584 | class Qobj(object): 585 | """ 586 | Query objects. This works by returning an updated *copy* of the object 587 | whenever it is acted upon 588 | 589 | Calling 590 | * Q.attribute sets attribute and returns a copy 591 | * Q.attribute == val (or any other comparison) set the index of elements 592 | * Q1 & Q1 or other boolean perform set operations 593 | 594 | Useful Methods: 595 | _filter : (or just `filter` if not an attribute): Apply a filter 596 | to the DB 597 | """ 598 | def __init__(self,DB,ixs=None,attr=None): 599 | self._DB = DB 600 | self._ixs = ixs 601 | self._attr = attr 602 | 603 | self._time = time.time() 604 | 605 | 606 | def _valid(self): 607 | if self._time < self._DB._time: 608 | raise ValueError('This query object is out of date from the DB. 
Create a new one') 609 | 610 | def _filter(self,filter_func): 611 | """ 612 | 613 | If 'filter' is NOT an attribute of the DB, this can be called 614 | with 'filter' instead of '_filter' 615 | 616 | Apply a filter to the data that returns True if it matches and False 617 | otherwise 618 | 619 | Note that filters are O(N) 620 | """ 621 | self._valid() # Actually, these would still work but still check 622 | ixs = set() 623 | for ix,item in enumerate(self._DB._list): # loop all 624 | item = self._DB._convert2dict(item) 625 | if item is None: 626 | continue 627 | if filter_func(item): 628 | ixs.add(ix) 629 | self._ixs = ixs 630 | return self.copy() 631 | 632 | 633 | # Comparisons 634 | def __eq__(self,value): 635 | self._valid() 636 | 637 | if self._DB.N == 0: 638 | self._ixs = set() 639 | return self.copy() 640 | 641 | first_set = True 642 | for val in _makelist(value): # Account for list inputs 643 | if self._attr == '_index': 644 | if first_set: 645 | ixs = set(self._DB._index(val)) 646 | first_set = False 647 | else: 648 | ixs = ixs.intersection(self._DB._index(val)) 649 | continue 650 | if self._attr not in self._DB.attributes: 651 | raise KeyError("'{:s}' is not an attribute".format(self._attr)) 652 | 653 | ixs_at = self._DB._lookup[self._attr][val] 654 | if first_set: 655 | ixs = set(ixs_at) 656 | first_set = False 657 | else: 658 | ixs = ixs.intersection(ixs_at) 659 | 660 | self._ixs = ixs 661 | return self.copy() 662 | 663 | def __ne__(self,value): 664 | self._ixs = self._DB._ix - (self == value)._ixs 665 | return self.copy() 666 | 667 | def __lt__(self,value): 668 | self._valid() # Actually, these would still work but still check 669 | ixs = set() 670 | for ix,item in enumerate(self._DB._list): # loop all 671 | item = self._DB._convert2dict(item) 672 | if item is None: 673 | continue 674 | for ival in _makelist(item[self._attr]): 675 | if ival < value: 676 | ixs.add(ix) 677 | self._ixs = ixs 678 | return self.copy() 679 | 680 | def __le__(self,value): 681 | self._valid() # Actually, these would still work but still check 682 | ixs = set() 683 | for ix,item in enumerate(self._DB._list): # loop all 684 | item = self._DB._convert2dict(item) 685 | if item is None: 686 | continue 687 | if item[self._attr] <= value: 688 | ixs.add(ix) 689 | self._ixs = ixs 690 | return self.copy() 691 | 692 | def __gt__(self,value): 693 | self._valid() # Actually, these would still work but still check 694 | ixs = set() 695 | for ix,item in enumerate(self._DB._list): # loop all 696 | item = self._DB._convert2dict(item) 697 | if item is None: 698 | continue 699 | if item[self._attr] > value: 700 | ixs.add(ix) 701 | self._ixs = ixs 702 | return self.copy() 703 | 704 | def __ge__(self,value): 705 | self._valid() # Actually, these would still work but still check 706 | ixs = set() 707 | for ix,item in enumerate(self._DB._list): # loop all 708 | item = self._DB._convert2dict(item) 709 | if item is None: 710 | continue 711 | if item[self._attr] >= value: 712 | ixs.add(ix) 713 | self._ixs = ixs 714 | return self.copy() 715 | 716 | # Logic 717 | def __and__(self,Q2): 718 | if self._ixs == None: # An empty object and another will just return other 719 | return Q2 720 | self._ixs.intersection_update(Q2._ixs) 721 | return self.copy() 722 | def __or__(self,Q2): 723 | self._ixs.update(Q2._ixs) 724 | return self.copy() 725 | def __invert__(self): 726 | self._ixs = self._DB._ix - self._ixs 727 | return self.copy() 728 | 729 | def __getattr__(self,attr): 730 | if attr == 'filter' and 'filter' not in self._DB.attributes: 
731 | return self._filter 732 | self._attr = attr 733 | return self.copy() 734 | 735 | def copy(self): 736 | new = Qobj(self._DB,ixs=self._ixs,attr=self._attr) 737 | # Reset the time 738 | new._time = self._time 739 | return new 740 | 741 | 742 | 743 | 744 | 745 | 746 | 747 | 748 | 749 | 750 | 751 | 752 | 753 | 754 | 755 | 756 | 757 | 758 | 759 | 760 | 761 | 762 | -------------------------------------------------------------------------------- /PyFiSync/remote_interfaces.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Remote interfaces 5 | 6 | For now, it is *just* SSH + rsync to work within the subprocess on Unix/Linux 7 | (macOS is Unix). It is separated to more easily make other backends 8 | 9 | An interface must have the following methods and behaviors from 10 | 'remote_interface_base'. Note that optional methods have a pass but the 11 | required ones will raise a NotImplementedError 12 | """ 13 | from __future__ import division, print_function, unicode_literals 14 | 15 | import subprocess 16 | import re 17 | import sys 18 | import os 19 | import random 20 | import string 21 | import json 22 | import zlib 23 | import shlex 24 | import tempfile 25 | import datetime 26 | 27 | from io import open 28 | 29 | if sys.version_info[0] > 2: 30 | xrange = range 31 | unicode = str 32 | 33 | from . import utils 34 | 35 | REMOTES = ['rsync','rclone'] 36 | 37 | class remote_interface_base(object): 38 | def __init__(self,config,log=None): 39 | """ 40 | * Just pass it the configuration file 41 | * Optionally, pass it the log object to modify 42 | """ 43 | raise NotImplementedError() 44 | def file_list(self,attributes,empty): 45 | """ 46 | * Attributes are a list of requested attributes but generally, more 47 | should be returned in case attributes change. 48 | * follow the `empty` settings of PFSwalk -- if applicable 49 | 'store': stores a list of empty directories 50 | 'remove': Deletes all empty directories if (and only if) they 51 | were *not* empty before. Also removes stored list 52 | 'reset': Removes stored list 53 | """ 54 | raise NotImplementedError() 55 | 56 | def apply_queue(self,queue,force): 57 | """ 58 | * queue is the action queue that takes the following form 59 | * {'backup':[file_path]} # Make a copy to the backup 60 | * {'move': [src,dest]} # Move the file 61 | * {'delete': [file_path]} # Move the file into the backup. Essentially a backup 62 | * Force tells it to allow a file to be moved into another 63 | 64 | Notes: 65 | * overwriting moves have already been removed 66 | * Delete should backup first if set config.backup == True 67 | * Backup should NOT happen if config.backup == False 68 | """ 69 | raise NotImplementedError() 70 | 71 | def transfer(self,tqA2B,tqB2A): 72 | """ 73 | * Apply the trasnfer from B to A and from A to B 74 | * MUST maintain modification times upon transfer 75 | """ 76 | raise NotImplementedError() 77 | 78 | def close(self): 79 | """ 80 | * If it has this function, it will try to call it at the very end 81 | """ 82 | pass 83 | 84 | @staticmethod 85 | def cli(argv): 86 | """ 87 | should be decorated with @staticmethod 88 | All of the commands will be passed. 
Can use this to communicate remotely 89 | if needed 90 | 91 | For example 92 | ./PyFiSync.py _api file_list --flag1 val1 --flag2 93 | 94 | will pass argv = ['file_list', '--flag1', 'val1', '--flag2'] 95 | """ 96 | pass 97 | 98 | class ssh_rsync(remote_interface_base): 99 | def __init__(self,config,log=None): 100 | self.config = config 101 | if log is None: 102 | log = utils.logger(silent=False,path=None) 103 | self.log = log 104 | self._debug = getattr(config,'_debug',False) 105 | 106 | if config.persistant: 107 | # Set up master connection for 600 seconds 108 | self.sm = '-S /tmp/' + _randstr(5) 109 | cmd = 'ssh -N -M {sm:s} -p {ssh_port:d} -q {userhost:s}'.\ 110 | format(sm=self.sm,**config.__dict__) 111 | 112 | self.persistant_proc = subprocess.Popen(shlex.split(cmd)) 113 | 114 | else: 115 | self.sm = '' # Do nothings 116 | 117 | def file_list(self,attributes,empty=None): 118 | """ 119 | Get the file list in B (remote) 120 | """ 121 | attributes = list(set(attributes)) 122 | config = self.config 123 | log = self.log 124 | 125 | # Construct the command 126 | cmd = 'ssh {sm} -p {ssh_port:d} -q {userhost:s} "'.format(sm=self.sm,**config.__dict__) 127 | 128 | if hasattr(config,'PyFiSync_path') and hasattr(config,'remote_program'): 129 | log.add("DEPRECATION WARNING: 'PyFiSync_path' and 'remote_program' are deprecated. Use 'remote_exe'") 130 | # construct the call cmd 131 | if len(config.PyFiSync_path) == 0: 132 | cmd += 'PyFiSync _api file_list"' 133 | else: 134 | cmd += config.remote_program + ' ' 135 | if any(config.PyFiSync_path.endswith('PyFiSync'+ext) for ext in ['','.py']): 136 | cmd += config.PyFiSync_path + ' _api file_list"' 137 | else: 138 | cmd += os.path.join(config.PyFiSync_path,'PyFiSync.py _api file_list"') 139 | else: 140 | cmd += '{} _api file_list"'.format(config.remote_exe) 141 | 142 | remote_config = dict() 143 | 144 | remote_config['path'] = config.pathB 145 | remote_config['excludes'] = list(set(config.excludes)) 146 | remote_config['empty'] = empty 147 | remote_config['attributes'] = list(set(attributes)) 148 | remote_config['copy_symlinks_as_links'] = config.copy_symlinks_as_links 149 | remote_config['use_hash_db'] = config.use_hash_db 150 | 151 | log.add('Calling for remote file list') 152 | 153 | # Encode the config. 
Just in case there is any additional cruft, add 154 | # a starting sentinel 155 | sentinel = _randstr(N=10).encode('ascii') 156 | cmd = shlex.split(cmd) 157 | cmd[-1] += ' ' + sentinel.decode('ascii') # Add the sentinel to the final command 158 | 159 | json_config = sentinel+json.dumps(remote_config,ensure_ascii=False).encode('utf8') 160 | 161 | # Use a tempfile to prevent a buffering issue 162 | outfile = tempfile.NamedTemporaryFile(mode='wb',delete=False) 163 | 164 | proc = subprocess.Popen(cmd,stdin=subprocess.PIPE, 165 | stdout=outfile, 166 | stderr=subprocess.PIPE, 167 | shell=False) 168 | _,err = proc.communicate(json_config) 169 | 170 | if len(err)>0: 171 | err = utils.to_unicode(err) 172 | log.add('Remote Call returned warnings:') 173 | log.space = 4 174 | log.add(err) 175 | log.space = 0 176 | 177 | # Read back the output, find the sentinel, decompress and return the output 178 | with open(outfile.name,'rb') as F: 179 | out = F.read() 180 | out = out[out.find(sentinel) + len(sentinel):] 181 | 182 | try: 183 | out = zlib.decompress(out) 184 | except: 185 | return 186 | 187 | return json.loads(out) 188 | 189 | def apply_queue(self,queue,force=False): 190 | """ 191 | Remote call to apply queue assumeing B is remote 192 | """ 193 | log = self.log 194 | config = self.config 195 | 196 | if len(queue) == 0: 197 | log.add(' >> No remote actions <<') 198 | return 199 | 200 | sentinel = _randstr(N=10).encode('ascii') 201 | 202 | queue_bytes = json.dumps(queue,ensure_ascii=False).encode('utf8') 203 | 204 | # Construct the command 205 | cmd = 'ssh {sm} -p {ssh_port:d} -q {userhost:s} "'.format( 206 | sm=self.sm,**config.__dict__) 207 | 208 | # construct the call cmd 209 | if hasattr(config,'PyFiSync_path') and hasattr(config,'remote_program'): 210 | log.add("DEPRECATION WARNING: 'PyFiSync_path' and 'remote_program' are deprecated. 
Use 'remote_exe'") 211 | if len(config.PyFiSync_path) == 0: 212 | cmd += 'PyFiSync _api apply_queue' 213 | else: 214 | cmd += config.remote_program + ' ' 215 | if any(config.PyFiSync_path.endswith('PyFiSync'+ext) for ext in ['','.py']): 216 | cmd += config.PyFiSync_path + ' _api apply_queue' 217 | else: 218 | cmd += os.path.join(config.PyFiSync_path,'PyFiSync.py _api apply_queue') 219 | else: 220 | cmd += '{} _api apply_queue'.format(config.remote_exe) 221 | 222 | if force: 223 | cmd += ' --force ' 224 | 225 | if not config.backup: 226 | cmd += ' --no-backup ' 227 | 228 | cmd += ' ' + config.pathB + ' {}"'.format(sentinel.decode('ascii')) 229 | 230 | out = '' 231 | err = '' 232 | 233 | log.space=0 234 | log.add('\nApplying queue on remote') 235 | log.prepend = '> ' 236 | 237 | started = False 238 | cmd = shlex.split(cmd) 239 | proc = subprocess.Popen(cmd,stdin=subprocess.PIPE, 240 | stdout=subprocess.PIPE, 241 | stderr=subprocess.PIPE, 242 | shell=False) 243 | 244 | proc.stdin.write(sentinel + queue_bytes) 245 | proc.stdin.close() 246 | 247 | with proc.stdout as stdout: 248 | for line in iter(stdout.readline, b''): 249 | line = utils.to_unicode(line) 250 | if not started and line.find('START>>>>>>>')>=0: 251 | started = True 252 | continue 253 | 254 | if line.find('<<<<<<=0: 255 | started = False 256 | 257 | if started: 258 | log.add(line.rstrip()) 259 | 260 | 261 | with proc.stderr as stderr: 262 | for line in iter(stderr.readline, b''): 263 | err += utils.to_unicode(line) 264 | proc.wait() 265 | log.prepend = '' 266 | if len(err)>0: 267 | log.add('Remote Call returned warnings:') 268 | log.space = 4 269 | log.add(err) 270 | 271 | def transfer(self,tqA2B,tqB2A): 272 | config = self.config 273 | log = self.log 274 | 275 | pwd0 = os.getcwd() 276 | os.chdir(config.pathA) 277 | 278 | # Build the command 279 | cmd = 'rsync -azvi -hh ' \ 280 | + '--keep-dirlinks --copy-dirlinks ' # make directory links behave like they were folders 281 | 282 | if not config.copy_symlinks_as_links: 283 | cmd += '--copy-links ' 284 | 285 | if len(config.userhost) >0: 286 | cmd += '-e "ssh -q -p {p:d} {sm}" '.format(p=config.ssh_port,sm=self.sm) 287 | B = '{userhost:s}:{pathB:s}'.format(**config.__dict__) 288 | else: 289 | B = '{pathB:s}'.format(**config.__dict__) 290 | 291 | cmd += ' --files-from={files:s} {src:s}/ {dest:s}/' 292 | 293 | log.add('(using rsync)') 294 | 295 | if len(tqA2B) > 0: 296 | 297 | # A2B 298 | tmp_file = '/tmp/tqA2B' + _randstr() 299 | 300 | for ix,item in enumerate(tqA2B): # Opperate on the list IN PLACE 301 | item = item.encode('utf-8') 302 | tqA2B[ix] = item 303 | 304 | with open(tmp_file,'wb') as F: 305 | F.write('\n'.encode('utf-8').join(tqA2B)) 306 | 307 | cmdA2B = cmd.format(files=tmp_file,src=config.pathA,dest=B) 308 | 309 | log.space=1 310 | log.add('Running rsync A >>> B') 311 | log.add(' cmd = ' + cmdA2B) 312 | log.space=4 313 | 314 | 315 | proc = subprocess.Popen(cmdA2B, stdout=subprocess.PIPE,shell=True) 316 | with proc.stdout: 317 | for line in iter(proc.stdout.readline, b''): 318 | line = self._proc_final_log(line) 319 | log.add(line) 320 | 321 | proc.wait() 322 | else: 323 | log.space=1 324 | log.add('\nNo A >>> B transfers') 325 | 326 | ######### 327 | 328 | if len(tqB2A) > 0: 329 | # B2A 330 | tmp_file = '/tmp/tqB2A' + _randstr() 331 | for ix,item in enumerate(tqB2A): # Opperate on the list IN PLACE 332 | item = item.encode('utf-8') 333 | tqB2A[ix] = item 334 | 335 | with open(tmp_file,'wb') as F: 336 | F.write('\n'.encode('utf-8').join(tqB2A)) 337 | 338 | cmdB2A = 
cmd.format(files=tmp_file,dest=config.pathA,src=B) 339 | 340 | log.space=1 341 | log.add('\nRunning rsync A <<< B') 342 | log.add(' cmd = ' + cmdB2A) 343 | log.space=4 344 | 345 | proc = subprocess.Popen(cmdB2A, stdout=subprocess.PIPE,shell=True) 346 | with proc.stdout: 347 | for line in iter(proc.stdout.readline, b''): 348 | line = self._proc_final_log(line) 349 | log.add(line) 350 | 351 | proc.wait() 352 | else: 353 | log.space=1 354 | log.add('\nNo A <<< B transfers') 355 | 356 | os.chdir(pwd0) 357 | 358 | 359 | def _proc_final_log(self,line): 360 | line = line.strip() 361 | if len(line) == 0: return None 362 | try: 363 | line = utils.to_unicode(line) 364 | except: 365 | return None 366 | try: 367 | action_path = [i.strip() for i in line.split(' ',1)] 368 | except UnicodeDecodeError: # A bit of a hack but this works to make py2 happy 369 | action_path = [utils.to_unicode(a) for a in line.decode('utf8').split(' ')] 370 | 371 | if len(action_path) != 2: 372 | return 'could not parse action: {:s}'.format(line) 373 | 374 | action = action_path[0] 375 | path = action_path[1] 376 | 377 | action = action.replace('<','>') 378 | 379 | if action.startswith('sent'): 380 | return '\n' + line 381 | if action.startswith('total'): 382 | return line 383 | 384 | if any([action.startswith(d) for d in ['receiving','building']]): 385 | return None 386 | 387 | if action.startswith('>'): return 'Transfer ' + path 388 | if action.startswith('cd'): return 'mkdir ' + path 389 | if action.startswith('cL'): return 'link ' + path 390 | if action.startswith('.'): return None 391 | 392 | return line 393 | 394 | @staticmethod 395 | def cli(argv): 396 | from . import PFSwalk 397 | from . import main 398 | 399 | mode = argv[0] 400 | argv = argv[1:] 401 | if mode == 'file_list': 402 | # Get the sentinel 403 | sentinel = argv[0].encode('ascii') 404 | 405 | # For python3 to read bytes 406 | stdin = sys.stdin 407 | if hasattr(stdin,'buffer'): 408 | stdin = stdin .buffer 409 | stdout = sys.stdout 410 | if hasattr(stdout,'buffer'): 411 | stdout = stdout.buffer 412 | 413 | # Read the config, find and cut up to the sentinel, convert to 414 | # unicode and json load 415 | 416 | 417 | remote_config_bytes = stdin.read() 418 | remote_config_bytes = remote_config_bytes[remote_config_bytes.find(sentinel)+len(sentinel):] 419 | remote_config_bytes = remote_config_bytes.decode('utf8') 420 | remote_config = json.loads(remote_config_bytes) 421 | 422 | # Process the input 423 | path = remote_config['path'] 424 | config = utils.configparser() 425 | config.pathA = path 426 | 427 | empty = remote_config['empty'] 428 | config.copy_symlinks_as_links = remote_config['copy_symlinks_as_links'] 429 | config.excludes = list(set(remote_config['excludes'])) # do *not* use default excludes 430 | config.use_hash_db = remote_config['use_hash_db'] 431 | 432 | # Generate the list. 
This may raise errors so do not start 433 | # capture until later 434 | log = utils.logger(silent=True,path=None) 435 | _tmp = PFSwalk.file_list(path,config,log, 436 | attributes=remote_config['attributes'], 437 | empty=empty, 438 | use_hash_db=config.use_hash_db) 439 | flist = _tmp.files() 440 | 441 | out = json.dumps(flist,ensure_ascii=False) 442 | out = zlib.compress(out.encode('utf8'),9) # Compress it 443 | 444 | stdout.write(sentinel + out) # write the bytes 445 | 446 | elif mode == 'apply_queue': 447 | import getopt # Even though it is "old school" use getopt here 448 | # since it is easier and this interface is never 449 | # exposed to the user 450 | # For python3 to read bytes 451 | stdin = sys.stdin 452 | if hasattr(stdin,'buffer'): 453 | stdin = stdin.buffer 454 | stdout = sys.stdout 455 | if hasattr(stdout,'buffer'): 456 | stdout = stdout.buffer 457 | 458 | try: 459 | opts, args = getopt.getopt(argv, "",['force','no-backup']) 460 | except getopt.GetoptError as err: 461 | print(str(err)) #print error 462 | sys.exit(2) 463 | 464 | path,sentinel = args 465 | 466 | config = utils.configparser() 467 | config.pathA = path 468 | 469 | # Place the config into PyFiSync 470 | main.config = config 471 | 472 | force = False 473 | for opt,val in opts: 474 | if opt == '--force': 475 | force = True 476 | if opt == '--no-backup': 477 | config.backup = False 478 | 479 | log = utils.logger(path=path,silent=False) 480 | 481 | sys.stdout.write('START>>>>>>>\n') 482 | 483 | # Get the queue from stdin 484 | sentinel = sentinel.encode('ascii') 485 | queue = stdin.read() 486 | queue = queue[queue.find(sentinel)+len(sentinel):] 487 | queue = queue.decode('utf8') 488 | 489 | try: 490 | queue = json.loads(queue) 491 | except Exception as E: 492 | sys.stderr.write('could not parse input. 
Error: "{}"'.format(E)) 493 | sys.exit(2) 494 | 495 | print('Successfully loading action queue of {:d} items'.format(len(queue))) 496 | 497 | main.apply_action_queue(path,queue) 498 | 499 | sys.stdout.write('\n<<<<<< 0: 653 | 654 | # A2B 655 | tmp_file = '/tmp/tqA2B' + _randstr() 656 | 657 | with open(tmp_file,'wt') as file: 658 | file.write('\n'.join('/' + t for t in tqA2B)) # Must start with / to be full path for root 659 | 660 | newargs = args[:] 661 | newargs.extend(['-v','--stats-one-line']) 662 | newargs.extend(['copy','--files-from','{}'.format(tmp_file)]) 663 | newargs.extend([config.pathA,config.pathB]) 664 | 665 | log.space=1 666 | log.add('Running rclone A >>> B') 667 | log.space = 4 668 | out = self.call(newargs,echo=True) 669 | 670 | 671 | else: 672 | log.space=1 673 | log.add('\nNo A >>> B transfers') 674 | 675 | log.add('') 676 | 677 | if len(tqB2A) > 0: 678 | 679 | # B2A 680 | tmp_file = '/tmp/tqB2A' + _randstr() 681 | 682 | with open(tmp_file,'wt') as file: 683 | file.write('\n'.join('/' + t for t in tqB2A)) # Must start with / to be full path for root 684 | 685 | newargs = args[:] 686 | newargs.extend(['-v','--stats-one-line']) 687 | newargs.extend(['copy','--files-from','{}'.format(tmp_file)]) 688 | newargs.extend([config.pathB,config.pathA]) 689 | 690 | log.space=1 691 | log.add('Running rclone A <<< B ') 692 | log.space = 4 693 | out = self.call(newargs,echo=True) 694 | #log.add(out) 695 | 696 | else: 697 | log.space=1 698 | log.add('\nNo A <<< B transfers') 699 | 700 | 701 | def call(self,args,echo=False): 702 | """ 703 | Call rclone with the appropriate flags already set 704 | """ 705 | if isinstance(args,(str,unicode)): 706 | args = shlex.split(args) 707 | args = list(args) 708 | env = dict(os.environ) 709 | if self.config.rclone_pw: 710 | args.append('--ask-password=false') 711 | env['RCLONE_CONFIG_PASS'] = self.config.rclone_pw 712 | 713 | cmd = list() 714 | cmd.append(self.config.rclone_executable) 715 | cmd.extend(self.flags) 716 | cmd.extend(args) 717 | 718 | # Use two different methods depending on whether we need to stream 719 | # the result. 
This is to hopefully prevent issues with large 720 | # buffered responses 721 | if self._debug: 722 | txt = ['DEBUG MODE',''] 723 | txt.append('rclone call') 724 | txt.append(' '.join(cmd)) 725 | txt.append(' ') 726 | self.log.add_err('\n'.join(txt)) 727 | 728 | if echo: 729 | stdout = subprocess.PIPE 730 | self.log.add('rclone\n $ ' + ' '.join(cmd) + '\n') 731 | else: 732 | stdout = tempfile.NamedTemporaryFile(mode='wb',delete=False) 733 | 734 | proc = subprocess.Popen(cmd, 735 | stdout=stdout, 736 | stderr=subprocess.STDOUT if not self._debug else subprocess.PIPE, 737 | shell=False, 738 | env=env, 739 | cwd=self.config.pathA) 740 | if echo: 741 | out = [] 742 | with proc.stdout: 743 | for line in iter(proc.stdout.readline, b''): 744 | line = utils.to_unicode(line) 745 | self.log.add(line.rstrip()) 746 | out.append(line) 747 | if self._debug: 748 | err = proc.stderr.read() 749 | else: 750 | _,err = proc.communicate() # Since we are not streaming the output 751 | with open(stdout.name,'rb') as F: 752 | out = utils.to_unicode(F.read()) 753 | proc.wait() 754 | if proc.returncode >0: 755 | self.log.add_err('rclone returned a non-zero exit code') 756 | 757 | if self._debug: 758 | txt = [] 759 | txt.append('OUT:') 760 | txt.append(''.join(out)) 761 | txt.append('ERR:') 762 | txt.append(utils.to_unicode(err)) 763 | txt = '\n'.join(txt) 764 | txt = [''] + txt.split('\n') 765 | txt = '\nDEBUG: '.join(txt) 766 | self.log.add_err(txt) 767 | 768 | 769 | return ''.join(out) 770 | 771 | 772 | def get_remote_interface(config=None,name=None): 773 | if config is None == name is None: 774 | raise ValueError('Must specify config OR name') 775 | 776 | if config is not None: 777 | name = config.remote 778 | 779 | if name == 'rsync': 780 | if len(getattr(config,'userhost','')) == 0: 781 | return None 782 | return ssh_rsync 783 | elif name == 'rclone': 784 | return Rclone 785 | else: 786 | raise ValueError() 787 | 788 | 789 | def _randstr(N=10): 790 | random.seed() 791 | return ''.join(random.choice('abcdefghijklmnopqrstuvwxyz') for _ in xrange(N)) 792 | 793 | 794 | 795 | 796 | 797 | 798 | --------------------------------------------------------------------------------