├── .ci ├── release ├── release-uv └── run ├── .github └── workflows │ └── main.yml ├── .gitignore ├── LICENSE ├── README.md ├── conftest.py ├── doall ├── doc └── options.md ├── mypy.ini ├── old ├── README.md ├── common.py ├── foursquare.py ├── jq_normaliser.py ├── json.py ├── json2.py └── reddit.py ├── pyproject.toml ├── pytest.ini ├── ruff.toml ├── scripts └── apt.sh ├── src └── bleanser │ ├── core │ ├── __init__.py │ ├── __main__.py │ ├── common.py │ ├── compat.py │ ├── ext │ │ ├── dummy_executor.py │ │ ├── logging.py │ │ └── sqlite_dumben.py │ ├── main.py │ ├── modules │ │ ├── extract.py │ │ ├── json.py │ │ ├── sqlite.py │ │ ├── tests │ │ │ └── sqlite.py │ │ └── xml.py │ ├── processor.py │ ├── sqlite.py │ └── utils.py │ ├── modules │ ├── antennapod_android.py │ ├── binary.py │ ├── bluemaestro.py │ ├── bumble_android.py │ ├── chrome.py │ ├── fbmessenger_android.py │ ├── firefox.py │ ├── foursquare.py │ ├── ghexport.py │ ├── goodreads.py │ ├── hinge_android.py │ ├── hpi │ │ ├── fbmessenger_android.py │ │ ├── twitter_android.py │ │ └── whatsapp_android.py │ ├── instagram_android.py │ ├── json_new.py │ ├── kobo.py │ ├── lastfm.py │ ├── monzo.py │ ├── pinboard.py │ ├── pocket.py │ ├── podcastaddict_android.py │ ├── reddit.py │ ├── rescuetime.py │ ├── rescuetime_android.py │ ├── skype_android.py │ ├── sleepasandroid_android.py │ ├── smscalls.py │ ├── spotify.py │ ├── spotifyexport.py │ ├── stackexchange.py │ ├── talon_android.py │ ├── tiktok_android.py │ ├── tinder_android.py │ ├── twitter_android.py │ ├── vk_android.py │ ├── whatsapp_android.py │ └── xml_clean.py │ ├── py.typed │ └── tests │ ├── common.py │ ├── test_binary.py │ └── test_hypothesis.py └── tox.ini /.ci/release: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | ''' 3 | Run [[file:scripts/release][scripts/release]] to deploy Python package onto [[https://pypi.org][PyPi]] and [[https://test.pypi.org][test PyPi]]. 4 | 5 | The script expects =TWINE_PASSWORD= environment variable to contain the [[https://pypi.org/help/#apitoken][PyPi token]] (not the password!). 6 | 7 | The script can be run manually. 8 | It's also running as =pypi= job in [[file:.github/workflows/main.yml][Github Actions config]]. Packages are deployed on: 9 | - every master commit, onto test pypi 10 | - every new tag, onto production pypi 11 | 12 | You'll need to set =TWINE_PASSWORD= and =TWINE_PASSWORD_TEST= in [[https://help.github.com/en/actions/configuring-and-managing-workflows/creating-and-storing-encrypted-secrets#creating-encrypted-secrets][secrets]] 13 | for Github Actions deployment to work. 
14 | ''' 15 | 16 | import os 17 | import sys 18 | from pathlib import Path 19 | from subprocess import check_call 20 | import shutil 21 | 22 | is_ci = os.environ.get('CI') is not None 23 | 24 | def main() -> None: 25 | import argparse 26 | p = argparse.ArgumentParser() 27 | p.add_argument('--test', action='store_true', help='use test pypi') 28 | args = p.parse_args() 29 | 30 | extra = [] 31 | if args.test: 32 | extra.extend(['--repository', 'testpypi']) 33 | 34 | root = Path(__file__).absolute().parent.parent 35 | os.chdir(root) # just in case 36 | 37 | if is_ci: 38 | # see https://github.com/actions/checkout/issues/217 39 | check_call('git fetch --prune --unshallow'.split()) 40 | 41 | dist = root / 'dist' 42 | if dist.exists(): 43 | shutil.rmtree(dist) 44 | 45 | check_call(['python3', '-m', 'build']) 46 | 47 | TP = 'TWINE_PASSWORD' 48 | password = os.environ.get(TP) 49 | if password is None: 50 | print(f"WARNING: no {TP} passed", file=sys.stderr) 51 | import pip_secrets 52 | password = pip_secrets.token_test if args.test else pip_secrets.token # meh 53 | 54 | check_call([ 55 | 'python3', '-m', 'twine', 56 | 'upload', *dist.iterdir(), 57 | *extra, 58 | ], env={ 59 | 'TWINE_USERNAME': '__token__', 60 | TP: password, 61 | **os.environ, 62 | }) 63 | 64 | 65 | if __name__ == '__main__': 66 | main() 67 | -------------------------------------------------------------------------------- /.ci/release-uv: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | ''' 3 | Deploys Python package onto [[https://pypi.org][PyPi]] or [[https://test.pypi.org][test PyPi]]. 4 | 5 | - running manually 6 | 7 | You'll need =UV_PUBLISH_TOKEN= env variable 8 | 9 | - running on Github Actions 10 | 11 | Instead of env variable, relies on configuring github as Trusted publisher (https://docs.pypi.org/trusted-publishers/) -- both for test and regular pypi 12 | 13 | It's running as =pypi= job in [[file:.github/workflows/main.yml][Github Actions config]]. 14 | Packages are deployed on: 15 | - every master commit, onto test pypi 16 | - every new tag, onto production pypi 17 | ''' 18 | 19 | UV_PUBLISH_TOKEN = 'UV_PUBLISH_TOKEN' 20 | 21 | import argparse 22 | import os 23 | import shutil 24 | from pathlib import Path 25 | from subprocess import check_call 26 | 27 | is_ci = os.environ.get('CI') is not None 28 | 29 | def main() -> None: 30 | p = argparse.ArgumentParser() 31 | p.add_argument('--use-test-pypi', action='store_true') 32 | args = p.parse_args() 33 | 34 | publish_url = ['--publish-url', 'https://test.pypi.org/legacy/'] if args.use_test_pypi else [] 35 | 36 | root = Path(__file__).absolute().parent.parent 37 | os.chdir(root) # just in case 38 | 39 | if is_ci: 40 | # see https://github.com/actions/checkout/issues/217 41 | check_call('git fetch --prune --unshallow'.split()) 42 | 43 | # TODO ok, for now uv won't remove dist dir if it already exists 44 | # https://github.com/astral-sh/uv/issues/10293 45 | dist = root / 'dist' 46 | if dist.exists(): 47 | shutil.rmtree(dist) 48 | 49 | # todo what is --force-pep517? 
50 | check_call(['uv', 'build']) 51 | 52 | if not is_ci: 53 | # CI relies on trusted publishers so doesn't need env variable 54 | assert UV_PUBLISH_TOKEN in os.environ, f'no {UV_PUBLISH_TOKEN} passed' 55 | 56 | check_call(['uv', 'publish', *publish_url]) 57 | 58 | 59 | if __name__ == '__main__': 60 | main() 61 | -------------------------------------------------------------------------------- /.ci/run: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eu 3 | 4 | cd "$(dirname "$0")" 5 | cd .. # git root 6 | 7 | if ! command -v sudo; then 8 | # CI or Docker sometimes doesn't have it, so useful to have a dummy 9 | function sudo { 10 | "$@" 11 | } 12 | fi 13 | 14 | # --parallel-live to show outputs while it's running 15 | tox_cmd='run-parallel --parallel-live' 16 | if [ -n "${CI-}" ]; then 17 | # install OS specific stuff here 18 | case "$OSTYPE" in 19 | darwin*) 20 | # macos 21 | brew install libmagic # for python-magic 22 | brew install diffutils # for GNU diff 23 | ;; 24 | cygwin* | msys* | win*) 25 | # windows 26 | # ugh. parallel stuff seems super flaky under windows, some random failures, "file used by other process" and crap like that 27 | tox_cmd='run' 28 | ;; 29 | *) 30 | # must be linux? 31 | : 32 | ;; 33 | esac 34 | fi 35 | 36 | # NOTE: expects uv installed 37 | uv tool run --with tox-uv tox $tox_cmd "$@" 38 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | # see https://github.com/karlicoss/pymplate for up-to-date reference 2 | 3 | name: CI 4 | on: 5 | push: 6 | branches: '*' 7 | tags: 'v[0-9]+.*' # only trigger on 'release' tags for PyPi 8 | # Ideally I would put this in the pypi job... but github syntax doesn't allow for regexes there :shrug: 9 | pull_request: # needed to trigger on others' PRs 10 | # Note that people who fork it need to go to "Actions" tab on their fork and click "I understand my workflows, go ahead and enable them". 11 | workflow_dispatch: # needed to trigger workflows manually 12 | # todo cron? 
13 | inputs: 14 | debug_enabled: 15 | type: boolean 16 | description: 'Run the build with tmate debugging enabled (https://github.com/marketplace/actions/debugging-with-tmate)' 17 | required: false 18 | default: false 19 | 20 | 21 | jobs: 22 | build: 23 | strategy: 24 | fail-fast: false 25 | matrix: 26 | platform: [ubuntu-latest, macos-latest] # , windows-latest] 27 | python-version: ['3.9', '3.10', '3.11', '3.12', '3.13'] 28 | exclude: [ 29 | # min and max version is enough for osx running (it's kinda slow) 30 | {platform: macos-latest, python-version: '3.10'}, 31 | {platform: macos-latest, python-version: '3.11'}, 32 | {platform: macos-latest, python-version: '3.12'}, 33 | ] 34 | runs-on: ${{ matrix.platform }} 35 | 36 | # useful for 'optional' pipelines 37 | # continue-on-error: ${{ matrix.platform == 'windows-latest' }} 38 | 39 | steps: 40 | # ugh https://github.com/actions/toolkit/blob/main/docs/commands.md#path-manipulation 41 | - run: echo "$HOME/.local/bin" >> $GITHUB_PATH 42 | 43 | - uses: actions/checkout@v4 44 | with: 45 | submodules: recursive 46 | fetch-depth: 0 # nicer to have all git history when debugging/for tests 47 | 48 | - uses: actions/setup-python@v5 49 | with: 50 | python-version: ${{ matrix.python-version }} 51 | 52 | - uses: astral-sh/setup-uv@v5 53 | with: 54 | enable-cache: false # we don't have lock files, so can't use them as cache key 55 | 56 | - uses: mxschmitt/action-tmate@v3 57 | if: ${{ github.event_name == 'workflow_dispatch' && inputs.debug_enabled }} 58 | 59 | # explicit bash command is necessary for Windows CI runner, otherwise it thinks it's cmd... 60 | - run: bash .ci/run 61 | 62 | - if: matrix.platform == 'ubuntu-latest' # no need to compute coverage for other platforms 63 | uses: actions/upload-artifact@v4 64 | with: 65 | include-hidden-files: true 66 | name: .coverage.mypy_${{ matrix.platform }}_${{ matrix.python-version }} 67 | path: .coverage.mypy/ 68 | 69 | 70 | pypi: 71 | runs-on: ubuntu-latest 72 | needs: [build] # add all other jobs here 73 | permissions: 74 | # necessary for Trusted Publishing 75 | id-token: write 76 | steps: 77 | # ugh https://github.com/actions/toolkit/blob/main/docs/commands.md#path-manipulation 78 | - run: echo "$HOME/.local/bin" >> $GITHUB_PATH 79 | 80 | - uses: actions/checkout@v4 81 | with: 82 | submodules: recursive 83 | 84 | - uses: actions/setup-python@v5 85 | with: 86 | python-version: '3.10' 87 | 88 | - uses: astral-sh/setup-uv@v5 89 | with: 90 | enable-cache: false # we don't have lock files, so can't use them as cache key 91 | 92 | - name: 'release to test pypi' 93 | # always deploy merged master to test pypi 94 | if: github.event_name != 'pull_request' && github.event.ref == 'refs/heads/master' 95 | run: .ci/release-uv --use-test-pypi 96 | 97 | - name: 'release to pypi' 98 | # always deploy tags to release pypi 99 | # NOTE: release tags are guarded by on: push: tags on the top 100 | if: github.event_name != 'pull_request' && startsWith(github.event.ref, 'refs/tags') 101 | run: .ci/release-uv 102 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.gitignore.io/api/python,emacs 3 | # Edit at https://www.gitignore.io/?templates=python,emacs 4 | 5 | ### Emacs ### 6 | # -*- mode: gitignore; -*- 7 | *~ 8 | \#*\# 9 | /.emacs.desktop 10 | /.emacs.desktop.lock 11 | *.elc 12 | auto-save-list 13 | tramp 14 | .\#* 15 | 16 | # Org-mode 17 | .org-id-locations 18 | 
*_archive 19 | 20 | # flymake-mode 21 | *_flymake.* 22 | 23 | # eshell files 24 | /eshell/history 25 | /eshell/lastdir 26 | 27 | # elpa packages 28 | /elpa/ 29 | 30 | # reftex files 31 | *.rel 32 | 33 | # AUCTeX auto folder 34 | /auto/ 35 | 36 | # cask packages 37 | .cask/ 38 | dist/ 39 | 40 | # Flycheck 41 | flycheck_*.el 42 | 43 | # server auth directory 44 | /server/ 45 | 46 | # projectiles files 47 | .projectile 48 | 49 | # directory configuration 50 | .dir-locals.el 51 | 52 | # network security 53 | /network-security.data 54 | 55 | 56 | ### Python ### 57 | # Byte-compiled / optimized / DLL files 58 | __pycache__/ 59 | *.py[cod] 60 | *$py.class 61 | 62 | # C extensions 63 | *.so 64 | 65 | # Distribution / packaging 66 | .Python 67 | build/ 68 | develop-eggs/ 69 | downloads/ 70 | eggs/ 71 | .eggs/ 72 | lib/ 73 | lib64/ 74 | parts/ 75 | sdist/ 76 | var/ 77 | wheels/ 78 | pip-wheel-metadata/ 79 | share/python-wheels/ 80 | *.egg-info/ 81 | .installed.cfg 82 | *.egg 83 | MANIFEST 84 | 85 | # PyInstaller 86 | # Usually these files are written by a python script from a template 87 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 88 | *.manifest 89 | *.spec 90 | 91 | # Installer logs 92 | pip-log.txt 93 | pip-delete-this-directory.txt 94 | 95 | # Unit test / coverage reports 96 | htmlcov/ 97 | .tox/ 98 | .nox/ 99 | .coverage 100 | .coverage.* 101 | .cache 102 | nosetests.xml 103 | coverage.xml 104 | *.cover 105 | .hypothesis/ 106 | .pytest_cache/ 107 | 108 | # Translations 109 | *.mo 110 | *.pot 111 | 112 | # Scrapy stuff: 113 | .scrapy 114 | 115 | # Sphinx documentation 116 | docs/_build/ 117 | 118 | # PyBuilder 119 | target/ 120 | 121 | # pyenv 122 | .python-version 123 | 124 | # pipenv 125 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 126 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 127 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 128 | # install all needed dependencies. 129 | #Pipfile.lock 130 | 131 | # celery beat schedule file 132 | celerybeat-schedule 133 | 134 | # SageMath parsed files 135 | *.sage.py 136 | 137 | # Spyder project settings 138 | .spyderproject 139 | .spyproject 140 | 141 | # Rope project settings 142 | .ropeproject 143 | 144 | # Mr Developer 145 | .mr.developer.cfg 146 | .project 147 | .pydevproject 148 | 149 | # mkdocs documentation 150 | /site 151 | 152 | # mypy 153 | .mypy_cache/ 154 | .dmypy.json 155 | dmypy.json 156 | 157 | # Pyre type checker 158 | .pyre/ 159 | 160 | # End of https://www.gitignore.io/api/python,emacs 161 | 162 | untracked/ 163 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Dmitrii Gerasimov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | # this is a hack to monkey patch pytest so it handles tests inside namespace packages without __init__.py properly 2 | # without it, pytest can't discover the package root for some reason 3 | # also see https://github.com/karlicoss/pytest_namespace_pkgs for more 4 | 5 | import os 6 | import pathlib 7 | from typing import Optional 8 | 9 | import _pytest.main 10 | import _pytest.pathlib 11 | 12 | # we consider all dirs in repo/ to be namespace packages 13 | root_dir = pathlib.Path(__file__).absolute().parent.resolve() / 'src' 14 | assert root_dir.exists(), root_dir 15 | 16 | # TODO assert it contains package name?? maybe get it via setuptools.. 17 | 18 | namespace_pkg_dirs = [str(d) for d in root_dir.iterdir() if d.is_dir()] 19 | 20 | # resolve_package_path is called from _pytest.pathlib.import_path 21 | # takes a full abs path to the test file and needs to return the path to the 'root' package on the filesystem 22 | resolve_pkg_path_orig = _pytest.pathlib.resolve_package_path 23 | 24 | 25 | def resolve_package_path(path: pathlib.Path) -> Optional[pathlib.Path]: 26 | result = path # search from the test file upwards 27 | for parent in result.parents: 28 | if str(parent) in namespace_pkg_dirs: 29 | return parent 30 | if os.name == 'nt': 31 | # ??? for some reason on windows it is trying to call this against conftest? but not on linux/osx 32 | if path.name == 'conftest.py': 33 | return resolve_pkg_path_orig(path) 34 | raise RuntimeError("Couldn't determine path for ", path) 35 | 36 | 37 | # NOTE: seems like it's not necessary anymore? 38 | # keeping it for now just in case 39 | # after https://github.com/pytest-dev/pytest/pull/13426 we should be able to remove the whole conftest 40 | # _pytest.pathlib.resolve_package_path = resolve_package_path 41 | 42 | 43 | # without patching, the orig function returns just a package name for some reason 44 | # (I think it's used as a sort of fallback) 45 | # so we need to point it at the absolute path properly 46 | # not sure what are the consequences.. maybe it wouldn't be able to run against installed packages? not sure.. 
47 | search_pypath_orig = _pytest.main.search_pypath
48 | 
49 | 
50 | def search_pypath(module_name: str) -> str:
51 |     mpath = root_dir / module_name.replace('.', os.sep)
52 |     if not mpath.is_dir():
53 |         mpath = mpath.with_suffix('.py')
54 |     assert mpath.exists(), mpath  # just in case
55 |     return str(mpath)
56 | 
57 | 
58 | _pytest.main.search_pypath = search_pypath
59 | 
--------------------------------------------------------------------------------
/doall:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | from pathlib import Path
3 | from itertools import islice
4 | from subprocess import check_call, run, PIPE
5 | 
6 | paths = list(sorted(Path('reddit').glob('*.json')))
7 | 
8 | def different(p1: Path, p2: Path, extract: bool) -> bool:
9 |     cmd = [
10 |         './jdiff', '--diff', *(['--extract'] if extract else []), str(p1), str(p2),
11 |     ]
12 |     print(' ' + ' '.join(cmd))
13 |     res = run(cmd, stdout=PIPE)
14 |     assert res.returncode <= 1
15 |     return res.returncode == 1
16 | 
17 | # TODO domination relationship can be tested via diff inclusion
18 | # TODO different normaliser for csv (e.g. lastfm)
19 | # TODO start erroring when there are enough of them, so it's not too annoying?
20 | # TODO or, maybe only error if the last one triggered. tha
21 | 
22 | from_ = 1644
23 | for i, before, after in islice(zip(range(10000000000000), paths, paths[1:]), from_, None):
24 |     print(f'comparing {i} {before.name} vs {after.name}: ')
25 |     extr_diff = different(before, after, extract=True)
26 |     cleanup_diff = different(before, after, extract=False)
27 |     # if there are differences, whatever, keep on going
28 |     if extr_diff == cleanup_diff:
29 |         print(' ok: both normalisers agree ' + ('different' if extr_diff else 'SAME'))
30 |         continue
31 |     print(' ERROR!!!!!')
32 | 
33 | 
34 | # if cleanup_diff:
35 | #     print(' OK: both normalised and cleaned up')
36 | # assert not cleanup_diff
37 | 
--------------------------------------------------------------------------------
/doc/options.md:
--------------------------------------------------------------------------------
1 | An explanation of the `--multiway`/`--prune-dominated` options, modified from [zulip chat](https://memex.zulipchat.com/#narrow/stream/279601-hpi/topic/bleanser/near/258276779)
2 | 
3 | Say you had a bunch of sqlite databases and mapped them onto text dumps using `normalise`. The idea is to figure out which dumps are redundant.
4 | 
5 | Say you've got dumps `A.sql` and `B.sql` -- and you diff them (like literally, [`diff`](https://man7.org/linux/man-pages/man1/diff.1.html))
6 | 
7 | You have the following cases
8 | 
9 | - they are exactly the same (`CmpResult.SAME`), so obviously it's safe to remove `A.sql`
10 | - `B.sql` is a superset of `A.sql` (this is `CmpResult.DOMINATES`). In general it's safe to remove `A.sql` in this case, but cause I'm paranoid it's controlled by `delete_dominated`
11 | - `B.sql` isn't a superset of `A.sql`, i.e. some items present in `A` are missing in `B` (this is `CmpResult.DIFFERENT`). In this case you wanna keep both `A` and `B`. In practice this happens when there is some retention in the database (like with browser history)
12 | - there is also a special value `CmpResult.ERROR`, which also means we want to keep both `A` and `B` (but it's nice to distinguish from `DIFFERENT`)
13 | 
14 | Now in the simplest case... you just go through all pairs of adjacent files, compute these `CmpResult`s, and end up with smth like this
15 | I'll use `<` for 'dominated', `=` for 'same', `!=` for 'different':
16 | 
17 | `A < B < C != D = E < G != H != I != J < K != L < M < N`
18 | 
19 | So in principle, you only need to keep files `C, G, H, I, K, N` and it will still give you a complete set of data when you merge it
20 | 
21 | Alternatively, you keep `A, C, D, G, H, I, J, K, L, N` if the `delete_dominated` flag is `False`
22 | 
23 | This is called 'two-way' comparison, cause you just consider pairs of adjacent files, so it would be `MULTIWAY = False`
24 | 
25 | Multiway comparison is easier to show with an example
26 | 
27 | Say we've got these sets of items
28 | 
29 | ```
30 | {A B C} # 0
31 | {B C D} # 1
32 | {C D E} # 2
33 | {X Y Z} # 3
34 | ```
35 | 
36 | If we do two-way comparisons, we'll keep them all because none of them fully contains the previous neighbour.
37 | 
38 | However you may notice that the union of `0` and `2` completely contains `1`. This is what 'multiway' mode does -- trying to find 'pivot' elements which contain the sets 'between' them.
39 | 
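To make the 'multiway' idea concrete, here is a minimal sketch of the containment check (an illustration only, not the actual bleanser implementation; the function name and the plain-set representation are made up for the example):

```python
def covered_by_pivots(pivot_a: set, pivot_b: set, between: list) -> bool:
    # the items sitting between two pivots are prunable if the union
    # of the pivots contains every one of them
    union = pivot_a | pivot_b
    return all(s <= union for s in between)


dumps = [
    {'A', 'B', 'C'},  # 0
    {'B', 'C', 'D'},  # 1
    {'C', 'D', 'E'},  # 2
    {'X', 'Y', 'Z'},  # 3
]

# pivots 0 and 2 together contain dump 1, so 1 is redundant and can be pruned
assert covered_by_pivots(dumps[0], dumps[2], dumps[1:2])
# pivots 1 and 3 don't contain dump 2 (it has 'E'), so 2 has to be kept
assert not covered_by_pivots(dumps[1], dumps[3], dumps[2:3])
```

The real implementation works on diffs of the normalised text dumps rather than Python sets, but the pruning criterion is the same idea.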
--------------------------------------------------------------------------------
/mypy.ini:
--------------------------------------------------------------------------------
1 | [mypy]
2 | pretty = True
3 | show_error_context = True
4 | show_column_numbers = True
5 | show_error_end = True
6 | 
7 | check_untyped_defs = True
8 | 
9 | # see https://mypy.readthedocs.io/en/stable/error_code_list2.html
10 | warn_redundant_casts = True
11 | strict_equality = True
12 | warn_unused_ignores = True
13 | enable_error_code = deprecated,redundant-expr,possibly-undefined,truthy-bool,truthy-iterable,ignore-without-code,unused-awaitable
14 | 
15 | # an example of suppressing
16 | # [mypy-my.config.repos.pdfannots.pdfannots]
17 | # ignore_errors = True
18 | 
--------------------------------------------------------------------------------
/old/README.md:
--------------------------------------------------------------------------------
1 | some old normalisers, might be useful for reference
2 | 
--------------------------------------------------------------------------------
/old/common.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 | from pathlib import Path
3 | from typing import NamedTuple
4 | 
5 | 
6 | class CmpResult(Enum):
7 |     DIFFERENT = 'different'
8 |     SAME = 'same'
9 |     DOMINATES = 'dominates'
10 |     ERROR = 'error'
11 | 
12 | 
13 | class Diff(NamedTuple):
14 |     cmp: CmpResult
15 |     diff: bytes
16 | 
17 | 
18 | class Relation(NamedTuple):
19 |     before: Path
20 |     diff: Diff
21 |     after: Path
22 | 
--------------------------------------------------------------------------------
/old/foursquare.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | from pathlib import Path
3 | 
4 | from jq_normaliser import JqNormaliser, Filter, pipe, jdel as d, jq_del_all
5 | 
6 | 
7 | def _normalise_coordinates():
8 |     return [
9 |         # TODO shit. take - into account??
10 |         '(.. | .lat?) |= (tostring | .[0:4])',
11 |         '(.. | .lng?) |= (tostring | .[0:4])',
12 |     ]
13 | 
14 | 
15 | 
16 | class FsqNormaliser(JqNormaliser):
17 |     def __init__(self, *args, **kwargs) -> None:
18 |         super().__init__(*args, **kwargs, logger_tag='fsq-normaliser', delete_dominated=True, keep_both=False)  # type: ignore
19 | 
20 |     # ok, this one can only delete items or do trivial rewrites
21 |     # if we map we might lose data here!
22 | def cleanup(self) -> Filter: 23 | return pipe( 24 | d('.[] | (.meta, .notifications)'), 25 | 26 | d('.[].response.checkins.items[] | (.isMayor, .venue, .likes, .sticker, .like, .ratedAt)'), 27 | 28 | jq_del_all( 29 | 'contact', 30 | ), 31 | jq_del_all( 32 | 'editableUntil', 33 | 'prefix', 34 | 'consumerId', 35 | ), 36 | jq_del_all( 37 | 'lastName', 38 | ), 39 | *_normalise_coordinates(), 40 | # TODO shit. again, we want to assert... 41 | ) 42 | # TODO shit. lat and lng jump randomly.. can we trim them? 43 | # return '.' 44 | # return 'sort_by(.date) | map(map_values(ascii_downcase))' 45 | 46 | def extract(self) -> Filter: 47 | return pipe( 48 | 'map_values(.response)', 49 | 'map_values(.checkins)', 50 | 'map_values(.items)', 51 | '.[]', 52 | 'map({id})', # venue: .venue.name })', just keep venue id?? 53 | *_normalise_coordinates(), 54 | # TODO not sure if we need to sort? 55 | ) 56 | 57 | 58 | 59 | def main(): 60 | norm = FsqNormaliser() 61 | norm.main() 62 | 63 | 64 | if __name__ == '__main__': 65 | main() 66 | -------------------------------------------------------------------------------- /old/json.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from __future__ import annotations 3 | 4 | from contextlib import contextmanager 5 | from itertools import tee 6 | import orjson as json 7 | from pathlib import Path 8 | from typing import Iterator, List 9 | 10 | from bleanser.core.common import logger 11 | from bleanser.core.utils import Json 12 | from bleanser.core.processor import BaseNormaliser 13 | 14 | 15 | from plumbum import local # type: ignore 16 | 17 | 18 | jq = local['jq'] 19 | 20 | 21 | # TODO hmm maybe I just want to use https://github.com/tomnomnom/gron ? 22 | # although would be tricky to chop off the indices... 23 | 24 | # we replace numbers with placeholders since otherwise it's too unstable 25 | # TODO ... not sure if it should be the default 26 | JQ_PATHS = ''' 27 | paths(scalars) as $p 28 | | [ ( [ $p[] | if type == "number" then "X" else tostring end ] | join(".") ) 29 | , ( getpath($p) | tojson ) 30 | ] 31 | | join(": ") 32 | ''' 33 | 34 | import hashlib 35 | 36 | from typing import Iterator, Tuple, Iterable 37 | JPath = str 38 | JVal = str 39 | JHash = str 40 | # TODO ugh. it's a bit too elaborate way to do structural diff, really... 41 | # TODO fuck. this is quite slow, but not sure what should I do about it... 42 | # how to make it work with process pool executor?? 43 | def _aspaths(js: Json) -> Tuple[JHash, Iterable[Tuple[JPath, JVal]]]: 44 | if isinstance(js, (str, int, float, bool, type(None))): 45 | # TODO json dumps? 46 | # TODO do root values really need hash? 47 | vhash = hashlib.md5(str(js).encode('utf8')).hexdigest()[:7] 48 | return (vhash, [('', str(js))]) 49 | 50 | sep = '.' # todo customize? 51 | 52 | # TODO ugh. not very iterative.. 53 | # I guess can't really be, because need information about all siblings before proceeding? 54 | if isinstance(js, list): 55 | ress = [] 56 | for i, c in enumerate(js): 57 | k = str(i) 58 | chash, cres = _aspaths(c) 59 | 60 | for p, v in cres: 61 | cp = chash 62 | ress.append((cp + ('' if len(p) == 0 else (sep + p)), v)) 63 | # TODO list shouldn't be hashed?? 64 | # TODO shit... could this be a problem for something like tags? 65 | return ('', ress) 66 | 67 | if isinstance(js, dict): 68 | # TODO or maybe two pass? then won't need to cache as much? 69 | # TODO could optimize and avoid combining the very top level hash? 
70 | ress = [] 71 | hd: dict[str, str] = {} 72 | for k, c in sorted(js.items()): 73 | cp = k 74 | 75 | chash, cres = _aspaths(c) 76 | hd[k] = chash 77 | 78 | for p, v in cres: 79 | ress.append((cp + ('' if len(p) == 0 else (sep + p)), v)) 80 | 81 | dhash = hashlib.md5(json.dumps(hd)).hexdigest()[:7] 82 | return (dhash, ress) 83 | 84 | raise RuntimeError(js, type(js)) 85 | 86 | 87 | def aspaths(js: Json) -> Iterator[str]: 88 | _, res = _aspaths(js=js) 89 | for k, v in res: 90 | yield k + ' : ' + v 91 | 92 | 93 | def test_aspaths() -> None: 94 | j = { 95 | 'root': [ 96 | dict(a=1,b=1), 97 | dict(a=1,b=0), 98 | dict(a=0,b=1), 99 | dict(a=0,b=0), 100 | dict(a=2,b=2), 101 | 102 | dict(a=1,b=0), 103 | dict(a=1,b=1), 104 | ], 105 | 'boop': {'beep': [123, 456]}, 106 | } 107 | paths = list(aspaths(j)) 108 | assert paths == [ 109 | 'boop.beep.202cb96 : 123', 110 | 'boop.beep.250cf8b : 456', 111 | 'root.824ad40.a : 1', 112 | 'root.824ad40.b : 1', 113 | 'root.8a5a377.a : 1', 114 | 'root.8a5a377.b : 0', 115 | 'root.23bbe1a.a : 0', 116 | 'root.23bbe1a.b : 1', 117 | 'root.213c309.a : 0', 118 | 'root.213c309.b : 0', 119 | 'root.8b165c4.a : 2', 120 | 'root.8b165c4.b : 2', 121 | 'root.8a5a377.a : 1', 122 | 'root.8a5a377.b : 0', 123 | 'root.824ad40.a : 1', 124 | 'root.824ad40.b : 1', 125 | ] 126 | 127 | 128 | 129 | def _aspaths_aux(js: Json) -> List[str]: 130 | return list(aspaths(js)) 131 | 132 | 133 | class JsonNormaliser(BaseNormaliser): 134 | # filter out additions; keep the rest 135 | DIFF_FILTER = '> ' 136 | 137 | MULTIWAY = True 138 | # TODO delete dominated 139 | 140 | def cleanup(self, j: Json) -> None: 141 | # TODO not sure if should modify in place? 142 | pass 143 | 144 | @contextmanager 145 | def do_cleanup(self, path: Path, *, wdir: Path) -> Iterator[Path]: 146 | # TODO call .unpacked 147 | 148 | # todo copy paste from SqliteNormaliser 149 | path = path.absolute().resolve() 150 | cleaned = wdir / Path(*path.parts[1:]) / (path.name + '-cleaned') 151 | cleaned.parent.mkdir(parents=True, exist_ok=True) 152 | 153 | with path.open('r') as fp: 154 | j = json.loads(fp.read()) 155 | self.cleanup(j) 156 | # todo sort keys? not sure... 157 | # TODO huh. jq version is almost order of magnitude slower??? 
158 | # js = json.dumps(j) # , indent=2, sort_keys=True) 159 | # cmd = jq['-r', JQ_PATHS] 160 | # jq_lines = (cmd << js )().splitlines() 161 | jq_lines = _aspaths_aux(j) 162 | # # move to top 163 | # from concurrent.futures import ProcessPoolExecutor as Pool 164 | # pool = Pool(8) 165 | # # 166 | # fut = pool.submit(_aspaths_aux, j) 167 | # jq_lines = fut.result() 168 | 169 | # TODO later 170 | cleanup_jq_dump = getattr(self, 'cleanup_jq_dump', None) 171 | if cleanup_jq_dump is not None: 172 | cleanup_jq_dump(jq_lines) 173 | with cleaned.open('w') as fp: 174 | for line in jq_lines: 175 | print(line, file=fp) 176 | yield cleaned 177 | 178 | 179 | 180 | def test_json_normaliser_1(tmp_path: Path) -> None: 181 | j = [ 182 | dict(a=1,b=1), 183 | dict(a=1,b=0), 184 | dict(a=0,b=1), 185 | dict(a=0,b=0), 186 | dict(a=2,b=2), 187 | 188 | dict(a=1,b=0), 189 | dict(a=1,b=1), 190 | ] 191 | i = tmp_path / 'input.json' 192 | i.write_text(json.dumps(j)) 193 | 194 | n = JsonNormaliser() 195 | with n.do_cleanup(i, wdir=tmp_path) as c: 196 | res = c.read_text() 197 | 198 | lines = res.splitlines() 199 | assert len(lines) == 14, lines 200 | 201 | lset = set(lines) 202 | # we want to keep these unique 'rows' 203 | assert len(lset) == 10, (lines, lset) 204 | 205 | 206 | def test_json_normaliser_2(tmp_path: Path) -> None: 207 | # TODO ok -- so we need to mark certain 'levels' as rolling instead? uggggh 208 | j = [ 209 | ['b', 1], 210 | ['b', 0], 211 | ['a', 1], 212 | ['a', 0], 213 | ['c', 2], 214 | 215 | ['b', 0], 216 | ['b', 1], 217 | ] 218 | i = tmp_path / 'input.json' 219 | i.write_text(json.dumps(j)) 220 | 221 | n = JsonNormaliser() 222 | with n.do_cleanup(i, wdir=tmp_path) as c: 223 | res = c.read_text() 224 | 225 | lines = res.splitlines() 226 | assert len(lines) == 14, lines 227 | 228 | lset = set(lines) 229 | # TODO right, this won't work now... because we don't want to hash the whole list... 230 | # assert len(lset) == 10, (lines, lset) 231 | 232 | 233 | # can work as generic json processor 234 | if __name__ == '__main__': 235 | from bleanser.core import main 236 | main(Normaliser=JsonNormaliser) 237 | 238 | # just for convenience 239 | from .utils import Json 240 | -------------------------------------------------------------------------------- /old/json2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from pathlib import Path 3 | import sys 4 | from subprocess import check_call 5 | 6 | import json 7 | 8 | # todo hmm, seems that there isn't that much perf difference, at least on hyperfine 9 | # although on the profile, when running with orjson, seems to finish faster?? 10 | # maybe double check later 11 | # import orjson as json 12 | 13 | 14 | # TODO warn about some data being cleaned, refer to the sources 15 | def pp_github(j): 16 | # todo later compare to jq somehow? but doubt it'd be faster 17 | from itertools import chain 18 | 19 | # TODO hmm 20 | # what should we do with repos :::: clones thing? 21 | # maybe we could check domination relationship in a more clever way somehow?... 22 | # e.g. here clones -> count ??? 
23 | # ': 0, 'uniques': 0, 'views': []}, 'clones': {'count': 29, 'uniques': 14, 'clones': [{'timestamp': '2021-11-29T00:00:00Z'| 'pull': True}, 'traffic': {'views': {'count': 0, 'uniques': 0, 'views': []}, 'clones': {'count': 27, 'uniques': 13, 'cl 24 | # , 'count': 3, 'uniques': 2}, {'timestamp': '2021-11-30T00:00:00Z', 'count': 2, 'uniques': 1}, {'timestamp': '2021-12-01T| ones': [{'timestamp': '2021-11-29T00:00:00Z', 'count': 1, 'uniques': 1}, {'timestamp': '2021-11-30T00:00:00Z', 'count': 25 | 26 | # TODO not sure what to do with it... 27 | # for x in j['repos']: 28 | # del x['traffic'] 29 | 30 | for x in chain(j['watched'], j['starred']): 31 | for key in [ 32 | 'watchers', 'stargazers_count', 'watchers_count', 33 | 34 | # updated_at -- seems that it's updated every time there is a star etc... 35 | 'updated_at', 36 | 'forks', 'forks_count', 37 | 38 | 'open_issues', 'open_issues_count', 39 | 40 | # eh, not sure about these... 41 | 'pushed_at', 42 | 'size', 43 | ]: 44 | del x[key] 45 | 46 | 47 | def pp_spotify(j): 48 | from bleanser.modules.spotifyexport import Normaliser 49 | n = Normaliser(path='meh') 50 | # todo method to delete multiple keys 51 | n.cleanup(j=j) 52 | 53 | 54 | # TODO need to unflatten playlists somehow 55 | # hmm basically any list-like thing is 'suspicious', because it kinda means denormalised struct 56 | pl2 = [] 57 | for x in j['playlists']: 58 | for t in x['tracks']: 59 | q = {k: v for k, v in x.items()} 60 | q['tracks'] = t 61 | pl2.append(q) 62 | j['playlists'] = pl2 63 | # hmm this is annoying... shared playlists are updating literally every day? 64 | 65 | 66 | def preprocess(*, j, name): 67 | # todo not sure how defensive should be? 68 | 69 | # todo not sure if there is a better way 70 | if '/github-events/' in name: 71 | pp_github(j) 72 | elif '/spotify/' in name: 73 | pp_spotify(j) 74 | 75 | 76 | def process(fo, *, name) -> None: 77 | data = fo.read() 78 | # todo orjson supports memoryview?? 79 | j = json.loads(data) 80 | # todo would be nice to close it here 81 | 82 | preprocess(j=j, name=name) 83 | 84 | if isinstance(j, list): 85 | res = {'': j} # meh 86 | else: 87 | assert isinstance(j, dict), j 88 | res = j 89 | 90 | for k, v in res.items(): 91 | if not isinstance(v, list): 92 | # something like 'profile' data in hypothesis could be a dict 93 | # something like 'notes' in rescuetime could be a scalar (str) 94 | v = [v] # meh 95 | assert isinstance(v, list), (k, v) 96 | for i in v: 97 | # todo dump json here for i; sort keys? 98 | print(f'{k} ::: {i}') 99 | print('ok') 100 | 101 | 102 | def compare(p1: str, p2: str): 103 | assert p1 != '-' and p2 != '-' 104 | # hacky way to compare 105 | def cc(p: str): 106 | if p.endswith('.xz'): 107 | cat = 'xzcat' 108 | else: 109 | cat = 'cat' 110 | # {cat} {p} | {__file__} - 111 | return f'{__file__} {p} | sort' 112 | c1 = cc(p1) 113 | c2 = cc(p2) 114 | # wrap = ' -c "windo set wrap" '# -- eh, not super convenient? 115 | wrap = '' 116 | # TODO pipefail? doesn't work well.. 
117 | cmd = f'vimdiff {wrap} <({c1}) <({c2})' 118 | check_call(cmd, shell=True, executable='/bin/bash') 119 | 120 | 121 | def main() -> None: 122 | import argparse 123 | p = argparse.ArgumentParser() 124 | p.add_argument('path1') 125 | p.add_argument('path2', nargs='?') 126 | p.add_argument('--first' , required=False, type=int) 127 | p.add_argument('--second', required=False, type=int) 128 | args = p.parse_args() 129 | 130 | p1 = args.path1 131 | p2 = args.path2 132 | 133 | # TODO compare performance fo handling compressed and uncompressed files 134 | from bleanser.core.kompress import CPath 135 | 136 | assert p1 is not None 137 | 138 | if p2 is not None: 139 | compare(p1=p1, p2=p2) 140 | return 141 | 142 | # handle single file 143 | if p1 == '-': 144 | process(fo=sys.stdin) 145 | return 146 | 147 | pp = Path(p1).absolute() 148 | 149 | if pp.is_dir(): 150 | files = list(sorted(pp.iterdir())) 151 | 152 | first = args.first; assert first is not None 153 | 154 | second = args.second 155 | if second is None: 156 | second = first + 1 157 | assert second < len(files), len(files) 158 | 159 | p1 = str(files[first ]) 160 | p2 = str(files[second]) 161 | compare(p1=p1, p2=p2) 162 | else: 163 | path = str(pp) 164 | with CPath(path).open() as fo: 165 | process(fo=fo, name=path) 166 | 167 | 168 | if __name__ == '__main__': 169 | main() 170 | -------------------------------------------------------------------------------- /old/reddit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from pathlib import Path 3 | 4 | from jq_normaliser import JqNormaliser, Filter, pipe, jdel as d, Filter2 5 | from jq_normaliser import CmpResult # eh, just to bring into scope for backup script 6 | 7 | from kython.kjq import del_all_kjson 8 | 9 | 10 | class RedditNormaliser(JqNormaliser): 11 | def __init__(self, *args, **kwargs) -> None: 12 | super().__init__(*args, **kwargs, logger_tag='reddit-normaliser', delete_dominated=False, keep_both=True) # type: ignore 13 | # TODO wonder if there are many dominated ? 14 | 15 | def cleanup(self) -> Filter: 16 | ignore_keys = ( 17 | 'allow_discovery', 18 | 'event_start', 'event_end', 'event_is_live', 19 | 'allowed_galleries', 20 | 'top_awarded_type', 21 | 'treatment_tags', 22 | # TODO 'edited'? 23 | 'collapsed', 'collapsed_reason', # todo potentially interesting? 
24 | 'is_crosspostable_subreddit', 25 | 'og_description', 'og_title', 26 | 'pref_no_profanity', 'pref_geopopular', 'pref_top_karma_subreddits', 27 | 'steward_report', 28 | 'is_video', 29 | 'rte_mode', 30 | 'accept_chats', 31 | 'accept_pms', 32 | 'treatment_tags', 33 | 'password_set', 34 | 'allow_polls', 35 | 'allow_chat_post_creation', 36 | 'is_chat_post_feature_enabled', 37 | 'linked_identities', 38 | 'upvote_ratio', 39 | 'icon_img', 40 | 'icon_size', 41 | 'icon_url', 42 | 'icon_name', 43 | 44 | 'thumbnail_height', 45 | 46 | 'crosspost_parent_list', 47 | 'primary_color', 48 | 'archived', 49 | 'suggested_sort', 50 | 'over_18', 51 | 'over18', 52 | 'allow_videos', 53 | 'allow_images', 54 | 'allow_videogifs', 55 | 56 | 'comment_score_hide_mins', 57 | 'wiki_enabled', 58 | 'suggested_sort', 59 | 'suggested_comment_sort', 60 | 'header_img', 61 | 'header_size', 62 | 'has_menu_widget', 63 | 'banner_background_color', 64 | 'banner_background_image', 65 | 'banner_img', 66 | 'banner_size', 67 | 'mobile_banner_image', 68 | 69 | 'community_icon', 70 | 'no_follow', 71 | 'submission_type', 72 | 'is_crosspostable', 73 | 74 | 'link_flair_enabled', 75 | 'link_flair_position', 76 | 'link_flair_css_class', 77 | 'link_flair_template_id', 78 | 'link_flair_text', 79 | 'link_flair_type', 80 | 'link_flair_richtext', 81 | 82 | 'post_hint', 83 | 'is_robot_indexable', 84 | 'content_categories', 85 | 86 | 'parent_whitelist_status', 87 | 'pwls', 88 | 'whitelist_status', 89 | 'wls', 90 | 'show_media', 91 | 'spoilers_enabled', 92 | 'collapse_deleted_comments', 93 | 'key_color', 94 | 'can_assign_user_flair', 95 | 'emojis_enabled', 96 | 'author_patreon_flair', 97 | "author_flair_richtext", 98 | 'author_flair_text', 99 | 'author_flair_background_color', 100 | 'author_flair_text_color', 101 | 'author_flair_type', 102 | 'author_flair_css_class', 103 | 'author_flair_template_id', 104 | 105 | "original_content_tag_enabled", 106 | 'emojis_custom_size', 107 | 108 | 'gilded', 109 | 'gildings', 110 | 'gid_1', 111 | 'gid_2', 112 | 'gid_3', 113 | 'media_metadata', 114 | 'can_assign_link_flair', 115 | 'advertiser_category', 116 | 'can_gild', 117 | 'user_reports', 118 | 'author', 119 | 'author_fullname', 120 | 'report_reasons', 121 | 'discussion_type', 122 | 'allow_live_comments', 123 | 'score_hidden', 124 | 125 | 'submit_link_label', 126 | 'submit_text_label', 127 | 'header_title', 128 | # TODO reuse it in reddit backup script? 129 | 130 | 'secure_media', 131 | 'domain', 132 | 133 | 'audience_target', 134 | 'free_form_reports', 135 | 136 | 'restrict_commenting', 137 | 'restrict_posting', 138 | 'show_media_preview', 139 | 140 | 'is_favorited', 141 | 'is_subscriber', 142 | 143 | 'oembed', 144 | 'media_embed', 145 | 'secure_media_embed', 146 | 'stickied', 147 | 'owner_id', 148 | 149 | 'all_awardings', 150 | 151 | 'total_awards_received', 152 | 153 | 'likes', 154 | 'send_replies', 155 | 'is_self', 156 | 157 | 'url', # ugh. changed from www.reddit.... to link without reddit domain 158 | '_comments', 159 | 160 | "user_flair_richtext", 161 | "user_flair_template_id", 162 | "user_flair_type", 163 | "user_flair_text_color", 164 | "associated_award", 165 | 166 | 'author_premium', 167 | 'new', 168 | 'awarders', 169 | 'hide_score', 170 | ) 171 | # TODO ugh, some issue with coins null vs 0?? 172 | 173 | # NOTE this step took _really_ long.... e.g. 20 secs vs 0.5 sec for the rest of steps 174 | # dq.append(jq_del_all(*ignore_keys)) 175 | 176 | dq = [] 177 | dq.append('. + if has("inbox") then {} else {"inbox": []} end') # ugh. 
filling default value 178 | dq.append(d('.saved[].link_url')) # weird, changes for no reason sometimes... 179 | sections = [ 180 | 'saved', 181 | 'comments', 182 | 'upvoted', 183 | 'downvoted', 184 | 'submissions', 185 | 'inbox', 186 | ] 187 | dq.extend([ 188 | d(f'''.{section}[] | ( 189 | .saved, .preview, .body_html, .score, .ups, .description_html, .subreddit_type, .subreddit_subscribers, .selftext_html, .num_comments, .num_crossposts, .thumbnail, .created, .media, 190 | .locked 191 | 192 | )''') for section in sections 193 | ]) 194 | dq.append( 195 | d('.multireddits[] | (.description_html, .created, .owner, .num_subscribers)') 196 | ) 197 | dq.append( 198 | d('''(.profile.subreddit, .subreddits[]) | ( 199 | .disable_contributor_requests 200 | )''') 201 | ) 202 | dq.append( 203 | d('''.profile | ( 204 | .created, 205 | .has_mail, 206 | .inbox_count, 207 | .can_create_subreddit, 208 | .five_follower_send_message, 209 | .features, 210 | .has_gold_subscription, 211 | .has_stripe_subscription, 212 | .has_paypal_subscription, 213 | .has_subscribed_to_premium, 214 | .has_android_subscription, 215 | .has_ios_subscription, 216 | .next_coin_drip_date, 217 | .seen_premium_ftux, 218 | .seen_premium_adblock_modal, 219 | .in_redesign_beta, 220 | .gold_expiration, 221 | .is_gold 222 | )'''), 223 | ) 224 | # del_preview = lambda s: ddel(f'.{s} | .[]') 225 | # dq.extend(del_preview(s) for s in sections) 226 | # TODO shit, that's gonna remove lots of subreddits 227 | # I should also check that result contains reasonable favorites?? 228 | # TODO not sure if it's necessary to sort.. | sort_by(.id) 229 | # dq.append('.subreddits | map(del(.subscribers, .videostream_links_count, .description_html))') # ddel('(.subreddits) | .)') # | del(.videostream_links_count) | del(.description_html) 230 | dq.extend([ 231 | d('.subreddits[] | (.created, .subscribers, .description, .description_html, .videostream_links_count, .submit_text, .submit_text_html)'), 232 | ]) 233 | return Filter2( 234 | jq=pipe(*dq), 235 | extra_del_all=ignore_keys, 236 | ) 237 | 238 | def extract(self) -> Filter: 239 | return pipe( 240 | # TODO FIXME this should be assertive on field existence 241 | 242 | # TODO ehh. dunno about link/comment karma.. it's fuzzy anyway? 243 | # maybe try removing it once and see the difference 244 | 245 | # hmm, created changes all the time for some reason starting from 20181124201020 246 | # https://www.reddit.com/r/redditdev/comments/29991t/whats_the_difference_between_created_and_created/ciiuk24/ 247 | # ok, it's broken 248 | '''.profile |= { 249 | id, 250 | created_utc, 251 | name, 252 | coins, 253 | comment_karma, 254 | link_karma, 255 | subreddit: .subreddit | {subscribers} 256 | }''', 257 | '.comments |= map({id, created_utc, body})', 258 | '.multireddits |= map({id, created_utc, name, subreddits: .subreddits | map_values(.display_name) })', 259 | '.saved |= map({id, created_utc, title, body, selftext})', 260 | '.submissions |= map({id, created_utc, title, selftext})', 261 | '.subreddits |= map({id, created_utc, title, display_name, public_description, subreddit_type})', 262 | '.upvoted |= map({id, created_utc, title, selftext})', 263 | '.downvoted |= map({id, created_utc, title, selftext})', 264 | 265 | '. + if has("inbox") then {} else {"inbox": []} end', # ugh. filling default value 266 | 267 | '.inbox |= map({id, created_utc, title, selftext, body})', 268 | ) 269 | 270 | # 2 styles of normalising: 271 | # first is extracting stuff we expect to see. 
this is nicer and gives the idea if something actually changed 272 | # second is cleaning up stuff that we don't need 273 | 274 | 275 | 276 | 277 | def main(): 278 | norm = RedditNormaliser() 279 | norm.main(glob='*.json.xz') 280 | 281 | 282 | if __name__ == '__main__': 283 | main() 284 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # see https://github.com/karlicoss/pymplate for up-to-date reference 2 | [project] 3 | dynamic = ["version"] # version is managed by build backend 4 | name = "bleanser" 5 | dependencies = [ 6 | "more-itertools" , 7 | "typing-extensions", 8 | "click" , # nicer cli 9 | "plumbum" , # nicer command composition/piping 10 | "kompress" , # for compressed files processing (TODO potentially could be optional if they don't use compressed files? but how to detect if they are compressed.. maybe via libmagic?) 11 | ] 12 | requires-python = ">=3.9" 13 | 14 | ## these need to be set if you're planning to upload to pypi 15 | # description = "TODO" 16 | license = {file = "LICENSE"} 17 | authors = [ 18 | {name = "Dima Gerasimov (@karlicoss)", email = "karlicoss@gmail.com"}, 19 | ] 20 | maintainers = [ 21 | {name = "Dima Gerasimov (@karlicoss)", email = "karlicoss@gmail.com"}, 22 | ] 23 | # keywords = [] 24 | # # see: http://pypi.python.org/pypi?%3Aaction=list_classifiers 25 | # classifiers = [ 26 | # ] 27 | 28 | 29 | [project.urls] 30 | Homepage = "https://github.com/karlicoss/bleanser" 31 | ## 32 | 33 | 34 | [project.optional-dependencies] 35 | extra = [ 36 | "python-magic", # more reliable mimetype detection -- requires extra binaries, so perhaps best to keep optional 37 | "logzero" , # nicer logging, but can work without it 38 | ] 39 | json = [ 40 | "orjson", # faster json processing (required if you use json-derived modules) 41 | ] 42 | xml = [ 43 | "lxml", # for handling xml files (required if you use xml-derived modules) 44 | ] 45 | zstd = [ 46 | "kompress[zstd]", 47 | ] 48 | HPI = [ # for bleanser.modules.hpi 49 | "HPI", 50 | ] 51 | 52 | [dependency-groups] 53 | testing = [ 54 | "pytest", 55 | "ruff", 56 | "mypy", 57 | "lxml", # for mypy html coverage 58 | 59 | "types-lxml", 60 | ] 61 | 62 | 63 | # workaround for error during uv publishing 64 | # see https://github.com/astral-sh/uv/issues/9513#issuecomment-2519527822 65 | [tool.setuptools] 66 | license-files = [] 67 | 68 | 69 | [build-system] 70 | requires = ["hatchling", "hatch-vcs"] 71 | build-backend = "hatchling.build" 72 | 73 | # unfortunately have to duplicate project name here atm, see https://github.com/pypa/hatch/issues/1894 74 | [tool.hatch.build.targets.wheel] 75 | packages = ["src/bleanser"] 76 | 77 | [tool.hatch.version] 78 | source = "vcs" 79 | 80 | [tool.hatch.version.raw-options] 81 | version_scheme = "python-simplified-semver" 82 | local_scheme = "dirty-tag" 83 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | # discover files that don't follow test_ naming. Useful to keep tests along with the source code 3 | python_files = *.py 4 | 5 | # this setting only impacts package/module naming under pytest, not the discovery 6 | consider_namespace_packages = true 7 | 8 | addopts = 9 | # prevent pytest cache from being created... 
it craps into project dir and I never use it anyway 10 | -p no:cacheprovider 11 | 12 | # -rap to print tests summary even when they are successful 13 | -rap 14 | --verbose 15 | 16 | # otherwise it won't discover doctests 17 | --doctest-modules 18 | 19 | # show all test durations (unless they are too short) 20 | --durations=0 21 | -------------------------------------------------------------------------------- /ruff.toml: -------------------------------------------------------------------------------- 1 | lint.extend-select = [ 2 | "F", # flakes rules -- default, but extend just in case 3 | "E", # pycodestyle -- default, but extend just in case 4 | "W", # various warnings 5 | 6 | "B", # 'bugbear' set -- various possible bugs 7 | "C4", # flake8-comprehensions -- unnecessary list/map/dict calls 8 | "COM", # trailing commas 9 | "EXE", # various checks wrt executable files 10 | "I", # sort imports 11 | "ICN", # various import conventions 12 | "FBT", # detect use of boolean arguments 13 | "FURB", # various rules 14 | "PERF", # various potential performance speedups 15 | "PD", # pandas rules 16 | "PIE", # 'misc' lints 17 | "PLC", # pylint convention rules 18 | "PLR", # pylint refactor rules 19 | "PLW", # pylint warnings 20 | "PT", # pytest stuff 21 | "PYI", # various type hinting rules 22 | "RET", # early returns 23 | "RUF", # various ruff-specific rules 24 | "TID", # various imports suggestions 25 | "TRY", # various exception handling rules 26 | "UP", # detect deprecated python stdlib stuff 27 | "FA", # suggest using from __future__ import annotations 28 | "PTH", # pathlib migration 29 | "ARG", # unused argument checks 30 | "A", # builtin shadowing 31 | "G", # logging stuff 32 | # "EM", # TODO hmm could be helpful to prevent duplicate err msg in traceback.. but kinda annoying 33 | 34 | # "ALL", # uncomment this to check for new rules! 35 | ] 36 | 37 | # Preserve types, even if a file imports `from __future__ import annotations` 38 | # we need this for cachew to work with HPI types on 3.9 39 | # can probably remove after 3.10? 40 | lint.pyupgrade.keep-runtime-typing = true 41 | 42 | lint.ignore = [ 43 | "D", # annoying nags about docstrings 44 | "N", # pep naming 45 | "TCH", # type checking rules, mostly just suggests moving imports under TYPE_CHECKING 46 | "S", # bandit (security checks) -- tends to be not very useful, lots of nitpicks 47 | "DTZ", # datetimes checks -- complaining about missing tz and mostly false positives 48 | "FIX", # complains about fixmes/todos -- annoying 49 | "TD", # complains about todo formatting -- too annoying 50 | "ANN", # missing type annotations? seems way to strict though 51 | 52 | ### too opinionated style checks 53 | "E501", # too long lines 54 | "E702", # Multiple statements on one line (semicolon) 55 | "E731", # assigning lambda instead of using def 56 | "E741", # Ambiguous variable name: `l` 57 | "E742", # Ambiguous class name: `O 58 | "E401", # Multiple imports on one line 59 | "F403", # import *` used; unable to detect undefined names 60 | ### 61 | 62 | ### 63 | "E722", # Do not use bare `except` ## Sometimes it's useful for defensive imports and that sort of thing.. 64 | "F811", # Redefinition of unused # this gets in the way of pytest fixtures (e.g. in cachew) 65 | 66 | ## might be nice .. but later and I don't wanna make it strict 67 | "E402", # Module level import not at top of file 68 | 69 | "RUF100", # unused noqa -- handle later 70 | "RUF012", # mutable class attrs should be annotated with ClassVar... 
ugh pretty annoying for user configs 71 | 72 | ### these are just nitpicky, we usually know better 73 | "PLR0911", # too many return statements 74 | "PLR0912", # too many branches 75 | "PLR0913", # too many function arguments 76 | "PLR0915", # too many statements 77 | "PLR1714", # consider merging multiple comparisons 78 | "PLR2044", # line with empty comment 79 | "PLR5501", # use elif instead of else if 80 | "PLR2004", # magic value in comparison -- super annoying in tests 81 | ### 82 | "PLR0402", # import X.Y as Y -- TODO maybe consider enabling it, but double check 83 | 84 | "B009", # calling gettattr with constant attribute -- this is useful to convince mypy 85 | "B010", # same as above, but setattr 86 | "B011", # complains about assert False 87 | "B017", # pytest.raises(Exception) 88 | "B023", # seems to result in false positives? 89 | "B028", # suggest using explicit stacklevel? TODO double check later, but not sure it's useful 90 | 91 | # complains about useless pass, but has sort of a false positive if the function has a docstring? 92 | # this is common for click entrypoints (e.g. in __main__), so disable 93 | "PIE790", 94 | 95 | # a bit too annoying, offers to convert for loops to list comprehension 96 | # , which may heart readability 97 | "PERF401", 98 | 99 | # suggests no using exception in for loops 100 | # we do use this technique a lot, plus in 3.11 happy path exception handling is "zero-cost" 101 | "PERF203", 102 | 103 | "RET504", # unnecessary assignment before returning -- that can be useful for readability 104 | "RET505", # unnecessary else after return -- can hurt readability 105 | 106 | "PLW0603", # global variable update.. we usually know why we are doing this 107 | "PLW2901", # for loop variable overwritten, usually this is intentional 108 | 109 | "PT011", # pytest raises should is too broad 110 | "PT012", # pytest raises should contain a single statement 111 | 112 | "COM812", # trailing comma missing -- mostly just being annoying with long multiline strings 113 | 114 | "PD901", # generic variable name df 115 | 116 | "TRY003", # suggests defining exception messages in exception class -- kinda annoying 117 | "TRY004", # prefer TypeError -- don't see the point 118 | "TRY201", # raise without specifying exception name -- sometimes hurts readability 119 | "TRY400", # TODO double check this, might be useful 120 | "TRY401", # redundant exception in logging.exception call? TODO double check, might result in excessive logging 121 | 122 | "PGH", # TODO force error code in mypy instead? although it also has blanket noqa rule 123 | 124 | "TID252", # Prefer absolute imports over relative imports from parent modules 125 | 126 | "UP038", # suggests using | (union) in isisntance checks.. 
but it results in slower code 127 | 128 | ## too annoying 129 | "T20", # just complains about prints and pprints 130 | "Q", # flake quotes, too annoying 131 | "C90", # some complexity checking 132 | "G004", # logging statement uses f string 133 | "ERA001", # commented out code 134 | "SLF001", # private member accessed 135 | "BLE001", # do not catch 'blind' Exception 136 | "INP001", # complains about implicit namespace packages 137 | "SIM", # some if statements crap 138 | "RSE102", # complains about missing parens in exceptions 139 | ## 140 | ] 141 | 142 | 143 | lint.flake8-builtins.builtins-allowed-modules = ["json", "logging", "xml"] 144 | 145 | 146 | lint.exclude = [ 147 | "old/**", 148 | ] 149 | -------------------------------------------------------------------------------- /scripts/apt.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | 4 | progs=( 5 | sqlite3 # [optional] sqlite processing 6 | vim # [optional] for vimdiff 7 | fdupes # [optional] duplicate detection tool, for tests 8 | ) 9 | 10 | apt-get update && apt-get --yes install ${progs[@]} 11 | -------------------------------------------------------------------------------- /src/bleanser/core/__init__.py: -------------------------------------------------------------------------------- 1 | from .common import logger 2 | 3 | __all__ = [ 4 | 'logger', 5 | ] 6 | -------------------------------------------------------------------------------- /src/bleanser/core/__main__.py: -------------------------------------------------------------------------------- 1 | # TODO hmm so we kind of need a specific Normaliser for bleanser, so calling 2 | # python3 -m bleanser.core (or just -m bleanser) doesn't make much sense 3 | # it could probs take in module name, and then call it? like python3 -m bleanser modules.xxx 4 | # but it's the same as calling python -m bleanser.modules.xxx 5 | # TODO maybe this thing could do module discovery or something? 6 | def main() -> None: 7 | pass 8 | 9 | 10 | if __name__ == '__main__': 11 | # FIXME warn if we're running this command? kinda confusing otherwise 12 | main() 13 | -------------------------------------------------------------------------------- /src/bleanser/core/common.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from collections.abc import Sequence 4 | from dataclasses import dataclass 5 | from pathlib import Path 6 | from typing import TYPE_CHECKING, Union 7 | 8 | from .ext.logging import LazyLogger 9 | 10 | logger = LazyLogger(__name__, level='debug') 11 | 12 | 13 | @dataclass 14 | class Group: 15 | items: Sequence[Path] 16 | """ 17 | All items in group are tied via 'domination' relationship 18 | Which might be either exact equality, or some sort of 'inclusion' relationship 19 | """ 20 | 21 | pivots: Sequence[Path] 22 | """ 23 | Pivots are the elements that 'define' group. 24 | In general the pivots contain all other elements in the group 25 | Sometimes pivots might be redundant, e.g. 
if we want to keep both boundaries of the group 26 | """ 27 | 28 | # TODO attach diff or something 29 | # cmp: CmpResult 30 | error: bool 31 | 32 | def __post_init__(self) -> None: 33 | sp = set(self.pivots) 34 | si = set(self.items) 35 | if len(self.items) != len(si): 36 | raise RuntimeError(f'duplicate items: {self}') 37 | if len(self.pivots) != len(sp): 38 | raise RuntimeError(f'duplicate pivots: {self}') 39 | # in theory could have more pivots, but shouldn't happen for now 40 | assert 1 <= len(sp) <= 2, sp 41 | if not (sp <= si): 42 | raise RuntimeError(f"pivots aren't fully contained in items: {self}") 43 | 44 | 45 | @dataclass 46 | class Instruction: 47 | path: Path 48 | group: Group 49 | """ 50 | 'Reason' why the path got a certain instruction 51 | """ 52 | 53 | 54 | @dataclass 55 | class Prune(Instruction): 56 | pass 57 | 58 | 59 | @dataclass 60 | class Keep(Instruction): 61 | pass 62 | 63 | 64 | ### helper to define paramertized tests in function's body 65 | from .utils import under_pytest 66 | 67 | if TYPE_CHECKING or under_pytest: 68 | import pytest 69 | 70 | parametrize = pytest.mark.parametrize 71 | else: 72 | parametrize = lambda *_args, **_kwargs: (lambda f: f) 73 | ### 74 | 75 | 76 | @dataclass 77 | class BaseMode: 78 | pass 79 | 80 | 81 | @dataclass 82 | class Dry(BaseMode): 83 | pass 84 | 85 | 86 | @dataclass 87 | class Move(BaseMode): 88 | path: Path 89 | 90 | def __post_init__(self) -> None: 91 | assert self.path.is_dir(), self.path 92 | 93 | 94 | @dataclass 95 | class Remove(BaseMode): 96 | pass 97 | 98 | 99 | Mode = Union[Dry, Move, Remove] 100 | 101 | 102 | def divide_by_size(*, buckets: int, paths: Sequence[Path]) -> Sequence[Sequence[Path]]: 103 | """ 104 | Divide paths into approximately equally sized groups, while preserving order 105 | """ 106 | res = [] 107 | with_size = [(p, p.stat().st_size) for p in paths] 108 | bucket_size = sum(sz for _, sz in with_size) / buckets 109 | 110 | group: list[Path] = [] 111 | group_size = 0 112 | 113 | def dump() -> None: 114 | nonlocal group_size, group 115 | 116 | if len(group) == 0: 117 | return 118 | 119 | res.append(group) 120 | # print(f"dumping group, size {group_size} {len(group)} {group[0]} {group[-1]}") 121 | 122 | group = [] 123 | group_size = 0 124 | 125 | for p, sz in with_size: 126 | if group_size >= bucket_size: 127 | dump() 128 | group.append(p) 129 | group_size += sz 130 | # last group always needs to be dumped 131 | dump() 132 | 133 | assert len(res) <= buckets 134 | while len(res) < buckets: # can be less if buckets > len(paths) 135 | res.append([]) 136 | 137 | flattened = [] 138 | for r in res: 139 | flattened.extend(r) 140 | assert paths == flattened, res # just a safety check 141 | 142 | return res 143 | -------------------------------------------------------------------------------- /src/bleanser/core/compat.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | if sys.version_info[:2] >= (3, 11): 4 | from typing import Never, Self, assert_never, assert_type # noqa: F401 5 | else: 6 | from typing_extensions import Never, Self, assert_never, assert_type # noqa: F401 7 | -------------------------------------------------------------------------------- /src/bleanser/core/ext/dummy_executor.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from concurrent.futures import Executor, Future 4 | 5 | # https://stackoverflow.com/a/10436851/706389 6 | from typing import Any 7 | 8 | 9 | 
class DummyExecutor(Executor): 10 | def __init__(self, max_workers: int | None = 1) -> None: 11 | self._shutdown = False 12 | self._max_workers = max_workers 13 | 14 | def submit(self, fn, *args, **kwargs): # type: ignore[override,unused-ignore] # todo type properly after 3.9 15 | if self._shutdown: 16 | raise RuntimeError('cannot schedule new futures after shutdown') 17 | 18 | f: Future[Any] = Future() 19 | try: 20 | result = fn(*args, **kwargs) 21 | except KeyboardInterrupt: 22 | raise 23 | except BaseException as e: 24 | f.set_exception(e) 25 | else: 26 | f.set_result(result) 27 | 28 | return f 29 | 30 | def shutdown(self, wait: bool = True, **kwargs) -> None: # noqa: FBT001,FBT002,ARG002 31 | self._shutdown = True 32 | -------------------------------------------------------------------------------- /src/bleanser/core/ext/logging.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Default logger is a bit meh, see 'test'/run this file for a demo 3 | TODO name 'klogging' to avoid possible conflict with default 'logging' module 4 | TODO shit. too late already? maybe use fallback & deprecate 5 | ''' 6 | 7 | 8 | def test() -> None: 9 | import logging 10 | import sys 11 | from typing import Callable 12 | 13 | M: Callable[[str], None] = lambda s: print(s, file=sys.stderr) 14 | 15 | M(" Logging module's defaults are not great...'") 16 | l = logging.getLogger('test_logger') 17 | # todo why is mypy unhappy about these??? 18 | l.error("For example, this should be logged as error. But it's not even formatted properly, doesn't have logger name or level") 19 | 20 | M(" The reason is that you need to remember to call basicConfig() first") 21 | l.error("OK, this is better. But the default format kinda sucks, I prefer having timestamps and the file/line number") 22 | 23 | M("") 24 | M(" With LazyLogger you get a reasonable logging format, colours and other neat things") 25 | 26 | ll = LazyLogger('test') # No need for basicConfig! 27 | ll.info("default level is INFO") 28 | ll.debug(".. so this shouldn't be displayed") 29 | ll.warning("warnings are easy to spot!") 30 | ll.exception(RuntimeError("exceptions as well")) 31 | 32 | 33 | import logging 34 | import os 35 | from typing import Optional, Union 36 | 37 | Level = int 38 | LevelIsh = Optional[Union[Level, str]] 39 | 40 | 41 | def mklevel(level: LevelIsh) -> Level: 42 | # todo put in some global file, like envvars.py 43 | glevel = os.environ.get('HPI_LOGS', None) 44 | if glevel is not None: 45 | level = glevel 46 | if level is None: 47 | return logging.NOTSET 48 | if isinstance(level, int): 49 | return level 50 | return getattr(logging, level.upper()) 51 | 52 | 53 | FORMAT = '{start}[%(levelname)-7s %(asctime)s %(name)s %(filename)s:%(lineno)d]{end} %(message)s' 54 | FORMAT_COLOR = FORMAT.format(start='%(color)s', end='%(end_color)s') 55 | FORMAT_NOCOLOR = FORMAT.format(start='', end='') 56 | DATEFMT = '%Y-%m-%d %H:%M:%S' 57 | 58 | 59 | def setup_logger(logger: logging.Logger, level: LevelIsh) -> None: 60 | lvl = mklevel(level) 61 | try: 62 | import logzero # type: ignore[import-untyped] 63 | except ModuleNotFoundError: 64 | import warnings 65 | 66 | warnings.warn("You might want to install 'logzero' for nice colored logs!") 67 | logger.setLevel(lvl) 68 | h = logging.StreamHandler() 69 | h.setLevel(lvl) 70 | h.setFormatter(logging.Formatter(fmt=FORMAT_NOCOLOR, datefmt=DATEFMT)) 71 | logger.addHandler(h) 72 | logger.propagate = False # ugh. otherwise it duplicates log messages? not sure about it.. 
73 | else: 74 | formatter = logzero.LogFormatter( 75 | fmt=FORMAT_COLOR, 76 | datefmt=DATEFMT, 77 | ) 78 | logzero.setup_logger(logger.name, level=lvl, formatter=formatter) 79 | 80 | 81 | class LazyLogger(logging.Logger): 82 | def __new__(cls, name: str, level: LevelIsh = 'INFO') -> 'LazyLogger': 83 | logger = logging.getLogger(name) 84 | 85 | # this is called prior to all _log calls so makes sense to do it here? 86 | def isEnabledFor_lazyinit(*args, logger=logger, orig=logger.isEnabledFor, **kwargs): 87 | att = 'lazylogger_init_done' 88 | if not getattr(logger, att, False): # init once, if necessary 89 | setup_logger(logger, level=level) 90 | setattr(logger, att, True) 91 | return orig(*args, **kwargs) 92 | 93 | logger.isEnabledFor = isEnabledFor_lazyinit # type: ignore[method-assign] 94 | return logger # type: ignore[return-value] 95 | 96 | 97 | if __name__ == '__main__': 98 | test() 99 | -------------------------------------------------------------------------------- /src/bleanser/core/ext/sqlite_dumben.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # A tool to 'dumb down' an sqlite database and convert into just data rows 3 | # Basically it strips all 4 | # - views 5 | # - indices 6 | # - triggers 7 | # - constraints 8 | # this is useful if you want to mess/cleanup the database, but don't want to trip over constraints/triggers 9 | # NOTE: handling everything as bytes since not sure I wanna mess with encoding here (esp. row data encoding) 10 | from __future__ import annotations 11 | 12 | import hashlib 13 | import os 14 | import re 15 | import shutil 16 | import sqlite3 17 | import subprocess 18 | import sys 19 | from pathlib import Path 20 | from subprocess import DEVNULL, check_call, check_output 21 | from tempfile import TemporaryDirectory 22 | 23 | Tables = dict[str, dict[str, str]] 24 | 25 | 26 | def _get_tables(db: Path) -> Tables: 27 | res: Tables = {} 28 | with sqlite3.connect(f'file:{db}?immutable=1', uri=True) as conn: 29 | tables = [] 30 | for row in conn.execute('SELECT name, type FROM sqlite_master'): 31 | (table, type_) = row 32 | if type_ in {'index', 'view', 'trigger'}: 33 | # todo log what kind of things we are filtering out? 34 | continue 35 | assert type_ == 'table', (table, type_) 36 | tables.append(table) 37 | 38 | for table in tables: 39 | schema: dict[str, str] = {} 40 | for row in conn.execute(f'PRAGMA table_info({table})'): 41 | col = row[1] 42 | type_ = row[2] 43 | schema[col] = type_ 44 | res[table] = schema 45 | return res 46 | 47 | 48 | def _sqlite(*cmd): 49 | return ['sqlite3', '-bail', *cmd] 50 | 51 | 52 | def _dumben_db(output_db: Path) -> None: 53 | # expected to operate on output_db directly 54 | assert output_db.exists(), output_db 55 | 56 | # hmm. CREATE TABLE syntax seems ridiculously complicated https://www.sqlite.org/lang_createtable.html 57 | # so seems pretty hopeless to sanitize off the constraints purely via sqlite? 58 | # the only easy win is making it single line 59 | # "UPDATE sqlite_master SET sql = replace(sql, char(10), ' ');" 60 | 61 | allow_writable_schema = [ 62 | # seems like some versions of sqlite (e.g. 
on osx don't allow writable schema without this pragma) 63 | # https://github.com/tekartik/sqflite/blob/master/sqflite_common_ffi/doc/custom_pragmas.md?plain=1 64 | "PRAGMA sqflite -- db_config_defensive_off", 65 | "PRAGMA writable_schema=ON", 66 | ] 67 | 68 | # first delete virtual tables -- they might render it impossible to do anything with database at all due to USING 69 | # e.g. fb messenger android msys database has this CREATE VIRTUAL TABLE msys_experiment_cache USING experiment_cache 70 | # either way virtual tables are basically views, no need to keep them 71 | with sqlite3.connect(output_db) as conn: 72 | for cmd in allow_writable_schema: 73 | conn.execute(cmd) 74 | conn.execute('DELETE FROM sqlite_master WHERE sql LIKE "%CREATE VIRTUAL TABLE%"') 75 | conn.close() 76 | 77 | tables = _get_tables(output_db) 78 | 79 | updates = [] 80 | for name, schema in tables.items(): 81 | simple_create = f'CREATE TABLE `{name}` (' + ', '.join(f'`{k}` {v}' for k, v in schema.items()) + ')' 82 | # TODO dunno if worth keeping autoincrement 83 | # without it, all columns with numerical id end up as NULL. although maybe for the best? 84 | upd = f'UPDATE sqlite_master SET sql = "{simple_create}" WHERE name = "{name}";' 85 | updates.append(upd) 86 | 87 | cmds = [ 88 | *allow_writable_schema, 89 | # drop table doesn't work for special sqlite_ tables 90 | # sqlite_sequence is something to do with autoincrement, ends up with some indices noise otherwise 91 | # sqlite_stat{1,2,3,4} is something to do with ANALYZE query 92 | 'DELETE FROM sqlite_master WHERE name = "sqlite_sequence" OR name LIKE "sqlite_stat%";', 93 | # 94 | 'DELETE FROM sqlite_master WHERE type IN ("view", "trigger", "index");', 95 | *updates, 96 | # 97 | # without vacuum, sometimes ended up with "rootpage disagrees with header error", from sqlite code seemed like it had something to do with autovacuum 98 | 'VACUUM', 99 | ] 100 | 101 | # need to set isolation level to None, otherwise VACUUM fails 102 | with sqlite3.connect(output_db, isolation_level=None) as conn: 103 | for cmd in cmds: 104 | conn.execute(cmd) 105 | conn.close() 106 | 107 | # make sure it's not corrupted 108 | # redirect output to DEVNULL, otherwise it's printing "ok" which is a bit annoying 109 | subprocess.check_call(_sqlite(output_db, 'PRAGMA integrity_check;'), stdout=DEVNULL) 110 | 111 | 112 | def run(*, db: Path, output: Path | None, output_as_db: bool) -> None: 113 | if output is not None: 114 | assert not output.exists(), output 115 | 116 | if output is None: 117 | assert output_as_db is False, "can't output to stdout as a binary database" 118 | 119 | if output_as_db: 120 | assert output is not None 121 | 122 | dumben_cache: Path | None = None 123 | _DUMBEN_CACHE_BASE = os.environ.get('SQLITE_DUMBEN_USE_CACHE') 124 | if _DUMBEN_CACHE_BASE is not None: 125 | DUMBEN_CACHE_BASE = Path(_DUMBEN_CACHE_BASE) 126 | DUMBEN_CACHE_BASE.mkdir(parents=True, exist_ok=True) 127 | 128 | fhash = hashlib.md5( 129 | # add code of sqlite_dumben just in case we change logic 130 | db.read_bytes() + Path(__file__).read_bytes() 131 | ).hexdigest() 132 | 133 | dumben_cache = DUMBEN_CACHE_BASE / fhash 134 | if dumben_cache.exists(): 135 | # TODO log it? 
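# cache hit: reuse the previously dumbened copy as-is -- fhash above mixes the input db bytes with this script's own source, so editing sqlite_dumben.py automatically invalidates stale cache entries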
136 | shutil.copy(dumben_cache, output) 137 | return 138 | 139 | # if we output as db, just operate on that target database directly 140 | shutil.copy(db, output) 141 | _dumben_db(output) 142 | 143 | if dumben_cache is not None: 144 | shutil.copy(output, dumben_cache) 145 | return 146 | 147 | # otherwise, need to create a temporary db to operate on -- and after that can dump it to sql 148 | # TODO need to be careful, if there are BLOBs in the database they may be dumped as empty strings 149 | with TemporaryDirectory() as td: 150 | tdir = Path(td) 151 | tdb = Path(tdir) / 'tmp.db' 152 | run(db=db, output=tdb, output_as_db=True) 153 | if output is not None: 154 | with output.open('w') as out: 155 | subprocess.run(_sqlite(tdb, '.dump'), check=True, stdout=out) 156 | else: 157 | subprocess.run(_sqlite(tdb, '.dump'), check=True, stdout=sys.stdout) 158 | 159 | 160 | def test_dumben(tmp_path: Path) -> None: 161 | # TODO would be nice to implement integration style test here straight away 162 | sql = ''' 163 | CREATE TABLE departments 164 | ( department_id INTEGER PRIMARY KEY AUTOINCREMENT, 165 | department_name VARCHAR 166 | ); 167 | 168 | CREATE TABLE employees 169 | ( employee_id INTEGER PRIMARY KEY AUTOINCREMENT, 170 | last_name VARCHAR NOT NULL, 171 | first_name VARCHAR, 172 | department_id INTEGER, 173 | CONSTRAINT fk_departments 174 | FOREIGN KEY (department_id) 175 | REFERENCES departments(department_id) 176 | ON DELETE CASCADE 177 | ); 178 | 179 | INSERT INTO departments VALUES (30, 'HR'); 180 | INSERT INTO departments VALUES (999, 'Sales'); 181 | 182 | INSERT INTO employees VALUES (10000, 'Smith', 'John', 30); 183 | INSERT INTO employees VALUES (10001, 'Anderson', 'Dave', 999); 184 | 185 | CREATE VIEW whatevs AS 186 | SELECT * FROM employees; 187 | ''' 188 | 189 | db = tmp_path / 'tmp.db' 190 | subprocess.run(_sqlite(db), input=sql.encode('utf8'), check=True) 191 | 192 | ## precondition -- check that db has multiline CREATE statements 193 | dbd = check_output(_sqlite(db, '.dump')).decode('utf8').splitlines() 194 | assert 'CREATE TABLE employees' in dbd 195 | assert ' CONSTRAINT fk_departments' in dbd 196 | ## 197 | 198 | ## precondition -- check that with foreign key it will indeed impact other tables 199 | check_call(_sqlite(db, 'PRAGMA foreign_keys=on; DELETE FROM departments WHERE department_id = 30;')) 200 | ecnt = int(check_output(_sqlite(db, 'SELECT COUNT(*) FROM employees')).decode('utf8').strip()) 201 | assert ecnt == 1, ecnt 202 | ## 203 | 204 | db.unlink() 205 | subprocess.run(_sqlite(db), input=sql.encode('utf8'), check=True) 206 | 207 | dumb_sql = tmp_path / 'dumb.sql' 208 | run(db=db, output=dumb_sql, output_as_db=False) 209 | dump = dumb_sql.read_text() 210 | dump_lines = dump.splitlines() 211 | 212 | crt = dump_lines[5] # meh but easiest 213 | # make sure it puts the statement on single line 214 | assert re.fullmatch(r'CREATE TABLE `employees` \(`employee_id` INTEGER,.*`department_id` INTEGER.*\);', crt) 215 | # make sure it strips off constraints 216 | assert 'AUTOINCREMENT' not in crt, crt 217 | assert 'CONSTRAINT' not in crt, crt 218 | 219 | assert 'CREATE VIEW' not in dump 220 | 221 | dumb_db = tmp_path / 'dumb.db' 222 | run(db=db, output=dumb_db, output_as_db=True) 223 | check_call(_sqlite(dumb_db, 'PRAGMA foreign_keys=on; DELETE FROM departments WHERE department_id = 30;')) 224 | ecnt = int(check_output(_sqlite(dumb_db, 'SELECT COUNT(*) FROM employees')).decode('utf8').strip()) 225 | assert ecnt == 2, ecnt 226 | 227 | 228 | def main() -> None: 229 | from argparse 
import ArgumentParser 230 | 231 | p = ArgumentParser() 232 | p.add_argument('--output-as-db', action='store_true') 233 | p.add_argument('--output', type=Path, required=False) 234 | p.add_argument('db', type=Path) 235 | args = p.parse_args() 236 | 237 | run(db=args.db, output=args.output, output_as_db=args.output_as_db) 238 | 239 | 240 | if __name__ == '__main__': 241 | main() 242 | 243 | 244 | # some possible inspiration for testing 245 | # - KoboReader-20211130.sqlite seems to have 246 | # CREATE TRIGGER kobo_plus_asset_cleanup 247 | # - fb messenger android is a good db to test on... lots of weird shit, e.g. transactions 248 | # - bumble android has search_message_removed trigger 249 | # - whatsapp android has loads of weird shit 250 | -------------------------------------------------------------------------------- /src/bleanser/core/main.py: -------------------------------------------------------------------------------- 1 | # not to confuse with __main__.py... meh 2 | from __future__ import annotations 3 | 4 | import os 5 | from glob import glob as do_glob 6 | from pathlib import Path 7 | from typing import cast 8 | 9 | import click 10 | 11 | from .common import Dry, Mode, Move, Remove, logger 12 | from .processor import ( 13 | BaseNormaliser, 14 | apply_instructions, 15 | bleanser_tmp_directory, 16 | compute_instructions, 17 | ) 18 | 19 | 20 | # TODO use context and default_map 21 | # https://click.palletsprojects.com/en/7.x/commands/#overriding-defaults 22 | def main(*, Normaliser: type[BaseNormaliser]) -> None: 23 | # meh.. by default the width is stupid, like 80 chars 24 | context_settings = { 25 | 'max_content_width': 120, 26 | 'show_default': True, 27 | } 28 | 29 | @click.group(context_settings=context_settings) 30 | def call_main() -> None: 31 | pass 32 | 33 | # meh... would be nice to use object but it gets casted to str by click?? 34 | _DEFAULT = '' 35 | 36 | @call_main.command(name='diff', short_help='cleanup two files and diff') 37 | @click.argument('path1', type=str) 38 | @click.argument('path2', default=_DEFAULT) 39 | @click.option('--glob', is_flag=True, default=False, help='Treat the path as glob (in the glob.glob sense)') 40 | @click.option('--vim', is_flag=True, default=False, help='Use vimdiff') 41 | @click.option('--difftool', type=str, help='Custom difftool to use') 42 | @click.option('--from', 'from_', type=int, default=None) 43 | @click.option('--to', type=int, default=None, help='non-inclusive, i.e. [from, to)') 44 | def diff(path1: str, path2: Path, *, glob: bool, from_: int | None, to: int | None, vim: bool, difftool: str) -> None: 45 | path1_: Path 46 | if glob: 47 | assert path2 is cast(Path, _DEFAULT), path2 48 | if to is None: 49 | assert from_ is not None 50 | to = from_ + 2 # by default just compare with the next adjacent element 51 | paths = _get_paths(path=path1, from_=from_, to=to, glob=glob) 52 | else: 53 | assert cast(str, path2) is not _DEFAULT 54 | assert from_ is None 55 | assert to is None 56 | path1_ = Path(path1) 57 | path2 = Path(path2) 58 | paths = [path1_, path2] 59 | 60 | from .processor import compute_diff 61 | 62 | # meh.. 
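# --vim is just shorthand for --difftool=vimdiff; the chosen tool is passed down via the DIFFTOOL environment variable (presumably picked up by compute_diff)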
63 | if vim: 64 | difftool = 'vimdiff' 65 | if difftool is not None: 66 | os.environ['DIFFTOOL'] = difftool 67 | 68 | for line in compute_diff(paths, Normaliser=Normaliser): 69 | print(line) 70 | 71 | @call_main.command(name='normalised', short_help='normalise file and dump to stdout') 72 | @click.argument('path', type=Path) 73 | @click.option('--stdout', is_flag=True, help='print normalised files to stdout instead of printing the path to it') 74 | def normalised(*, path: Path, stdout: bool) -> None: 75 | with bleanser_tmp_directory() as base_tmp_dir: 76 | n = Normaliser(original=path, base_tmp_dir=base_tmp_dir) 77 | with n.do_normalise() as cleaned: 78 | if stdout: 79 | print(cleaned.read_text()) 80 | else: 81 | click.secho(f'You can examine normalised file: {cleaned}', fg='green') 82 | click.pause(info="Press any key when you've finished") 83 | 84 | @call_main.command(name='prune', short_help='process & prune files') 85 | @click.argument('path', type=str) 86 | @click.option('--glob', is_flag=True, default=False, help='Treat the path as glob (in the glob.glob sense)') 87 | @click.option('--sort-by', type=click.Choice(['size', 'name']), default='name', help='how to sort input files') 88 | ## 89 | @click.option('--dry', is_flag=True, default=None, help='Do not prune the input files, just print what would happen after pruning.') 90 | @click.option('--remove', is_flag=True, default=None, help='Prune the input files by REMOVING them (be careful!)') 91 | @click.option('--move', type=Path, help='Prune the input files by MOVING them to the specified path. A bit safer than --remove mode.') 92 | ## 93 | @click.option('--yes', is_flag=True, default=False, help="Do not prompt before pruning files (useful for cron etc)") 94 | @click.option( 95 | '--threads', 96 | type=int, 97 | is_flag=False, 98 | flag_value=0, 99 | default=None, 100 | help="Number of threads (processes) to use. Without the flag won't use any, with the flag will try using all available, can also take a specific value. Passed down to PoolExecutor.", 101 | ) 102 | ## 103 | @click.option('--from', 'from_', type=int, default=None) 104 | @click.option('--to', type=int, default=None) 105 | ## 106 | @click.option('--multiway', is_flag=True, default=None, help='force "multiway" cleanup') 107 | @click.option('--prune-dominated', is_flag=True, default=None) 108 | def prune( 109 | *, 110 | path: str, 111 | sort_by: str, 112 | glob: bool, 113 | dry: bool, 114 | move: Path | None, 115 | remove: bool, 116 | threads: int | None, 117 | from_: int | None, 118 | to: int | None, 119 | multiway: bool | None, 120 | prune_dominated: bool | None, 121 | yes: bool, 122 | ) -> None: 123 | modes: list[Mode] = [] 124 | if dry is True: 125 | modes.append(Dry()) 126 | if move is not None: 127 | modes.append(Move(path=move)) 128 | if remove is True: 129 | modes.append(Remove()) 130 | if len(modes) == 0: 131 | modes.append(Dry()) 132 | assert len(modes) == 1, f'please specify exactly one of modes (got {modes})' 133 | [mode] = modes 134 | # TODO eh, would be nice to use some package for mutually exclusive args.. 135 | # e.g. 
https://stackoverflow.com/questions/37310718/mutually-exclusive-option-groups-in-python-click 136 | 137 | paths = _get_paths(path=path, glob=glob, from_=from_, to=to, sort_by=sort_by) 138 | 139 | if multiway is not None: 140 | Normaliser.MULTIWAY = multiway 141 | if prune_dominated is not None: 142 | Normaliser.PRUNE_DOMINATED = prune_dominated 143 | 144 | instructions = list(compute_instructions(paths, Normaliser=Normaliser, threads=threads)) 145 | # NOTE: for now, forcing list() to make sure instructions compute before path check 146 | # not strictly necessary 147 | for p in paths: 148 | # just in case, to make sure no one messed with files in the meantime 149 | assert p.exists(), p 150 | 151 | need_confirm = not yes 152 | apply_instructions(instructions, mode=mode, need_confirm=need_confirm) 153 | 154 | call_main() 155 | 156 | 157 | def _get_paths(*, path: str, from_: int | None, to: int | None, sort_by: str = "name", glob: bool = False) -> list[Path]: 158 | if not glob: 159 | pp = Path(path) 160 | assert pp.is_dir(), pp 161 | path = str(pp) + os.sep + '**' 162 | paths = [Path(p) for p in do_glob(path, recursive=True)] # noqa: PTH207 163 | paths = [p for p in paths if p.is_file()] 164 | if sort_by == "name": 165 | # assumes sort order is same as date order? guess it's reasonable 166 | paths = sorted(paths) 167 | else: 168 | paths = sorted(paths, key=lambda s: s.stat().st_size) 169 | 170 | if from_ is None: 171 | from_ = 0 172 | if to is None: 173 | to = len(paths) 174 | paths = paths[from_:to] 175 | assert len(paths) > 0 176 | 177 | logger.info('processing %d files (%s ... %s)', len(paths), paths[0], paths[-1]) 178 | return paths 179 | -------------------------------------------------------------------------------- /src/bleanser/core/modules/extract.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Iterator 2 | from contextlib import contextmanager 3 | from pathlib import Path 4 | from typing import Any 5 | 6 | from bleanser.core.processor import ( 7 | BaseNormaliser, 8 | Normalised, 9 | sort_file, 10 | unique_file_in_tempdir, 11 | ) 12 | 13 | 14 | class ExtractObjectsNormaliser(BaseNormaliser): 15 | """ 16 | This is meant to be overridden by a subclass 17 | 18 | extract_objects receives an input file, and should yield data/objects that when converted 19 | to a string, produces some comparable data/object to the normalised/cleaned output file 20 | 21 | possible things this could return is a unique key/id, or a tuple of (key, data), or a 22 | namedtuple/dataclass 23 | 24 | newlines are stripped from the string, so lines can be compared/diffed properly 25 | 26 | Its possible you could use a library or code from https://github.com/karlicoss/HPI 27 | in extract_objects, to use the DAL itself to parse the file https://beepb00p.xyz/exports.html#dal 28 | """ 29 | 30 | def extract_objects(self, path: Path) -> Iterator[Any]: 31 | raise NotImplementedError 32 | # when you subclass, you should do something like 33 | # with path.open('r') as f: 34 | # for object in some_library(f): 35 | # yield (object.id, object.key) 36 | 37 | def _emit_history(self, upath: Path, cleaned) -> None: 38 | """ 39 | calls extract_objects to extract lines from the unpacked path 40 | subclasses should override that to yield some kind of object 41 | out to here 42 | """ 43 | with cleaned.open("w") as f: 44 | for line in self.extract_objects(upath): 45 | # newlines may interfere with the diffing, use the repr of the string 46 | f.write(repr(str(line))) 47 | 
f.write("\n") 48 | 49 | @contextmanager 50 | def normalise(self, *, path: Path) -> Iterator[Normalised]: 51 | cleaned = unique_file_in_tempdir(input_filepath=path, dir=self.tmp_dir, suffix=path.suffix) 52 | 53 | self._emit_history(path, cleaned) 54 | sort_file(cleaned) 55 | 56 | yield cleaned 57 | 58 | 59 | if __name__ == "__main__": 60 | ExtractObjectsNormaliser.main() 61 | -------------------------------------------------------------------------------- /src/bleanser/core/modules/json.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Iterator 2 | from contextlib import contextmanager 3 | from pathlib import Path 4 | 5 | import orjson 6 | 7 | from bleanser.core.processor import ( 8 | BaseNormaliser, 9 | Normalised, 10 | sort_file, 11 | unique_file_in_tempdir, 12 | ) 13 | 14 | # imports for convenience -- they are used in other modules 15 | from bleanser.core.utils import Json, delkeys, patch_atoms # noqa: F401 16 | 17 | 18 | class JsonNormaliser(BaseNormaliser): 19 | PRUNE_DOMINATED = False 20 | 21 | def cleanup(self, j: Json) -> Json: 22 | ''' 23 | subclasses should override this function, to do the actual cleanup 24 | 25 | cleanup in this context means removing extra JSON keys which are not 26 | needed to produce a normalised representation for a file 27 | ''' 28 | return j 29 | 30 | @contextmanager 31 | def normalise(self, *, path: Path) -> Iterator[Normalised]: 32 | # TODO maybe, later implement some sort of class variable instead of hardcoding 33 | # note: deliberately keeping mime check inside do_cleanup, since it's executed in a parallel process 34 | # otherwise it essentially blocks waiting for all mimes to compute.. 35 | # TODO crap. annoying, sometimes mime determines as text/plain for no reason 36 | # I guess doesn't matter as much, json.loads is the ultimate check it's ineed json 37 | # mp = mime(upath) 38 | # assert mp in { 39 | # 'application/json', 40 | # }, mp 41 | 42 | j = orjson.loads(path.read_text()) 43 | j = self.cleanup(j) 44 | 45 | # create a tempfile to write flattened data to 46 | cleaned = unique_file_in_tempdir(input_filepath=path, dir=self.tmp_dir, suffix='.json') 47 | 48 | with cleaned.open('w') as fo: 49 | if isinstance(j, list): 50 | j = {'': j} # meh 51 | 52 | assert isinstance(j, dict), j 53 | for k, v in j.items(): 54 | if not isinstance(v, list): 55 | # something like 'profile' data in hypothesis could be a dict 56 | # something like 'notes' in rescuetime could be a scalar (str) 57 | v = [v] # meh 58 | assert isinstance(v, list), (k, v) 59 | for i in v: 60 | print(f'{k} ::: {orjson.dumps(i, option=orjson.OPT_SORT_KEYS).decode("utf8")}', file=fo) 61 | 62 | # todo meh... see Fileset._union 63 | # this gives it a bit of a speedup, just calls out to unix sort 64 | sort_file(cleaned) 65 | 66 | yield cleaned 67 | 68 | 69 | if __name__ == '__main__': 70 | JsonNormaliser.main() 71 | 72 | 73 | # TODO actually implement some artificial json test 74 | # 75 | def test_nonidempotence(tmp_path: Path) -> None: 76 | from bleanser.tests.common import actions, hack_attribute 77 | 78 | ''' 79 | Just demonstrates that multiway processing might be 80 | It's probably going to be very hard to fix, likely finding 'minimal' cover (at least in terms of partial ordering) is NP hard? 
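Concretely, for the inputs below ([], [a], [a, b], [b, c], [a, b, c]) the first pass keeps 0/2/4: 2.json survives only because item 'a' rolls over out of the later files. Pruning the survivors again then drops 2.json (it is fully contained in 4.json), so the result depends on how many passes are run -- i.e. the operation is not idempotent.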
81 | ''' 82 | 83 | # fmt: off 84 | sets = [ 85 | [], 86 | ['a'], 87 | ['a', 'b'], 88 | [ 'b', 'c'], 89 | ['a', 'b', 'c'], 90 | ] 91 | # fmt: on 92 | for i, s in enumerate(sets): 93 | p = tmp_path / f'{i}.json' 94 | p.write_text(orjson.dumps(s).decode('utf8')) 95 | 96 | with hack_attribute(JsonNormaliser, 'MULTIWAY', value=True), hack_attribute(JsonNormaliser, 'PRUNE_DOMINATED', value=True): 97 | paths = sorted(tmp_path.glob('*.json')) 98 | res = actions(paths=paths, Normaliser=JsonNormaliser) 99 | 100 | assert [p.name for p in res.remaining] == [ 101 | '0.json', # keeping as boundary 102 | '2.json', # keeping because item a has rolled over 103 | '4.json', # keeping as boundary 104 | ] 105 | 106 | paths = list(res.remaining) 107 | res = actions(paths=paths, Normaliser=JsonNormaliser) 108 | assert [p.name for p in res.remaining] == [ 109 | '0.json', 110 | # note: 2.json is removed because fully contained in 4.json 111 | '4.json', 112 | ] 113 | -------------------------------------------------------------------------------- /src/bleanser/core/modules/tests/sqlite.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import sqlite3 4 | from pathlib import Path 5 | 6 | from ...common import Keep, Prune 7 | from ...processor import compute_groups, groups_to_instructions 8 | from ..sqlite import SqliteNormaliser 9 | 10 | 11 | def _make_db(out: Path, values: list[bytes], *, bad: bool = False) -> Path: 12 | with sqlite3.connect(out) as conn: 13 | conn.execute('CREATE TABLE `test` (bbb BLOB)') 14 | conn.executemany( 15 | 'INSERT INTO `test` VALUES (?)', 16 | [(v,) for v in values], 17 | ) 18 | if bad: 19 | # the only way I figured to actually force BLOB column to contain text values 20 | conn.execute('CREATE TABLE `bad` (bbb BLOB)') 21 | conn.execute('INSERT INTO `bad` SELECT cast(bbb AS TEXT) FROM `test`') 22 | conn.execute('DROP TABLE `test`') 23 | conn.execute('ALTER TABLE `bad` RENAME TO `test`') 24 | conn.close() 25 | return out 26 | 27 | 28 | def test_sqlite_blobs_good(tmp_path: Path) -> None: 29 | """ 30 | In this case we have blob data in BLOB column -- so cleanup should work as expected 31 | """ 32 | 33 | class TestNormaliser(SqliteNormaliser): 34 | MULTIWAY = False 35 | PRUNE_DOMINATED = True 36 | 37 | db0 = _make_db(tmp_path / '0.db', [b'\x00\x01']) 38 | db1 = _make_db(tmp_path / '1.db', [b'\x00\x01', b'\x01\x02']) 39 | db2 = _make_db(tmp_path / '2.db', [b'\x00\x01', b'\x01\x02', b'\x02\x03']) 40 | db3 = _make_db(tmp_path / '3.db', [b'\x00\x01', b'\x01\x02', b'\x02\x03', b'\x03\x04']) 41 | dbs = [db0, db1, db2, db3] 42 | 43 | groups = list(compute_groups(dbs, Normaliser=TestNormaliser)) 44 | instructions = list(groups_to_instructions(groups)) 45 | 46 | assert [type(i) for i in instructions] == [ 47 | Keep, 48 | Prune, 49 | Prune, 50 | Keep, 51 | ] 52 | 53 | 54 | def test_sqlite_blobs_bad(tmp_path: Path) -> None: 55 | """ 56 | In this case we have text (!) data in BLOB column. 
57 | This will cause errors during cleanup so we'll keep all inputs (even though dbs are identical here) 58 | """ 59 | 60 | class TestNormaliser(SqliteNormaliser): 61 | MULTIWAY = False 62 | PRUNE_DOMINATED = True 63 | 64 | db0 = _make_db(tmp_path / '0.db', [b'\x00', b'\x01', b'\x02'], bad=True) 65 | db1 = _make_db(tmp_path / '1.db', [b'\x00', b'\x01', b'\x02'], bad=True) 66 | db2 = _make_db(tmp_path / '2.db', [b'\x00', b'\x01', b'\x02'], bad=True) 67 | db3 = _make_db(tmp_path / '3.db', [b'\x00', b'\x01', b'\x02'], bad=True) 68 | dbs = [db0, db1, db2, db3] 69 | 70 | groups = list(compute_groups(dbs, Normaliser=TestNormaliser)) 71 | instructions = list(groups_to_instructions(groups)) 72 | 73 | assert [type(i) for i in instructions] == [ 74 | Keep, 75 | Keep, 76 | Keep, 77 | Keep, 78 | ] 79 | 80 | 81 | def test_sqlite_blobs_allowed(tmp_path: Path) -> None: 82 | class TestNormaliser(SqliteNormaliser): 83 | MULTIWAY = False 84 | PRUNE_DOMINATED = True 85 | 86 | ALLOWED_BLOBS = {('test', 'bbb')} 87 | 88 | db0 = _make_db(tmp_path / '0.db', [b'\x00\x01'], bad=True) 89 | db1 = _make_db(tmp_path / '1.db', [b'\x00\x02'], bad=True) 90 | db2 = _make_db(tmp_path / '2.db', [b'\x00\x03'], bad=True) 91 | db3 = _make_db(tmp_path / '3.db', [b'\x00\x04'], bad=True) 92 | dbs = [db0, db1, db2, db3] 93 | 94 | groups = list(compute_groups(dbs, Normaliser=TestNormaliser)) 95 | instructions = list(groups_to_instructions(groups)) 96 | 97 | # this kinda demonstrates what happens if we're not careful and mess up ALLOWED_BLOBS 98 | # sqlite3 will end up dumping supposedly blob data as empty strings 99 | # and this will clean up files that shouldn't be cleaned up (in the files above all data is different!) 100 | assert [type(i) for i in instructions] == [ 101 | Keep, 102 | Prune, 103 | Prune, 104 | Keep, 105 | ] 106 | -------------------------------------------------------------------------------- /src/bleanser/core/modules/xml.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Iterator 2 | from contextlib import contextmanager 3 | from pathlib import Path 4 | 5 | from lxml import etree 6 | 7 | from bleanser.core.processor import ( 8 | BaseNormaliser, 9 | Normalised, 10 | sort_file, 11 | unique_file_in_tempdir, 12 | ) 13 | 14 | 15 | class Normaliser(BaseNormaliser): 16 | PRUNE_DOMINATED = False 17 | 18 | def cleanup(self, t: etree._Element) -> etree._Element: 19 | return t 20 | 21 | @contextmanager 22 | def normalise(self, *, path: Path) -> Iterator[Normalised]: 23 | # todo not sure if need to release some resources here... 24 | parser = etree.XMLParser(remove_blank_text=True) 25 | # TODO we seem to lose comments here... meh 26 | et = etree.fromstring(path.read_bytes(), parser=parser) 27 | # restore newlines just for the top level 28 | assert et.text is None, et.text 29 | et.text = '\n' 30 | for c in et: 31 | assert c.tail is None, c.tail 32 | c.tail = '\n' 33 | 34 | et = self.cleanup(et) 35 | 36 | cleaned = unique_file_in_tempdir(input_filepath=path, dir=self.tmp_dir, suffix='.xml') 37 | cleaned.write_text(etree.tostring(et, encoding="unicode")) 38 | 39 | # TODO what is the assumption about shape? 40 | # either list of xml entries 41 | # or top-level thing with children 42 | 43 | # todo meh... 
see Fileset._union 44 | # this gives it a bit of a speedup 45 | sort_file(cleaned) 46 | yield cleaned 47 | 48 | 49 | if __name__ == '__main__': 50 | Normaliser.main() 51 | 52 | 53 | def test_xml_simple(tmp_path: Path) -> None: 54 | from bleanser.tests.common import actions, hack_attribute 55 | 56 | f1 = tmp_path / 'f1' 57 | f2 = tmp_path / 'f2' 58 | f3 = tmp_path / 'f3' 59 | f4 = tmp_path / 'f4' 60 | 61 | # make sure it handles 62 | f1.write_text(''' 63 | 64 | text1 65 | text2 66 | 67 | ''') 68 | 69 | f2.write_text(''' 70 | 71 | text2 72 | text3 73 | text4 74 | 75 | ''') 76 | 77 | f3.write_text(''' 78 | 79 | text4 80 | text5 81 | 82 | ''') 83 | 84 | # note: we don't care about order 85 | f4.write_text(''' 86 | 87 | text5 88 | text4 89 | text3 90 | text2 91 | 92 | ''') 93 | 94 | paths123 = [f1, f2, f3] 95 | with hack_attribute(Normaliser, 'MULTIWAY', value=True), hack_attribute(Normaliser, 'PRUNE_DOMINATED', value=True): 96 | res123 = actions(paths=paths123, Normaliser=Normaliser) 97 | assert res123.remaining == paths123 98 | 99 | paths124 = [f1, f2, f4] 100 | with hack_attribute(Normaliser, 'MULTIWAY', value=True), hack_attribute(Normaliser, 'PRUNE_DOMINATED', value=True): 101 | res124 = actions(paths=paths124, Normaliser=Normaliser) 102 | assert res124.remaining == [ 103 | f1, 104 | f4, 105 | ] 106 | 107 | 108 | def test_xml_nested(tmp_path: Path) -> None: 109 | from bleanser.tests.common import actions, hack_attribute 110 | 111 | f1 = tmp_path / 'f1' 112 | f2 = tmp_path / 'f2' 113 | f3 = tmp_path / 'f3' 114 | # make sure we don't just sort all lines and treat them as set 115 | # this could happen if you just pretty print the whole structure and diff 116 | # TODO: tbh this is also a good test for 'simple' handling 117 | f1.write_text(''' 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | ''') 133 | f2.write_text(''' 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | ''') 149 | f3.write_text(''' 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | ''') 165 | 166 | paths = [f1, f2, f3] 167 | with hack_attribute(Normaliser, 'MULTIWAY', value=True), hack_attribute(Normaliser, 'PRUNE_DOMINATED', value=True): 168 | res = actions(paths=paths, Normaliser=Normaliser) 169 | assert res.remaining == [ 170 | f1, 171 | f2, 172 | f3, 173 | ] 174 | -------------------------------------------------------------------------------- /src/bleanser/core/sqlite.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | from bleanser.core.modules.sqlite import * 4 | 5 | warnings.warn( 6 | "Module 'bleanser.core.sqlite' is deprecated. Use 'bleanser.core.modules.sqlite' instead.", 7 | DeprecationWarning, 8 | ) 9 | 10 | 11 | if __name__ == '__main__': 12 | SqliteNormaliser.main() # noqa: F405 13 | -------------------------------------------------------------------------------- /src/bleanser/core/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING 4 | 5 | if not TYPE_CHECKING: 6 | from .compat import assert_never # noqa: F401 7 | 8 | 9 | from pathlib import Path 10 | 11 | 12 | def total_dir_size(d: Path) -> int: 13 | return sum(f.stat().st_size for f in d.glob('**/*') if f.is_file()) 14 | 15 | 16 | import sys 17 | 18 | under_pytest = 'pytest' in sys.modules 19 | ### ugh. pretty horrible... 
but 20 | # 'PYTEST_CURRENT_TEST' in os.environ 21 | # doesn't work before we're actually inside the test.. and it might be late for decorators, for instance 22 | ### 23 | 24 | 25 | import time 26 | 27 | 28 | class Timer: 29 | def __init__(self, *tags): 30 | self.tags = tags 31 | 32 | def __enter__(self): 33 | self.start = time.time() 34 | return self 35 | 36 | def __exit__(self, *args): 37 | self.end = time.time() 38 | delta = self.end - self.start 39 | print(f"{self.tags} TIME TAKEN: {delta:.1f}", file=sys.stderr) 40 | 41 | 42 | from functools import wraps 43 | 44 | 45 | def timing(f): 46 | @wraps(f) 47 | def wrapped(*args, **kwargs): 48 | with Timer(f.__name__): 49 | return f(*args, **kwargs) 50 | 51 | return wrapped 52 | 53 | 54 | # make it lazy, otherwise it might crash on module import (e.g. on Windows) 55 | # ideally would be nice to fix it properly https://github.com/ahupp/python-magic#windows 56 | import warnings 57 | from functools import lru_cache 58 | from typing import Callable 59 | 60 | 61 | @lru_cache(1) 62 | def _magic() -> Callable[[Path], str | None]: 63 | try: 64 | import magic 65 | except Exception as e: 66 | # logger.exception(e) 67 | defensive_msg: str | None = None 68 | if isinstance(e, ModuleNotFoundError) and e.name == 'magic': 69 | defensive_msg = "python-magic is not detected. It's recommended for better file type detection (pip3 install --user python-magic). See https://github.com/ahupp/python-magic#installation" 70 | elif isinstance(e, ImportError): 71 | emsg = getattr(e, 'msg', '') # make mypy happy 72 | if 'failed to find libmagic' in emsg: # probably the actual library is missing?... 73 | defensive_msg = "couldn't import magic. See https://github.com/ahupp/python-magic#installation" 74 | if defensive_msg is not None: 75 | warnings.warn(defensive_msg) 76 | return lambda path: None # stub # noqa: ARG005 77 | else: 78 | raise e 79 | else: 80 | mm = magic.Magic(mime=True) 81 | return lambda path: mm.from_file(str(path)) 82 | 83 | 84 | def mime(path: Path) -> str | None: 85 | # next, libmagic, it might access the file, so a bit slower 86 | magic = _magic() 87 | return magic(path) 88 | 89 | 90 | from typing import Any 91 | 92 | Json = Any 93 | 94 | 95 | from collections.abc import Collection 96 | 97 | 98 | def delkeys(j: Json, *, keys: str | Collection[str]) -> None: 99 | if isinstance(keys, str): 100 | keys = {keys} # meh 101 | 102 | # todo if primitive, don't do anything 103 | if isinstance(j, (int, float, bool, type(None), str)): 104 | return 105 | elif isinstance(j, list): 106 | for v in j: 107 | delkeys(v, keys=keys) 108 | elif isinstance(j, dict): 109 | for key in keys: 110 | j.pop(key, None) 111 | for v in j.values(): 112 | delkeys(v, keys=keys) 113 | else: 114 | raise RuntimeError(type(j)) 115 | 116 | 117 | def patch_atoms(j: Json, *, patch): 118 | if isinstance(j, (int, float, bool, type(None), str)): 119 | return patch(j) 120 | elif isinstance(j, list): 121 | for i in range(len(j)): 122 | j[i] = patch_atoms(j[i], patch=patch) 123 | return j 124 | elif isinstance(j, dict): 125 | for k in list(j.keys()): 126 | j[k] = patch_atoms(j[k], patch=patch) 127 | return j 128 | else: 129 | raise RuntimeError(type(j)) 130 | -------------------------------------------------------------------------------- /src/bleanser/modules/antennapod_android.py: -------------------------------------------------------------------------------- 1 | from bleanser.core.modules.sqlite import SqliteNormaliser, Tool 2 | 3 | 4 | class Normaliser(SqliteNormaliser): 5 | MULTIWAY = True 6 | 
PRUNE_DOMINATED = True 7 | 8 | def check(self, c) -> None: 9 | tables = Tool(c).get_tables() 10 | assert 'Feeds' in tables, tables 11 | eps = tables['FeedItems'] 12 | assert 'link' in eps 13 | assert 'read' in eps 14 | 15 | # should be safe to use multiway because of these vvv 16 | media = tables['FeedMedia'] 17 | assert 'played_duration' in media 18 | assert 'last_played_time' in media 19 | 20 | def cleanup(self, c) -> None: 21 | self.check(c) 22 | 23 | t = Tool(c) 24 | # often changing, no point keeping 25 | t.drop_cols( 26 | table='Feeds', 27 | cols=[ 28 | 'last_update', 29 | 'last_update_failed', 30 | 'image_url', # volatile 31 | 'minimal_duration_filter', 32 | ], 33 | ) 34 | 35 | t.drop_cols( 36 | table='FeedMedia', 37 | cols=[ 38 | 'download_url', # sometimes change, especially tracking links -- probs not worth keeping anyway 39 | 'filesize', # no idea why would it change, but it does sometimes 40 | ], 41 | ) 42 | 43 | t.drop_cols( 44 | table='FeedItems', 45 | cols=[ 46 | 'title', # useful feed, but volatile so best to ignore 47 | 'content_encoded', # no idea what is it but volatile 48 | 'description', # often changing, no need to keep 49 | 'image_url', # volatile 50 | ], 51 | ) 52 | 53 | t.drop('Queue') 54 | 55 | 56 | if __name__ == '__main__': 57 | Normaliser.main() 58 | -------------------------------------------------------------------------------- /src/bleanser/modules/binary.py: -------------------------------------------------------------------------------- 1 | """ 2 | Format-agnostic, clean up as literal file diffs 3 | """ 4 | # TODO probably should give it a better name... 5 | 6 | from bleanser.core.processor import BaseNormaliser 7 | 8 | 9 | class Normaliser(BaseNormaliser): 10 | # TODO need to be careful about using it... 11 | # for non-structured data might mess it up by accident if it's weirdly ordered 12 | pass 13 | 14 | 15 | if __name__ == '__main__': 16 | Normaliser.main() 17 | -------------------------------------------------------------------------------- /src/bleanser/modules/bluemaestro.py: -------------------------------------------------------------------------------- 1 | from bleanser.core.modules.sqlite import SqliteNormaliser, Tool 2 | 3 | 4 | class Normaliser(SqliteNormaliser): 5 | # multiway is useful at the very least for old db format, it only kept rolling 6K points or something in the db 6 | MULTIWAY = True 7 | PRUNE_DOMINATED = True 8 | 9 | def check(self, c) -> None: 10 | tool = Tool(c) 11 | tables = tool.get_tables() 12 | info_tables = [x for x in tables if x.endswith('_info')] 13 | if len(info_tables) == 0: 14 | # old db format 15 | data = tables['data'] 16 | assert 'Time' in data, data 17 | assert 'Temperature' in data, data 18 | else: 19 | # TODO hmm how to add some proper check here without too much duplication? 20 | pass 21 | 22 | def cleanup(self, c) -> None: 23 | self.check(c) 24 | tool = Tool(c) 25 | 26 | tables = tool.get_tables() 27 | info_tables = [x for x in tables if x.endswith('_info')] 28 | if len(info_tables) == 0: 29 | # old db format 30 | # log_index doesn't correspond to anything real, there are timestamps 31 | tool.drop_cols(table='data', cols=['log_index']) 32 | # changes every time db is exported, no point 33 | tool.drop_cols(table='info', cols=['last_download', 'last_pointer']) 34 | else: 35 | for info_table in info_tables: 36 | # possible to have multiple info tables, e.g. 
if you have multiple devices 37 | 38 | device, _ = info_table.split('_') 39 | 40 | ## get rid of downloadUnix -- it's changing after export and redundant info 41 | [[ut]] = list(c.execute(f'SELECT downloadUnix FROM {device}_info')) 42 | last_logs = [t for t in tables if t.endswith('log')] 43 | if len(last_logs) == 0: 44 | # seems like no data yet 45 | return 46 | last_log = max(last_logs) 47 | if last_log == f'{device}_{ut}_log': 48 | # TODO annoying that it needs to be defensive... 49 | # for some dbs it actually does happen, e.g. around 20211102085345 50 | tool.drop_cols(table=f'{device}_info', cols=['downloadUnix']) 51 | 52 | 53 | if __name__ == '__main__': 54 | Normaliser.main() 55 | 56 | 57 | # TODO think I've had jdoe or something with example databases.. 58 | def test_bluemaestro() -> None: 59 | from bleanser.tests.common import skip_if_no_data 60 | 61 | skip_if_no_data() 62 | 63 | from bleanser.tests.common import TESTDATA, actions2 64 | 65 | res = actions2(path=TESTDATA / 'bluemaestro', rglob='**/*.db*', Normaliser=Normaliser) 66 | 67 | assert res.remaining == [ 68 | '20180720.db', 69 | # '20180724.db', # move 70 | '20180728.db', 71 | # '20180730.db', # move 72 | '20180731.db', 73 | 74 | '20190723100032.db', # keep, everything changed 75 | # TODO need to investigate, some values have changed a bit, like 1st digit after decimal point 76 | # even timestamps changed sometimes (e.g. just last second) 77 | # hpi bluemaestro module has something for handling this, I think 78 | '20190724101707.db', 79 | # same as above 80 | '20190727104723.db', 81 | 82 | '20200208225936.db', # keep, everything changed (several months diff) 83 | # '20201209083427/bmgateway.db', # move, completely dominated by the next 84 | # '20210131102917/bmgateway.db', # move, completely dominated by the next 85 | # '20210207183947/bmgateway.db', # move, completely dominated by the next 86 | '20210216211844/bmgateway.db', # keep, errored because couldn't find last _log item 87 | '20211103234924/bmgateway.db', # same, previous errored 88 | '20211106191208/bmgateway.db', 89 | ] # fmt: skip 90 | -------------------------------------------------------------------------------- /src/bleanser/modules/bumble_android.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from bleanser.core.modules.json import delkeys 4 | from bleanser.core.modules.sqlite import SqliteNormaliser, Tool 5 | 6 | 7 | class Normaliser(SqliteNormaliser): 8 | MULTIWAY = True 9 | PRUNE_DOMINATED = True 10 | 11 | def check(self, c) -> None: 12 | tables = Tool(c).get_tables() 13 | 14 | # fmt: off 15 | message = tables['message'] 16 | conv_info = tables['conversation_info'] 17 | 18 | assert 'id' in message 19 | assert 'conversation_id' in message 20 | assert 'payload' in message 21 | assert 'created_timestamp' in message 22 | 23 | assert 'user_id' in conv_info 24 | assert 'user_name' in conv_info 25 | # fmt: on 26 | 27 | def cleanup(self, c) -> None: 28 | self.check(c) 29 | 30 | t = Tool(c) 31 | t.drop('search_fts_segments') 32 | t.drop('search_fts_segdir') 33 | t.drop('search_fts_docsize') 34 | t.drop('search_fts_content') 35 | t.drop('search_fts_stat') 36 | t.drop('message_read_info') 37 | 38 | t.drop_cols('conversation_info', cols=[ 39 | 'user_image_url', 40 | 'photo_url', 41 | 'last_seen_message_id', 42 | 'covid_preferences', 43 | 44 | 'chat_input_settings', 45 | 46 | 'match_status', # ?? either NULL or -1 or some weird hash thing?? 
47 | 48 | 'sending_multimedia_enabled', 49 | 'disabled_multimedia_explanation', 50 | ]) # fmt: skip 51 | # for extract: photo_id can be a bit volatile 52 | 53 | # mm, user photos are a bit annoying, urls are flaky 54 | def _cleanup_jsons(s): 55 | if s is None: 56 | return None 57 | j = json.loads(s) 58 | delkeys( 59 | j, 60 | keys=[ 61 | 'url', # for conversation_info.user_photos & message.payload 62 | 'expiration_timestamp', # for message.payload 63 | ], 64 | ) 65 | return json.dumps(j) 66 | 67 | c.create_function("CLEANUP_JSONS", 1, _cleanup_jsons) 68 | list(c.execute('UPDATE conversation_info SET user_photos = CLEANUP_JSONS(user_photos)')) 69 | list(c.execute('UPDATE message SET payload = CLEANUP_JSONS(payload)')) 70 | 71 | 72 | if __name__ == '__main__': 73 | Normaliser.main() 74 | -------------------------------------------------------------------------------- /src/bleanser/modules/chrome.py: -------------------------------------------------------------------------------- 1 | from bleanser.core.modules.sqlite import SqliteNormaliser, Tool 2 | 3 | 4 | class Normaliser(SqliteNormaliser): 5 | MULTIWAY = True 6 | PRUNE_DOMINATED = True 7 | 8 | ALLOWED_BLOBS = { 9 | ('downloads', 'hash'), 10 | ('typed_url_sync_metadata', 'value'), 11 | } 12 | 13 | def check(self, c) -> None: 14 | tables = Tool(c).get_tables() 15 | # fmt: off 16 | v = tables['visits'] 17 | assert 'visit_time' in v, v 18 | assert 'url' in v, v # note: url is an int id 19 | 20 | u = tables['urls'] 21 | assert 'url' in u, u 22 | assert 'title' in u, u 23 | # fmt: on 24 | 25 | def cleanup(self, c) -> None: 26 | self.check(c) 27 | 28 | t = Tool(c) 29 | t.drop_cols( 30 | 'urls', 31 | cols=[ 32 | # TODO similar issue to firefox -- titles sometimes jump because of notifications (e.g. twitter) 33 | # maybe could sanitize it? 34 | # cleans up like 15% databases if I wipe it completely? 35 | # the annoying thing is that sqlite doesn't have support for regex... 36 | # 'title', 37 | # 38 | # aggregates, no need for them 39 | 'visit_count', 40 | 'typed_count', 41 | 'last_visit_time', 42 | ], 43 | ) 44 | t.drop_cols( 45 | 'segment_usage', 46 | cols=['visit_count'], 47 | ) 48 | c.execute('DELETE FROM meta WHERE key IN ("typed_url_model_type_state", "early_expiration_threshold")') 49 | 50 | # hmm, not sure -- it might change? 
51 | # cleans up about 10% files 52 | # t.drop_cols( 53 | # 'visits', 54 | # cols=['visit_duration'], 55 | # ) 56 | 57 | 58 | if __name__ == '__main__': 59 | Normaliser.main() 60 | -------------------------------------------------------------------------------- /src/bleanser/modules/firefox.py: -------------------------------------------------------------------------------- 1 | from sqlite3 import Connection 2 | 3 | from bleanser.core.modules.sqlite import SqliteNormaliser, Tool 4 | 5 | 6 | class Normaliser(SqliteNormaliser): 7 | MULTIWAY = True 8 | PRUNE_DOMINATED = True 9 | 10 | def is_old_firefox(self, c: Connection) -> bool: 11 | tool = Tool(c) 12 | tables = tool.get_tables() 13 | if 'bookmarks' in tables: 14 | return True 15 | if 'moz_bookmarks' in tables: 16 | return False 17 | raise RuntimeError(f"Unexpected schema {tables}") 18 | 19 | def check(self, c: Connection) -> None: 20 | tool = Tool(c) 21 | tables = tool.get_tables() 22 | # fmt: off 23 | if self.is_old_firefox(c): 24 | v = tables['visits'] 25 | assert 'history_guid' in v, v 26 | assert 'date' in v, v 27 | 28 | h = tables['history'] 29 | assert 'url' in h, h 30 | assert 'guid' in h, h 31 | else: 32 | b = tables['moz_bookmarks'] 33 | assert 'dateAdded' in b, b 34 | assert 'guid' in b, b 35 | 36 | h = tables['moz_historyvisits'] 37 | assert 'place_id' in h, h 38 | assert 'visit_date' in h, h 39 | 40 | p = tables['moz_places'] 41 | assert 'url' in p, p 42 | assert 'id' in p, p 43 | # fmt: on 44 | 45 | def cleanup(self, c: Connection) -> None: 46 | self.check(c) 47 | 48 | if self.is_old_firefox(c): 49 | self.cleanup_old(c) 50 | return 51 | 52 | # otherwise, assume new db format 53 | 54 | tool = Tool(c) 55 | [(visits_before,)] = c.execute('SELECT count(*) FROM moz_historyvisits') 56 | tool.drop_cols( 57 | table='moz_places', 58 | cols=[ 59 | # aggregates, changing all the time 60 | 'frecency', 61 | 'recalc_frecency', 62 | 'alt_frecency', 63 | 'recalc_alt_frecency', 64 | 'last_visit_date', 65 | 'visit_count', 66 | # ugh... sometimes changes because of notifications, e.g. twitter/youtube?, or during page load 67 | 'hidden', 68 | 'typed', 69 | 'title', 70 | 'description', 71 | 'preview_image_url', 72 | 'foreign_count', # just some internal refcount thing... https://bugzilla.mozilla.org/show_bug.cgi?id=1017502 73 | ## mobile only 74 | 'visit_count_local', 75 | 'last_visit_date_local', 76 | 'last_visit_date_remote', 77 | 'sync_status', 78 | 'sync_change_counter', 79 | ## 80 | ## ? maybe mobile only 81 | 'visit_count_remote', 82 | ## 83 | ], 84 | ) 85 | # ugh. sometimes changes for no reason... 86 | # and anyway, for history the historyvisits table refers place_id (this table's actual id) 87 | # also use update instead delete because phone db used to have UNIQUE constraint... 88 | c.execute('UPDATE moz_places SET guid=id') 89 | tool.drop_cols( 90 | table='moz_bookmarks', 91 | cols=['lastModified'], # changing all the time for no reason? 92 | # todo hmm dateAdded might change when e.g. firefox reinstalls and it adds default bookmarks 93 | # probably not worth the trouble 94 | ) 95 | tool.drop('moz_meta') 96 | tool.drop('moz_origins') # prefix/host/frequency -- not interesting 97 | # tool.drop('moz_annos') # not sure -- contains downloads data? 
might be volatile 98 | 99 | tool.drop_cols( 100 | 'moz_inputhistory', 101 | cols=[ 102 | 'use_count', # eh, some floating point that changes all the time 103 | ], 104 | ) 105 | 106 | tool.drop_cols( 107 | 'moz_bookmarks_synced', 108 | cols=[ 109 | 'id', # id always changes, and they have guid instead 110 | 'serverModified', # changes without any actual changes to bookmark? 111 | ], 112 | ) 113 | 114 | ## fenix 115 | tool.drop_cols( 116 | 'moz_bookmarks_synced_structure', 117 | cols=[ 118 | # I think it's the position in bookmarks list, doesn't matter 119 | 'position', 120 | ], 121 | ) 122 | tool.drop('moz_places_metadata_search_queries') 123 | 124 | tool.drop_cols( 125 | 'moz_places_metadata', 126 | cols=[ 127 | ## volatile 128 | 'updated_at', 129 | 'total_view_time', 130 | 'typing_time', 131 | 'key_presses', 132 | 'scrolling_time', 133 | 'scrolling_distance', 134 | ## 135 | ], 136 | ) 137 | ## 138 | 139 | # TODO do we still need it? 140 | # sanity check just in case... can remove after we get rid of triggers properly... 141 | [(visits_after,)] = c.execute('SELECT count(*) FROM moz_historyvisits') 142 | assert visits_before == visits_after, (visits_before, visits_after) 143 | 144 | def cleanup_old(self, c) -> None: 145 | tool = Tool(c) 146 | 147 | # TODO could be pretty useful + really marginal benefits form cleaning it up, like 5% of databases maybe 148 | # tool.drop('searchhistory') 149 | 150 | tool.drop('thumbnails') 151 | tool.drop('favicons') 152 | 153 | # doesn't really have anything interesting? ... 154 | # just some image urls and maybe titles... likely no one cares about them 155 | tool.drop('page_metadata') 156 | 157 | tool.drop_cols( 158 | 'bookmarks', 159 | # we don't care about these 160 | cols=[ 161 | 'position', 162 | 'localVersion', 163 | 'syncVersion', 164 | 'modified', # also seems to depend on bookmark position 165 | 'guid', # sort of a hash and changes with position changes too? 166 | ], 167 | ) 168 | tool.drop_cols( 169 | 'clients', 170 | cols=['last_modified'], 171 | ) 172 | tool.drop_cols( 173 | 'history', 174 | cols=[ 175 | # aggregates, changing all the time 176 | 'visits', 177 | 'visits_local', 178 | 'visits_remote', 179 | ## 180 | # hmm, this seems to be last date.. actual dates are in 'visits' 181 | 'date', 182 | 'date_local', 183 | 'date_remote', 184 | ## 185 | 'title', 186 | # ugh. changes dynamically. e.g. (1) on twitter/telegram notifications 187 | # could update in some elaborate manner. idk 188 | 'modified', # ? changes for no apparent reason, probs because of the corresponding aggregates 189 | ], 190 | ) 191 | 192 | tool.drop_cols( 193 | 'remote_devices', 194 | cols=[ 195 | # probs only the presence of devices is interesting.. 196 | # changing all the time for no reason 197 | '_id', 198 | 'modified', 199 | 'last_access_time', 200 | 'created', # yes, this also changed all the time 201 | ], 202 | ) 203 | 204 | # FIXME hmm... 205 | # on the one hand, kind of interesting info.. 206 | # on the other, they change A LOT, so we'll miss most of tab snapshots anyway... 207 | # also newer databases don't have tab information anyway.. so I guess for now best to clean them up.. 
208 | tool.drop('tabs') 209 | # tool.drop_cols( 210 | # 'tabs', 211 | # cols=['_id', 'favicon', 'position',], 212 | # ) 213 | 214 | 215 | if __name__ == '__main__': 216 | Normaliser.main() 217 | 218 | 219 | # TODO need to make sure we test 'rolling' visits 220 | # these look like they are completely cumulative in terms of history 221 | def test_fenix() -> None: 222 | from bleanser.tests.common import skip_if_no_data 223 | 224 | skip_if_no_data() 225 | 226 | from bleanser.tests.common import TESTDATA, actions2 227 | 228 | res = actions2(path=TESTDATA / 'fenix', rglob='**/*.sqlite*', Normaliser=Normaliser) 229 | assert res.remaining == [ 230 | # eh, too lazy to document the reason for keeping them... 231 | # many of them are just bookmark changes 232 | '20210327103953/places.sqlite', 233 | '20210408155753/places.sqlite', 234 | '20210419092604/places.sqlite', 235 | '20210514081246/places.sqlite', 236 | # '20210517094437/places.sqlite', # move 237 | # '20210517175309/places.sqlite', # move 238 | # '20210520132446/places.sqlite', # move 239 | # '20210522092831/places.sqlite', # move 240 | # '20210524152154/places.sqlite', # move 241 | # '20210526075434/places.sqlite', # move 242 | # '20210527062123/places.sqlite', # move 243 | # '20210530172804/places.sqlite', # move 244 | # '20210601165208/places.sqlite', # move 245 | # '20210602192530/places.sqlite', # move 246 | # '20210603032923/places.sqlite', # move 247 | '20210603144405/places.sqlite', 248 | '20210623234309/places.sqlite', 249 | '20210717141629/places.sqlite', 250 | ] 251 | -------------------------------------------------------------------------------- /src/bleanser/modules/foursquare.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from collections.abc import Iterator 4 | from typing import Any 5 | 6 | from bleanser.core.modules.json import Json, JsonNormaliser, delkeys 7 | 8 | TARGET = object() 9 | 10 | 11 | def _check_and_extract(x, schema) -> Iterator[Any]: 12 | if schema is TARGET: 13 | yield x 14 | return 15 | if type(schema) == type: # noqa: E721 16 | assert isinstance(x, schema), x 17 | return 18 | if type(schema) == list: # noqa: E721 19 | [sch] = schema 20 | assert isinstance(x, list), x 21 | for i in x: 22 | yield from _check_and_extract(x=i, schema=sch) 23 | return 24 | 25 | assert type(schema) == dict, schema # noqa: E721 26 | assert isinstance(x, dict), x 27 | 28 | xk = x.keys() 29 | sk = schema.keys() 30 | assert xk == sk, (sk, xk) 31 | for k in xk: 32 | yield from _check_and_extract(x=x[k], schema=schema[k]) 33 | 34 | 35 | def check_and_extract(x, schema) -> Any: 36 | [res] = list(_check_and_extract(x=x, schema=schema)) 37 | return res 38 | 39 | 40 | # TODO move to some generic helper 41 | SCHEMA = { 42 | 'meta': { 43 | 'code': int, 44 | 'requestId': str, 45 | }, 46 | 'notifications': [ 47 | { 48 | 'item': { 49 | 'unreadCount': int, 50 | }, 51 | 'type': str, 52 | }, 53 | ], 54 | 'response': { 55 | 'checkins': { 56 | 'count': int, 57 | 'items': TARGET, 58 | } 59 | }, 60 | } 61 | 62 | 63 | class Normaliser(JsonNormaliser): 64 | PRUNE_DOMINATED = True 65 | # hmm, I guess makes sense to make MULTIWAY = False considering it seems to be cumulative... kinda safer this way 66 | # on the otherhand useful to keep multiway for renamed venues? ugh 67 | MULTIWAY = True 68 | 69 | def cleanup(self, j: Json) -> Json: 70 | # ok, a bit nasty -- foursquare export seems to be a list of some sort of responses.. 
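# each element is assumed to look roughly like this (cf. SCHEMA above):
#   {"meta": {...}, "notifications": [...], "response": {"checkins": {"count": ..., "items": [...]}}}
# and the only part worth keeping is response.checkins.items, which is pulled out of every element below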
71 | assert isinstance(j, list) 72 | 73 | res = [] 74 | for d in j: 75 | l = check_and_extract(x=d, schema=SCHEMA) 76 | assert isinstance(l, list) 77 | res.extend(l) 78 | 79 | for c in res: 80 | # some id that might change, probs useless 81 | v = c.get('venue', None) 82 | if v is not None: 83 | v['contact'].pop('facebook', None) # don't care 84 | v['contact'].pop('instagram', None) # don't care 85 | v.pop('verified', None) # don't care 86 | v.pop('delivery', None) # eh, we don't care about what venue uses for delivery 87 | 88 | # todo would be nice to support compose keys for delkeys.. 89 | # e.g. ('venue', 'contact', 'facebook') 90 | delkeys( 91 | c, 92 | keys={ 93 | ## these are just always changing, nothing we can do about it 94 | 'checkinsCount', 95 | 'usersCount', 96 | 'tipCount', 97 | ## 98 | 'sticker', # very volatile, some crap that 4sq sets on places 99 | # ugh. lat/lng are volatile, varying after 4th digit after dot for some reason 100 | 'lat', 101 | 'lng', # TODO instead round to 4th digit or something?? 102 | }, 103 | ) 104 | 105 | return res 106 | 107 | 108 | if __name__ == '__main__': 109 | Normaliser.main() 110 | -------------------------------------------------------------------------------- /src/bleanser/modules/ghexport.py: -------------------------------------------------------------------------------- 1 | from bleanser.core.modules.json import Json, JsonNormaliser 2 | 3 | 4 | class Normaliser(JsonNormaliser): 5 | PRUNE_DOMINATED = True 6 | MULTIWAY = True 7 | 8 | def cleanup(self, j: Json) -> Json: 9 | if isinstance(j, list): 10 | # old format -- I think only contained events log or something 11 | return j 12 | 13 | profile = j.get('profile') 14 | if profile is not None: 15 | profile.pop('disk_usage', None) 16 | profile.pop('updated_at', None) # I think it updates at any github activity, so pretty pointless 17 | profile.pop('followers', None) # pretty volatile, so not worth keeping + reflected in "followers" field anyway 18 | 19 | volatile = [ 20 | 'stargazers_count', 21 | 'watchers', 22 | 'watchers_count', 23 | 'forks', 24 | 'forks_count', 25 | 'open_issues', 26 | 'open_issues_count', 27 | ] 28 | 29 | for what in ['repos', 'watched', 'starred', 'subscriptions']: 30 | thing = j.get(what) 31 | if thing is None: 32 | continue 33 | for r in thing: 34 | # these are gonna be super flaky, so just ignore from diff 35 | # for our own repos they are duplicated in events anyway 36 | for k in [ 37 | *volatile, 38 | 'updated_at', 39 | 'pushed_at', 40 | 'size', 41 | ]: 42 | r.pop(k, None) 43 | 44 | repo_name = r["full_name"] 45 | if repo_name == 'emacs-straight/advice-patch': 46 | r.pop('description') 47 | # changes every day automatically 48 | # TODO move to private overlay? 49 | 50 | for r in j['repos']: 51 | repo_name = r["full_name"] 52 | 53 | for k in volatile: 54 | v = r.get(k) 55 | if v is None: 56 | continue 57 | r[k] = r[k] // 10 * 10 # round up to nearest multiple of 10 so there are less diffs 58 | 59 | ## need to 'flatten' traffic, otherwise it can't properly figure out diffs 60 | ## TODO possible to make generic, e.g. 
hint the normaliser that we need to flatten .repos.traffic.clones field 61 | traffic = r.get('traffic') 62 | if traffic is None: 63 | continue 64 | for key in ['clones', 'views']: 65 | xxx = traffic[key] 66 | xxx.pop('count') # aggregate 67 | xxx.pop('uniques') # aggregate 68 | assert xxx.keys() == {key} 69 | # NOTE: we ignore first and last traffic entry since timestamps are aligned to the closest day 70 | # so they are always going to be kinda flaky 71 | for c in xxx[key][1:-1]: 72 | ts = c['timestamp'] 73 | j[f'{repo_name}_traffic_{key}_{ts}'] = c 74 | xxx.pop(key) 75 | for key in ['popular/paths', 'popular/referrers']: 76 | # TODO hmm these are still quite flaky? they collect stats over last two weeks so can change a lot.. 77 | j[f'{repo_name}_traffic_{key}'] = traffic[key] 78 | traffic.pop(key) 79 | 80 | # TODO should probably prefer in place cleanup to make consistent with sqlite? not sure 81 | return j 82 | 83 | 84 | if __name__ == '__main__': 85 | Normaliser.main() 86 | -------------------------------------------------------------------------------- /src/bleanser/modules/goodreads.py: -------------------------------------------------------------------------------- 1 | from bleanser.core.modules.xml import Normaliser as XmlNormaliser 2 | 3 | 4 | class Normaliser(XmlNormaliser): 5 | MULTIWAY = True 6 | PRUNE_DOMINATED = True 7 | 8 | def cleanup(self, t): 9 | for key in [ 10 | 'average_rating', 11 | 'text_reviews_count', 12 | 'ratings_count', 13 | 'book/description', # volatile 14 | ]: 15 | for x in t.findall('.//' + key): 16 | x.getparent().remove(x) 17 | return t 18 | 19 | 20 | if __name__ == '__main__': 21 | Normaliser.main() 22 | -------------------------------------------------------------------------------- /src/bleanser/modules/hinge_android.py: -------------------------------------------------------------------------------- 1 | from bleanser.core.modules.sqlite import SqliteNormaliser, Tool 2 | 3 | 4 | class Normaliser(SqliteNormaliser): 5 | MULTIWAY = True 6 | PRUNE_DOMINATED = True 7 | 8 | ALLOWED_BLOBS = { 9 | # hopefully should be fine, all the metadata seems to be present in the table 10 | ('chat_messages', 'serialized'), 11 | ('channels', 'serialized'), 12 | } 13 | 14 | def check(self, c) -> None: 15 | tables = Tool(c).get_tables() 16 | msgs = tables['chat_messages'] 17 | # TODO hmm, maybe 'created' just means created in the db? 18 | assert 'sent' in msgs, msgs 19 | assert 'body' in msgs, msgs 20 | assert 'messageId' in msgs, msgs 21 | profiles = tables['profiles'] 22 | assert 'userId' in profiles, profiles 23 | # not sure if really useful at all but whatever 24 | channels = tables['channels'] 25 | assert 'subjectId' in channels, channels 26 | 27 | def cleanup(self, c) -> None: 28 | self.check(c) # todo could also call 'check' after just in case 29 | t = Tool(c) 30 | # seems that e.g. liked_content has some retention, so will need multiway 31 | 32 | # TODO not sure if it can be useful at all?? it contains something like 'Today' etc... 33 | # it generates tons of changes.. so I'd rather drop it I guess 34 | t.drop_cols(table='profiles', cols=['lastActiveStatus', 'lastActiveStatusId']) 35 | 36 | # not sure what's the point of updated col here, it just changes for all entries at the same time 37 | t.drop_cols(table='channels', cols=['updated', 'serialized']) 38 | 39 | # eh, not sure, they appear to be modified without actual changes to other cols? 
40 | # fmt: off 41 | t.drop_cols(table='profiles' , cols=['created', 'updated', 'hidden']) 42 | t.drop_cols(table='answers' , cols=['created', 'modified']) 43 | t.drop_cols(table='player_media' , cols=['created']) 44 | t.drop_cols(table='subject_media', cols=['created']) 45 | # fmt: on 46 | 47 | # instagram urls change all the time (they contain some sort of token) 48 | # and expire quickly anyway.. so just easier to cleanup 49 | c.execute('UPDATE subject_media SET photoUrl="", thumbnailUrl="", videoUrl="" WHERE source = "instagram"') 50 | # todo width,height are changing all the time for some reason for subject_media 51 | 52 | # TODO pending_ratings?? 53 | 54 | ## 55 | t.drop(table='metrics') 56 | # TODO WTF?? they are collecting some network stats and putting in the db? e.g. metered/vpn/etc 57 | t.drop(table='networks') 58 | 59 | t.drop(table='preference_choices') # search prefrences -- change all the time and not interesting 60 | t.drop(table='pending_ratings') # flaky, seems like contains intermediate state 61 | 62 | ## clean up unnecessary profile/media data 63 | # seems 3 - seems like if there is a conversation with user, so worth keeping 64 | # state 1 - seems like 'liked', probs not worth tracking 65 | # state 11 is possibly 'seen', so not super interesting 66 | delete_profiles = 'FROM profiles WHERE state in (1, 11)' 67 | for tbl in ['subject_media', 'answers']: 68 | c.execute(f'DELETE FROM {tbl} WHERE userId IN (SELECT userId {delete_profiles})') 69 | # delete orphans too 70 | c.execute(f'DELETE FROM {tbl} WHERE userId NOT IN (SELECT userId FROM profiles)') 71 | c.execute(f'DELETE {delete_profiles}') 72 | ## 73 | 74 | ## id seems to be very unstable, as if they are resequenced all the time... 75 | remove_ids = [ 76 | 'answers', 77 | 'player_media', 78 | 'basic_choices', 79 | 'branding', 80 | 'channels', 81 | 'surveys', 82 | 'subject_media', 83 | 'liked_content', 84 | ] 85 | for table in remove_ids: 86 | t.drop_cols(table=table, cols=['id']) 87 | 88 | t.drop(table='standouts_content') # things are flaky here, even urls are changing between databases -- likely they are expiring 89 | 90 | t.drop_cols(table='surveys', cols=['receivedByHinge']) 91 | t.drop_cols(table='call_prompt_packs', cols=['position']) 92 | # player_media are user pics? might be useful.. 
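# (note that only the volatile 'position' ordering column is dropped below --
#  the media rows themselves are kept)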
93 | t.drop_cols(table='player_media', cols=['position']) 94 | t.drop_cols(table='subject_media', cols=['position']) 95 | t.drop_cols(table='products', cols=['lastApiUpdate', 'lastStoreUpdate']) 96 | ## 97 | 98 | 99 | if __name__ == '__main__': 100 | Normaliser.main() 101 | -------------------------------------------------------------------------------- /src/bleanser/modules/hpi/fbmessenger_android.py: -------------------------------------------------------------------------------- 1 | # NOTE: this is experimental for now, best to use the corresponding module bleanser.modules.* instead 2 | import os 3 | from collections.abc import Iterator 4 | from pathlib import Path 5 | from typing import Any 6 | 7 | from my.core.cfg import tmp_config 8 | 9 | from bleanser.core.modules.extract import ExtractObjectsNormaliser 10 | 11 | ## disable cache, otherwise it's gonna flush it all the time 12 | # TODO this should be in some sort of common module 13 | os.environ["CACHEW_DISABLE"] = "*" 14 | os.environ.pop("ENLIGHTEN_ENABLE", None) 15 | os.environ["LOGGING_LEVEL_my_fbmessenger_android"] = "WARNING" 16 | ## 17 | 18 | import my.fbmessenger.android as module 19 | 20 | 21 | class Normaliser(ExtractObjectsNormaliser): 22 | MULTIWAY = True 23 | PRUNE_DOMINATED = True 24 | 25 | def extract_objects(self, path: Path) -> Iterator[Any]: 26 | class config: 27 | class fbmessenger: 28 | class android: 29 | export_path = path 30 | # TODO facebook_id?? 31 | 32 | with tmp_config(modules=module.__name__, config=config): 33 | assert len(module.inputs()) == 1 # sanity check to make sure tmp_config worked as expected 34 | for m in module.messages(): 35 | yield "message", m 36 | for c in module.contacts(): 37 | yield "contact", c 38 | 39 | 40 | if __name__ == "__main__": 41 | Normaliser.main() 42 | -------------------------------------------------------------------------------- /src/bleanser/modules/hpi/twitter_android.py: -------------------------------------------------------------------------------- 1 | # NOTE: this is experimental for now, best to use the corresponding module bleanser.modules.* instead 2 | import os 3 | from collections.abc import Iterator 4 | from pathlib import Path 5 | from typing import Any 6 | 7 | from my.core.cfg import tmp_config 8 | 9 | from bleanser.core.modules.extract import ExtractObjectsNormaliser 10 | 11 | ## disable cache, otherwise it's gonna flush it all the time 12 | # TODO this should be in some sort of common module 13 | os.environ['CACHEW_DISABLE'] = '*' 14 | os.environ.pop('ENLIGHTEN_ENABLE', None) 15 | os.environ['LOGGING_LEVEL_my_twitter_android'] = 'WARNING' 16 | ## 17 | 18 | import my.twitter.android as twitter_android 19 | 20 | 21 | class Normaliser(ExtractObjectsNormaliser): 22 | MULTIWAY = True 23 | PRUNE_DOMINATED = True 24 | 25 | def extract_objects(self, path: Path) -> Iterator[Any]: 26 | class config: 27 | class twitter: 28 | class android: 29 | export_path = path 30 | 31 | with tmp_config(modules=twitter_android.__name__, config=config): 32 | assert len(twitter_android.inputs()) == 1 # sanity check to make sure tmp_config worked as expected 33 | for x in twitter_android.bookmarks(): 34 | yield 'bookmark', x 35 | for x in twitter_android.likes(): 36 | yield 'like', x 37 | for x in twitter_android.tweets(): 38 | yield 'tweet', x 39 | 40 | 41 | if __name__ == '__main__': 42 | Normaliser.main() 43 | -------------------------------------------------------------------------------- /src/bleanser/modules/hpi/whatsapp_android.py: 
-------------------------------------------------------------------------------- 1 | # NOTE: this is experimental for now, best to use the corresponding module bleanser.modules.* instead 2 | import os 3 | from collections.abc import Iterator 4 | from pathlib import Path 5 | from typing import Any 6 | 7 | from my.core.cfg import tmp_config 8 | 9 | from bleanser.core.modules.extract import ExtractObjectsNormaliser 10 | 11 | ## disable cache, otherwise it's gonna flush it all the time 12 | # TODO this should be in some sort of common module 13 | os.environ["CACHEW_DISABLE"] = "*" 14 | os.environ.pop("ENLIGHTEN_ENABLE", None) 15 | os.environ["LOGGING_LEVEL_my_whatsapp_android"] = "WARNING" 16 | ## 17 | 18 | import my.whatsapp.android as module 19 | 20 | 21 | class Normaliser(ExtractObjectsNormaliser): 22 | MULTIWAY = True 23 | PRUNE_DOMINATED = True 24 | 25 | def extract_objects(self, path: Path) -> Iterator[Any]: 26 | class config: 27 | class whatsapp: 28 | class android: 29 | export_path = path 30 | # TODO my_user_id? 31 | 32 | with tmp_config(modules=module.__name__, config=config): 33 | assert len(module.inputs()) == 1 # sanity check to make sure tmp_config worked as expected 34 | yield from module.entities() 35 | 36 | 37 | if __name__ == "__main__": 38 | Normaliser.main() 39 | -------------------------------------------------------------------------------- /src/bleanser/modules/instagram_android.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from bleanser.core.modules.json import delkeys, patch_atoms 4 | from bleanser.core.modules.sqlite import SqliteNormaliser, Tool 5 | 6 | 7 | def _patch_volatile_urls(x): 8 | # these contain some sort of hashes and change all the time 9 | if not isinstance(x, str): 10 | return x 11 | if 'fbcdn.net' in x: 12 | return "" 13 | if 'cdninstagram' in x: 14 | return "" 15 | return x 16 | 17 | 18 | def _cleanup_jsons(s): 19 | if s is None: 20 | return None 21 | 22 | if isinstance(s, bytes): 23 | j = json.loads(s.decode('utf8')) 24 | else: 25 | # hmm normally it's bytes, but on odd occasions (old databases??) was str? odd 26 | j = json.loads(s) 27 | 28 | # TODO thread_v2_id -- might be useful for some other processing? 29 | delkeys(j, keys=[ 30 | ## messages db 31 | 'user', # eh. super volatile fields inside it... even full name changes all the time for no reason? 32 | 'is_replied_to_msg_taken_down', 33 | 'hscroll_share', # some reaction bullshit 34 | 'account_badges', 35 | ## 36 | 37 | ## threads db 38 | 'recipients', # same as 'user' in messages db.. pretty volatile 39 | 'has_older_thread_messages_on_server', 40 | 'interop_user_type', 41 | 'transparency_product_enabled', 42 | 'notification_preview_controls', 43 | 'thread_context_items', # some volatile follower counts? 
44 | 'snippet', 45 | 'theme', 46 | 'ig_thread_capabilities', 47 | 'ai_agent_social_signal_message_count', 48 | 'has_groups_xac_ineligible_user', 49 | ## 50 | 51 | 'is_group_xac_calling_eligible', 52 | 'processed_business_suggestion', 53 | 54 | 'url_expiration_timestamp_us', 55 | 'is_eligible_for_igd_stacks', 56 | 'profile_pic_url', # volatile 57 | 'all_media_count', 58 | 'displayed_action_button_type', 59 | 'is_epd', 60 | 'liked_clips_count', 61 | 'reel_media_seen_timestamp', 62 | 'latest_besties_reel_media', 63 | 'latest_fanclub_reel_media', 64 | 'latest_reel_media', 65 | 66 | 'follow_friction_type', 67 | 'playable_url_info', 68 | 'preview_url_info', 69 | 'muting', 70 | 'biz_thread_throttling_state', 71 | 'badge_count', 72 | 'follower_count', 73 | 'following_count', 74 | 75 | 'last_seen_at', 76 | 77 | 'client_context', # seems to be same as client_item_id -- volatile 78 | 79 | 'feed_post_reshare_disabled', 80 | 81 | 'is_sent_by_viewer', # very volatile for no reason?? 82 | 83 | 'followed_by', 84 | 'account_type', # sometimes changes between 1 and 2? 85 | 'fan_club_info', # seems like page description 86 | 87 | 'is_business', 88 | 'is_following_current_user', 89 | 'is_interest_account', 90 | 'wa_addressable', 91 | 92 | 'inviter', # thread inviter? volatile 93 | 94 | # seems like fields in it appear and disappear for no reason without any actual status changes 95 | 'friendship_status', 96 | 97 | 'hide_in_thread', 98 | 'forward_score', 99 | 100 | ## I think these are properties of messages.user json blob 101 | 'paid_partnership_info', 102 | 'biz_user_inbox_state', 103 | 'has_exclusive_feed_content', 104 | 'has_encrypted_backup', 105 | 'is_using_unified_inbox_for_direct', 106 | 'personal_account_ads_page_id', 107 | 'personal_account_ads_page_name', 108 | 'show_account_transparency_details', 109 | 'organic_tracking_token', 110 | 'should_show_category', 111 | 'fundraiser_tag', 112 | ## 113 | 114 | 'unseen_count', 115 | 'send_attribution', 116 | 'send_silently', 117 | 'smart_suggestion', 118 | 'idempotence_token', 119 | 120 | ## threads.recipients properties 121 | 'can_coauthor_posts', 122 | 'can_coauthor_posts_with_music', 123 | ## 124 | 125 | 'visual_messages_newest_cursor', 126 | 'thread_messages_oldest_cursor', 127 | ]) # fmt: skip 128 | j = patch_atoms(j, patch=_patch_volatile_urls) 129 | return json.dumps(j, sort_keys=True).encode('utf8') 130 | 131 | 132 | class Normaliser(SqliteNormaliser): 133 | MULTIWAY = True 134 | PRUNE_DOMINATED = True 135 | 136 | def check(self, c) -> None: 137 | tables = Tool(c).get_tables() 138 | msgs = tables['messages'] 139 | assert 'timestamp' in msgs 140 | assert 'text' in msgs 141 | 142 | _threads = tables['threads'] 143 | 144 | def cleanup(self, c) -> None: 145 | self.check(c) 146 | 147 | t = Tool(c) 148 | t.drop('session') # super volatile 149 | 150 | for tbl in ['messages', 'threads']: 151 | t.drop_cols( 152 | tbl, 153 | cols=[ 154 | # changes all the time without changing content 155 | '_id', 156 | # 157 | # kinda volatile, seems to change some time after it's inserted? 
158 | # doesn't seem used in any indexes etc 159 | 'client_item_id', 160 | ], 161 | ) 162 | 163 | t.drop_cols('threads', cols=['last_activity_time']) 164 | 165 | # so message/thread_info tables also contain a json field with raw data, and it's very volatile 166 | # to clean it up, tried using this at first: 167 | # SELECT _id, message_type, message, json_remove(message, (SELECT DISTINCT(fullkey) FROM messages, json_tree(message) WHERE atom LIKE '%cdninstagram%')) FROM messages ORDER BY message_type 168 | # it was promising, but it seems that it's not possible to pass multiple arguments from a scalar subquery 169 | # it only ended up removing the first key 170 | c.create_function("CLEANUP_JSONS", 1, _cleanup_jsons) 171 | queries = [ 172 | 'UPDATE messages SET message = CLEANUP_JSONS(message)', 173 | 'UPDATE threads SET thread_info = CLEANUP_JSONS(thread_info)', 174 | ] 175 | for query in queries: 176 | list(c.execute(query)) 177 | # a bit insane and experimental... but worked surprisingly smoothly and fast? 178 | ## 179 | 180 | 181 | if __name__ == '__main__': 182 | Normaliser.main() 183 | -------------------------------------------------------------------------------- /src/bleanser/modules/json_new.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | from bleanser.core.modules.json import * # noqa: F403, F401 4 | 5 | warnings.warn("Module 'bleanser.modules.json_new' is deprecated. Use 'bleanser.core.modules.json' instead.", DeprecationWarning) 6 | 7 | 8 | if __name__ == '__main__': 9 | JsonNormaliser.main() # noqa: F405 10 | -------------------------------------------------------------------------------- /src/bleanser/modules/kobo.py: -------------------------------------------------------------------------------- 1 | from sqlite3 import Connection 2 | 3 | from bleanser.core.modules.sqlite import SqliteNormaliser, Tool 4 | 5 | 6 | class Normaliser(SqliteNormaliser): 7 | # events are only snapshots, so probs makes sense 8 | MULTIWAY = True 9 | PRUNE_DOMINATED = True 10 | 11 | def check(self, c: Connection) -> None: 12 | tool = Tool(c) 13 | tables = tool.get_tables() 14 | assert 'content' in tables, tables 15 | bm = tables['Bookmark'] 16 | # fmt: off 17 | assert 'ExtraAnnotationData' in bm, bm 18 | assert 'BookmarkID' in bm, bm 19 | assert 'DateCreated' in bm, bm 20 | # fmt: on 21 | assert 'BookAuthors' in tables, tables 22 | 23 | def cleanup(self, c: Connection) -> None: 24 | self.check(c) 25 | 26 | tool = Tool(c) 27 | 28 | tool.fix_bad_blob_column(table='Activity', column='Data') 29 | tool.fix_bad_blob_column(table='Event', column='ExtraData') 30 | tool.fix_bad_blob_column(table='Bookmark', column='ExtraAnnotationData') 31 | 32 | tool.drop('content') # some cached book data? so not very interesting when it changes.. 33 | tool.drop('content_keys') # just some image meta 34 | tool.drop('volume_shortcovers') # just some hashes 35 | tool.drop('volume_tabs') # some hashes 36 | tool.drop('KoboPlusAssets') # some builtin faqs/manuals etc 37 | tool.drop('KoboPlusAssetGroup') # some builtin faqs/manuals etc 38 | tool.drop('Tab') # shop tabs 39 | tool.drop('Achievement') 40 | # TODO DbVersion? 41 | # TODO version in user table? 42 | 43 | tool.drop_cols(table='Event', cols=['Checksum']) 44 | 45 | ## these are changing all the time 46 | # TODO not sure about RecentBook? 
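# (untested: if RecentBook entries turn out to be just as noisy, "RecentBook" could simply be
#  added to the Type list below -- see also the TODO right after this block)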
47 | c.execute(''' 48 | DELETE FROM Activity 49 | WHERE Type IN ( 50 | "Recommendations", 51 | "TopPicksTab", 52 | "Top50" 53 | ) 54 | ''') 55 | ## 56 | # TODO hmm maybe drop all RecentBook from Activity? although doesn't help all that much 57 | 58 | c.execute(''' 59 | DELETE FROM AnalyticsEvents 60 | WHERE Type IN ( 61 | "PluggedIn", 62 | "BatteryLevelAtSync" 63 | )''') 64 | 65 | ## this changes all the time (Shelf only contains some meta entries, this isn't actual book access time) 66 | c.execute('UPDATE Shelf SET _SyncTime = NULL, LastAccessed = NULL, LastModified = NULL WHERE Id = "ReadingList"') 67 | ## 68 | 69 | tool.drop_cols( 70 | table='user', 71 | cols=[ 72 | 'SyncContinuationToken', 73 | 'KoboAccessToken', 74 | 'KoboAccessTokenExpiry', 75 | 'AuthToken', 76 | 'RefreshToken', 77 | 'Loyalty', 78 | 'PrivacyPermissions', # not very interesting, contains this stuff https://github.com/shadow81627/scrapey/blob/6dc2a7bba7f5adf2e3335c68e30208c71cfb5c2d/cookies.json#L950 79 | ], 80 | ) 81 | tool.drop_cols( 82 | table='Bookmark', 83 | cols=[ 84 | # TODO UserID?? 85 | # TODO ugh. DateCreated sometimes rounds to nearest second? wtf... 86 | # 87 | 'SyncTime', 88 | 'Version', # not sure what it is, but sometimes changing? 89 | # 90 | 'StartContainerChildIndex', 91 | 'EndContainerChildIndex', # ???? 92 | # 93 | 'StartContainerPath', 94 | 'EndContainerPath', 95 | ], 96 | ) 97 | # TODO Event table -- not sure... it trackes event counts, so needs to be cumulative or something? 98 | # yep, they def seem to messing up a lot 99 | # TODO Activity -- dates changing all the time... not sure 100 | 101 | 102 | if __name__ == '__main__': 103 | Normaliser.main() 104 | -------------------------------------------------------------------------------- /src/bleanser/modules/lastfm.py: -------------------------------------------------------------------------------- 1 | from bleanser.core.modules.json import Json, JsonNormaliser 2 | 3 | 4 | class Normaliser(JsonNormaliser): 5 | MULTIWAY = True 6 | PRUNE_DOMINATED = True 7 | 8 | def cleanup(self, j: Json) -> Json: 9 | # ugh sometimes case changes for no reason 10 | for x in j: 11 | for k, v in list(x.items()): 12 | if isinstance(v, str): 13 | # defensive, there was a date (around 2019-01-16) when dates glitched and were ints... 14 | x[k] = v.lower() 15 | return j 16 | # todo would be nice to use jq for that... e.g. 
older filter was 17 | # 'sort_by(.date) | map(map_values(ascii_downcase?))' 18 | 19 | 20 | if __name__ == '__main__': 21 | Normaliser.main() 22 | 23 | 24 | def test_lastfm() -> None: 25 | """ 26 | This test also highlights how multiway cleanup is more efficient than twoway 27 | """ 28 | from bleanser.tests.common import skip_if_no_data 29 | 30 | skip_if_no_data() 31 | 32 | from bleanser.tests.common import TESTDATA, actions, hack_attribute 33 | 34 | data = TESTDATA / 'lastfm' 35 | paths = sorted(data.glob('*.json')) 36 | 37 | with hack_attribute(Normaliser, key='MULTIWAY', value=False): 38 | res = actions(paths=paths, Normaliser=Normaliser) 39 | assert [p.name for p in res.pruned] == [ 40 | 'lastfm_20211107T011431Z.json', # fully contained in lastfm_20211127T011459Z 41 | ] 42 | 43 | with hack_attribute(Normaliser, key='MULTIWAY', value=True): 44 | res = actions(paths=paths, Normaliser=Normaliser) 45 | assert [p.name for p in res.remaining] == [ 46 | 'lastfm_2017-08-29.json', # keeping : initial: X + a 47 | 48 | # disappeared (a), and a bunch of items added (Y) 49 | # (a) ::: {"album": "", "artist": "pusha t/haim/q-tip/stromae/lorde", "date": "1503868125", "name": "meltdown (\u0438\u0437 \u0444\u0438\u043b\u044c\u043c\u0430 \u00ab\u0433\u043e\u043b\u043e\u0434\u043d\u044b\u0435 \u0438\u0433\u0440\u044b: \u0441\u043e\u0439\u043a\u0430-\u043f\u0435\u0440\u0435\u0441\u043c\u0435\u0448\u043d\u0438\u0446\u0430\u00bb. \u0447\u0430\u0441\u0442\u044c i)"} 50 | # 'lastfm_2017-09-01.json', # removing: X + Y 51 | 52 | # bunch of items were added (Z + b) 53 | 'lastfm_2017-09-19.json', # keeping : X + Y + Z + b 54 | 55 | # but b disappeared in this: so the previous item is the last pivot 56 | # (b) ::: {"album": "", "artist": "denny berthiaume", "date": "1505649846", "name": "moon river"} 57 | # 'lastfm_2017-09-22.json', # removing: X + Y + Z + Q 58 | 59 | 'lastfm_2017-10-31.json', # keeping : last item in group 60 | 61 | # this item is only present in this file: 62 | # ::: {"album": "departed glories", "artist": "biosphere", "date": "1635619124", "name": "than is the mater"} 63 | 'lastfm_20211031T001458Z.json', 64 | 65 | # this item is only present in this file: 66 | # > ::: {"album": "2010", "artist": "earl sweatshirt", "date": "1638578097", "name": "2010"} 67 | 'lastfm_20211204T011641Z.json', 68 | 69 | # last item 70 | 'lastfm_20220103T011522Z.json', 71 | ] # fmt: skip 72 | -------------------------------------------------------------------------------- /src/bleanser/modules/monzo.py: -------------------------------------------------------------------------------- 1 | from bleanser.core.modules.json import Json, JsonNormaliser, delkeys 2 | 3 | 4 | class Normaliser(JsonNormaliser): 5 | MULTIWAY = True 6 | PRUNE_DOMINATED = True 7 | 8 | def cleanup(self, j: Json) -> Json: 9 | delkeys( 10 | j, 11 | keys=[ 12 | 'account_balance', # obvs flaky 13 | 'suggested_tags', 14 | 'website', 15 | # 16 | 'address', 17 | 'formatted', 18 | 'logo', 19 | # 20 | ## flaky and useless 21 | 'mastercard_lifecycle_id', 22 | 'mastercard_clearing_message_id', 23 | 'token_transaction_identifier', 24 | 'tab_id', 25 | ## 26 | # 27 | 'settled', 28 | 'updated', 29 | 'amount_is_pending', 30 | # 31 | 'payee_id', # odd but sometimes flaky 32 | 'can_add_to_tab', 33 | ], 34 | ) 35 | 36 | if isinstance(j, list): 37 | # old format, only transactions for one account 38 | return j 39 | 40 | # flatten out transactions 41 | for account, d in list(j.items()): 42 | transactions = d['data']['transactions'] 43 | j[f'{account}_transactions'] = 
transactions 44 | del d['data']['transactions'] 45 | return j 46 | 47 | 48 | if __name__ == '__main__': 49 | Normaliser.main() 50 | -------------------------------------------------------------------------------- /src/bleanser/modules/pinboard.py: -------------------------------------------------------------------------------- 1 | from bleanser.core.modules.json import JsonNormaliser 2 | 3 | 4 | class Normaliser(JsonNormaliser): 5 | MULTIWAY = True 6 | PRUNE_DOMINATED = True 7 | 8 | 9 | if __name__ == '__main__': 10 | Normaliser.main() 11 | 12 | 13 | # TODO pinboard: tag summaries might be flaky 14 | # might be worth doing 15 | # if isinstance(j, dict): 16 | # del j['tags'] 17 | 18 | 19 | def test_pinboard() -> None: 20 | from bleanser.tests.common import skip_if_no_data 21 | 22 | skip_if_no_data() 23 | 24 | from bleanser.tests.common import TESTDATA, actions 25 | 26 | data = TESTDATA / 'pinboard' 27 | 28 | paths = sorted(data.glob('*.json')) 29 | 30 | res = actions(paths=paths, Normaliser=Normaliser) 31 | 32 | # note: some items duplicate in pinboard... 33 | # e.g. in bookmarks_2019-08-06.json.xz 34 | # ::: {"description": "Visual Leak Detector - Enhanced Memory Leak Detection for Visual C++ - CodeProject", "extended": "", "hash": "ef6dcf9d2987ea1f4919b31024c33662", "href": "http://www.codeproject.com/KB/applications/visualleakdetector.aspx", "meta": "8341db79448607b145078e00e69c8003", "shared": "yes", "tags": "debugging cpp", "time": "2014-02-09T01:02:57Z", "toread": "no"} 35 | 36 | assert [p.name for p in res.remaining] == [ 37 | 38 | 'bookmarks_2019-08-06.json' , # first in group 39 | # fully contained in the next 40 | # 'bookmarks_2019-08-07.json' , : MOVE 41 | 42 | # has to keep the next because for example this bookmark is flaky: 43 | # rg 'An Easy Explaination Of First And Follow Sets' | sort 44 | # bookmarks_2019-08-07.json:{"href":"http:\/\/www.jambe.co.nz\/UNI\/FirstAndFollowSets.html","description":"An Easy Explaination Of First And Follow Sets","extended":"","meta":"c68c6b649d587543bae12367e6fce8ec","hash":"3688a0bcfb0ee9f7cb7fbda43aabe131","time":"2014-02-09T01:03:03Z","shared":"yes","toread":"no","tags":"cs parsing"}, 45 | # bookmarks_20190924T010105Z.json:{"href":"http:\/\/www.jambe.co.nz\/UNI\/FirstAndFollowSets.html","description":"An Easy Explaination Of First And Follow Sets","extended":"","meta":"c68c6b649d587543bae12367e6fce8ec","hash":"3688a0bcfb0ee9f7cb7fbda43aabe131","time":"2014-02-09T01:03:03Z","shared":"yes","toread":"no","tags":"cs parsing"}, 46 | # bookmarks_20190929T124250Z.json: "description": "An Easy Explaination Of First And Follow Sets", 47 | # pinboard_20201231T011022Z.json: "description": "An Easy Explaination Of First And Follow Sets", 48 | # pinboard_20210220T011105Z.json: "description": "An Easy Explaination Of First And Follow Sets", 49 | # pinboard_20210221T011013Z.json: "description": "An Easy Explaination Of First And Follow Sets", 50 | # pinboard_20220103T011019Z.json: "description": "An Easy Explaination Of First And Follow Sets", 51 | 'bookmarks_20190924T010105Z.json', #: will keep 52 | 53 | # there is a whole bunch of flaky bookmarks like that ^ in pinboard, so won't bother annotating the rest 54 | 55 | # 'bookmarks_20190925T010106Z.json', : MOVE 56 | 'bookmarks_20190929T010107Z.json', #: will keep 57 | 'bookmarks_20190929T124250Z.json', #: will keep 58 | # 'bookmarks_20190930T010107Z.json', : MOVE 59 | # 'bookmarks_20191015T010107Z.json', : MOVE 60 | # 'bookmarks_20191016T010107Z.json', : MOVE 61 | # 'bookmarks_20191122T010108Z.json', : 
MOVE 62 | # 'bookmarks_20191123T010107Z.json', : MOVE 63 | 'bookmarks_20191205T010108Z.json', #: will keep 64 | # 'bookmarks_20191206T010107Z.json', : MOVE 65 | # 'bookmarks_20191207T010107Z.json', : MOVE 66 | 'pinboard_20200501T011005Z.json' , #: will keep 67 | # 'pinboard_20200502T011005Z.json' , : MOVE 68 | 'pinboard_20200614T011006Z.json' , #: will keep 69 | 'pinboard_20200615T001107Z.json' , #: will keep 70 | # 'pinboard_20200616T001008Z.json' , : MOVE 71 | # 'pinboard_20200812T001014Z.json' , : MOVE 72 | # 'pinboard_20200813T001016Z.json' , : MOVE 73 | # 'pinboard_20200814T001018Z.json' , : MOVE 74 | # 'pinboard_20200815T001017Z.json' , : MOVE 75 | 'pinboard_20200826T001017Z.json' , #: will keep 76 | # 'pinboard_20200827T001019Z.json' , : MOVE 77 | 'pinboard_20201230T011025Z.json' , #: will keep 78 | # 'pinboard_20201231T011022Z.json' , : MOVE 79 | # 'pinboard_20210220T011105Z.json' , : MOVE 80 | # 'pinboard_20210221T011013Z.json' , : MOVE 81 | 'pinboard_20220103T011019Z.json' , #: will keep 82 | ] # fmt: skip 83 | -------------------------------------------------------------------------------- /src/bleanser/modules/pocket.py: -------------------------------------------------------------------------------- 1 | from bleanser.core.modules.json import Json, JsonNormaliser 2 | 3 | 4 | class Normaliser(JsonNormaliser): 5 | MULTIWAY = True 6 | PRUNE_DOMINATED = True 7 | 8 | def cleanup(self, j: Json) -> Json: 9 | del j['since'] # flaky 10 | return j 11 | 12 | 13 | if __name__ == '__main__': 14 | Normaliser.main() 15 | -------------------------------------------------------------------------------- /src/bleanser/modules/podcastaddict_android.py: -------------------------------------------------------------------------------- 1 | from bleanser.core.modules.sqlite import SqliteNormaliser, Tool 2 | 3 | 4 | class Normaliser(SqliteNormaliser): 5 | MULTIWAY = True 6 | PRUNE_DOMINATED = True 7 | 8 | ALLOWED_BLOBS = { 9 | ('fts_virtual_episode_segments', 'block'), 10 | ('fts_virtual_episode_segdir', 'root'), 11 | ('fts_virtual_episode_docsize', 'size'), 12 | ('fts_virtual_episode_stat', 'value'), 13 | } 14 | 15 | # TODO this would be useful as a base class method 16 | # could be called before cleanup/extract etc 17 | def check(self, c) -> None: 18 | tables = Tool(c).get_tables() 19 | assert 'podcasts' in tables, tables 20 | eps = tables['episodes'] 21 | # to make sure it's safe to use multiway/prune dominated: 22 | assert 'playbackDate' in eps 23 | assert 'position_to_resume' in eps 24 | 25 | def cleanup(self, c) -> None: 26 | self.check(c) 27 | 28 | t = Tool(c) 29 | ## often changing, no point keeping 30 | t.drop_cols( 31 | table='episodes', 32 | cols=[ 33 | 'thumbnail_id', 34 | 'new_status', 35 | 'downloaded_status_int', 36 | 'thumbsRating', 37 | ], 38 | ) 39 | 40 | # no point tracking podcasts we're not following 41 | c.execute('DELETE FROM podcasts WHERE subscribed_status = 0') 42 | 43 | t.drop_cols( 44 | table='podcasts', 45 | cols=[ 46 | ## volatile at times, a bit annoying 47 | 'author', 48 | 'description', 49 | ## 50 | 'last_modified', 51 | 'etag', # ?? sometimes contains quoted last_modified or something.. 
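# ^ presumably the feed's HTTP ETag, i.e. a server-side validator (which would explain why it
#   sometimes mirrors last_modified) -- so it churns on nearly every poll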
52 | 'rating', 53 | 'reviews', 54 | 'iTunesID', 55 | 'latest_publication_date', 56 | 'averageDuration', 57 | 'frequency', 58 | 'episodesNb', 59 | 'subscribers', 60 | 'thumbnail_id', 61 | 'update_date', 62 | 'update_status', 63 | 'filter_chapter_excluded_keywords', 64 | 'category', 65 | 'explicit', 66 | 'server_id', 67 | ], 68 | ) 69 | ## 70 | 71 | ## changing often and likely not interesting 72 | t.drop('ad_campaign') 73 | t.drop('bitmaps') 74 | t.drop('blocking_services') 75 | t.drop('content_policy_violation') 76 | t.drop('fts_virtual_episode_stat') 77 | t.drop('fts_virtual_episode_docsize') 78 | t.drop('fts_virtual_episode_segments') 79 | t.drop('fts_virtual_episode_segdir') 80 | t.drop('ordered_list') # just some random numbers, always changing 81 | t.drop('statistics') # just some random numbers, mostly empty 82 | t.drop('radio_search_results') 83 | t.drop('topics') # some random topic names.. at some point just disappeared 84 | t.drop('iha') # no idea what is it, contains one entry sometimes; volatile 85 | 86 | ## probably unnecessary? 87 | # tool.drop('chapters') 88 | # tool.drop('teams') 89 | # tool.drop('topics') 90 | # tool.drop('relatedPodcasts') 91 | # tool.drop('content_policy_violation') # lol 92 | ## 93 | 94 | 95 | if __name__ == '__main__': 96 | Normaliser.main() 97 | 98 | 99 | def test_podcastaddict() -> None: 100 | from bleanser.tests.common import skip_if_no_data 101 | 102 | skip_if_no_data() 103 | 104 | from bleanser.tests.common import TESTDATA, actions2 105 | 106 | res = actions2(path=TESTDATA / 'podcastaddict_android', rglob='**/*.db*', Normaliser=Normaliser) 107 | assert res.remaining == [ 108 | '20180106220736/podcastAddict.db', 109 | '20190227212300/podcastAddict.db', 110 | '20200217195816/podcastAddict.db', 111 | 112 | '20200406041500/podcastAddict.db', 113 | # '20210306070017/podcastAddict.db', 114 | # '20210306070020/podcastAddict.db', 115 | '20210306140046/podcastAddict.db', 116 | 117 | # keep: episode position changed 118 | '20210306165958/podcastAddict.db', 119 | 120 | # '20210509141916/podcastAddict.db', 121 | # '20210510070001/podcastAddict.db', 122 | # '20210511185801/podcastAddict.db', 123 | '20210513164819/podcastAddict.db', 124 | # some podcast lengths changed... might be useful 125 | '20210517000609/podcastAddict.db', 126 | # '20211226145720/podcastAddict.db', 127 | # '20211226172310/podcastAddict.db', 128 | # some podcast authors changed... dunno if useful but whatever 129 | '20211228010151/podcastAddict.db', 130 | ] # fmt: skip 131 | -------------------------------------------------------------------------------- /src/bleanser/modules/reddit.py: -------------------------------------------------------------------------------- 1 | from itertools import chain 2 | 3 | from bleanser.core.modules.json import Json, JsonNormaliser, delkeys 4 | 5 | REDDIT_IGNORE_KEYS = { 6 | ## TODO hmm maybe do something about these 7 | ## might be useful to keep 8 | 'subreddit_subscribers', 9 | 'subscribers', 10 | 'ups', 11 | 'score', 12 | 'num_comments', 13 | 'upvote_ratio', 14 | ### 15 | 16 | ## TODO ?? 17 | 'pwls', # TODO what is it?? 18 | 'likes', # todo? 19 | 'wls', # TODO ??? 
20 | ## 21 | 22 | '_comments', 23 | 'accept_chats', 24 | 'accept_pms', 25 | 'advertiser_category', 26 | 'all_awardings', 27 | 'allow_chat_post_creation', 28 | 'allow_discovery', 29 | 'allow_galleries', 30 | 'allow_images', 31 | 'allow_live_comments', 32 | 'allow_polls', 33 | 'allow_videogifs', 34 | 'allow_videos', 35 | 'allowed_galleries', 36 | 'archived', 37 | 'associated_award', 38 | 'audience_target', 39 | 'author_flair_background_color', 40 | 'author_flair_css_class', 41 | 'author_flair_richtext', 42 | 'author_flair_template_id', 43 | 'author_flair_text', 44 | 'author_flair_text_color', 45 | 'author_flair_type', 46 | 'author_patreon_flair', 47 | 'author_premium', 48 | 'awarders', 49 | 'banner_background_color', 50 | 'banner_background_image', 51 | 'banner_img', 52 | 'banner_size', 53 | 'can_assign_link_flair', 54 | 'can_assign_user_flair', 55 | 'can_gild', 56 | 'collapse_deleted_comments', 57 | 'collapsed', 'collapsed_reason', # todo potentially interesting? 58 | 'comment_score_hide_mins', 59 | 'community_icon', 60 | 'content_categories', 61 | 'crosspost_parent_list', 62 | 'dash_url', 63 | 'discussion_type', 64 | 'emojis_custom_size', 65 | 'emojis_enabled', 66 | 'event_start', 'event_end', 'event_is_live', 67 | 'free_form_reports', 68 | 'gid_1', 69 | 'gid_2', 70 | 'gid_3', 71 | 'gilded', 72 | 'gildings', 73 | 'has_menu_widget', 74 | 'header_img', 75 | 'header_size', 76 | 'header_title', 77 | 'hide_score', 78 | 'hls_url', 79 | 'icon_img', 80 | 'icon_name', 81 | 'icon_size', 82 | 'icon_url', 83 | 'is_chat_post_feature_enabled', 84 | 'is_crosspostable', 85 | 'is_crosspostable_subreddit', 86 | 'is_robot_indexable', 87 | 'is_self', 88 | 'is_video', 89 | 'key_color', 90 | 'link_flair_css_class', 91 | 'link_flair_enabled', 92 | 'link_flair_position', 93 | 'link_flair_richtext', 94 | 'link_flair_template_id', 95 | 'link_flair_text', 96 | 'link_flair_type', 97 | 'linked_identities', 98 | 'media_embed', 99 | 'media_metadata', 100 | 'mobile_banner_image', 101 | 'new', 102 | 'no_follow', 103 | 'oembed', 104 | 'og_description', 'og_title', 105 | 'original_content_tag_enabled', 106 | 'over18', 107 | 'over_18', 108 | 'owner_id', 109 | 'parent_whitelist_status', # some ads thing 110 | 'password_set', 111 | 'post_hint', 112 | 'pref_no_profanity', 'pref_geopopular', 'pref_top_karma_subreddits', 113 | 'primary_color', 114 | 'report_reasons', 115 | 'restrict_commenting', 116 | 'restrict_posting', 117 | 'rte_mode', 118 | 'score_hidden', 119 | 'secure_media', 120 | 'secure_media_embed', 121 | 'send_replies', 122 | 'show_media', 123 | 'show_media_preview', 124 | 'spoilers_enabled', 125 | 'steward_report', 126 | 'stickied', 127 | 'submission_type', 128 | 'submit_link_label', 129 | 'submit_text_label', 130 | 'suggested_comment_sort', 131 | 'suggested_sort', 132 | 'thumbnail', 133 | 'thumbnail_height', 134 | 'thumbnail_width', 135 | 'top_awarded_type', 136 | 'total_awards_received', 137 | 'treatment_tags', 138 | 'user_flair_richtext', 139 | 'user_flair_template_id', 140 | 'user_flair_text_color', 141 | 'user_flair_type', 142 | 'user_reports', 143 | 'videostream_links_count', 144 | 'whitelist_status', # some ads thing 145 | 'wiki_enabled', 146 | 'snoovatar_img', 147 | 'snoovatar_size', 148 | 'allow_talks', 149 | 150 | ## very flaky 151 | 'link_flair_background_color', 152 | 'link_flair_text_color', 153 | 'call_to_action', # sometimes null, sometimes not present? 
154 | ## 155 | ## 156 | 157 | ## nothing interesting, some subreddit settings 158 | 'allowed_media_in_comments', 159 | 'comment_contribution_settings', 160 | 'should_archive_posts', 161 | ## 162 | 163 | 'awardee_karma', # sometimes goes to 0 for no reason 164 | 165 | # TODO ?? 166 | # 'likes', 167 | # 'url', # ugh. changed from www.reddit.... to link without reddit domain 168 | # 'is_favorited', 169 | # 'is_subscriber', 170 | # 'domain', 171 | # should_archive_posts -- not sure? 172 | # 173 | # 174 | # subreddit_type: public/restricted -- actually quite useful info! 175 | # profile -> link_karma, comment_karma -- probs useful to keep 176 | # 177 | # TODO maybe, num_crossposts? have only seen once so far 178 | } # fmt: skip 179 | 180 | 181 | class Normaliser(JsonNormaliser): 182 | # NOTE: we don't want to prune dominated/use multiway in reddit, because that way we lose timestamps for changes!!! 183 | PRUNE_DOMINATED = False 184 | 185 | def cleanup(self, j: Json) -> Json: 186 | delkeys(j, keys=REDDIT_IGNORE_KEYS) 187 | 188 | # hmm, 'created' changes all the time for some reason starting from 20181124201020 189 | # https://www.reddit.com/r/redditdev/comments/29991t/whats_the_difference_between_created_and_created/ciiuk24/ 190 | # ok, it's broken, should use created_utc instead 191 | for v in j.values(): 192 | if not isinstance(v, list): 193 | continue 194 | for i in v: 195 | if 'created_utc' in i: 196 | i.pop('created', None) 197 | 198 | i.pop('subreddit_type', None) 199 | 200 | ## karma is flaky, goes up and down even without actual votes 201 | ## so make it a bit smoother 202 | profile = j['profile'] 203 | for kf in ['link_karma', 'total_karma']: 204 | k = profile.get(kf) 205 | if k is not None: 206 | profile[kf] = k // 10 * 10 207 | # ugh, total karma is flaking between two values for me consistently 208 | # but removing it completely only gets rid of 10% of files? 
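# (i.e. floor to the nearest 10: e.g. 8431 and 8437 both become 8430, so small flaps
#  in karma no longer produce a diff)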
209 | ## 210 | 211 | for u in chain(j['upvoted'], j['downvoted']): 212 | ## not sure what it is, but flaky from "" to null 213 | u.pop('category', None) 214 | 215 | ## very flaky, often goes from gfycat.com to null 216 | media = u.get('media') 217 | if media is not None: 218 | media.pop('type', None) 219 | if media is None or len(media) == 0: 220 | u.pop('media', None) 221 | 222 | # gallery_data is sometimes flaking to none 223 | 224 | for s in j['subreddits']: 225 | # volatile when we've got enough subreddits -- not worth keeping 226 | s.pop('description', None) 227 | s.pop('public_description', None) 228 | s.pop('public_description_html', None) 229 | s.pop('submit_text', None) 230 | s.pop('submit_text_html', None) 231 | s.pop('disable_contributor_requests', None) 232 | 233 | return j 234 | 235 | 236 | if __name__ == '__main__': 237 | Normaliser.main() 238 | 239 | 240 | def test_reddit_1() -> None: 241 | from bleanser.tests.common import skip_if_no_data 242 | 243 | skip_if_no_data() 244 | 245 | from bleanser.tests.common import TESTDATA, actions 246 | # TODO add a test for multiway 247 | 248 | data = TESTDATA / 'reddit' 249 | paths = sorted(data.glob('*.json*')) 250 | 251 | res = actions(paths=paths, Normaliser=Normaliser) 252 | 253 | assert [p.name for p in res.remaining] == [ 254 | 'reddit_20211227T164130Z.json', # first in group 255 | 'reddit_20211227T170106Z.json', # saved item rolled over 256 | 'reddit_20211227T171058Z.json', # some saved items rolled over 257 | 258 | 'reddit_20211227T173058Z.json', # keeping boundary 259 | 'reddit_20211230T034059Z.json', # some items rolled over 260 | 'reddit_20211230T035056Z.json', # some things legit disappeared due to api limits 261 | 262 | 'reddit_20220102T132059Z.json', # boundary for the next one 263 | 'reddit_20220102T142057Z.json', # author changed (likely deleted?) 
264 | 'reddit_20220102T164059Z.json', # last in group 265 | ] # fmt: skip 266 | 267 | 268 | def test_reddit_2() -> None: 269 | from bleanser.tests.common import skip_if_no_data 270 | 271 | skip_if_no_data() 272 | 273 | from bleanser.tests.common import TESTDATA, actions 274 | 275 | data = TESTDATA / 'reddit2' 276 | paths = sorted(data.glob('*.json*')) 277 | 278 | res = actions(paths=paths, Normaliser=Normaliser) 279 | # note: fieles appear to be spaced out by 20 mins instead of 10 (backup frequency) 280 | # this is ok, because I temporarily moved every other file away in the absence of bleanser 281 | assert [p.name for p in res.remaining] == [ 282 | 'reddit_20210803T121056Z.json', 283 | 284 | # ^v -- identical 285 | 286 | 'reddit_20210803T213053Z.json', 287 | 288 | # here: some saved items rolled over 289 | 'reddit_20210803T215050Z.json', 290 | 291 | 'reddit_20210804T213055Z.json', 292 | ] # fmt: skip 293 | -------------------------------------------------------------------------------- /src/bleanser/modules/rescuetime.py: -------------------------------------------------------------------------------- 1 | from bleanser.core.modules.json import JsonNormaliser 2 | 3 | 4 | class Normaliser(JsonNormaliser): 5 | MULTIWAY = True 6 | PRUNE_DOMINATED = True 7 | 8 | 9 | if __name__ == '__main__': 10 | Normaliser.main() 11 | 12 | 13 | def test_rescuetime() -> None: 14 | from bleanser.tests.common import skip_if_no_data 15 | 16 | skip_if_no_data() 17 | 18 | from bleanser.tests.common import TESTDATA, actions2 19 | 20 | res = actions2(path=TESTDATA / 'rescuetime', rglob='*.json*', Normaliser=Normaliser) 21 | assert res.remaining == [ 22 | 'rescuetime_2018-01-02.json.xz', 23 | 'rescuetime_2018-01-04.json.xz', 24 | 'rescuetime_2018-01-07.json.xz', 25 | 'rescuetime_2018-01-10.json.xz', 26 | 'rescuetime_2018-01-11.json.xz', 27 | # 28 | # todo these should be present in the result for the following group 29 | # not sure how to properly test? 30 | # maybe just grep... 
after applying instructions 31 | # Entry(dt=datetime.datetime(2020, 2, 19, 0, 55), duration_s=9, activity='mobile - com.android.launcher3'), 32 | # Entry(dt=datetime.datetime(2020, 2, 19, 0, 55), duration_s=9, activity='mobile - com.termux'), 33 | 'rescuetime_20200204T010205Z.json', 34 | 'rescuetime_20200219T010207Z.json', 35 | 'rescuetime_20200305T010206Z.json', 36 | # 37 | 'rescuetime_20211209T011109Z.json.xz', 38 | 'rescuetime_20211218T011116Z.json.xz', 39 | 'rescuetime_20211220T011110Z.json.xz', 40 | 'rescuetime_20211224T011109Z.json.xz', 41 | ] 42 | -------------------------------------------------------------------------------- /src/bleanser/modules/rescuetime_android.py: -------------------------------------------------------------------------------- 1 | from bleanser.core.modules.sqlite import SqliteNormaliser, Tool 2 | 3 | 4 | class Normaliser(SqliteNormaliser): 5 | MULTIWAY = True 6 | PRUNE_DOMINATED = True 7 | 8 | def check(self, c) -> None: 9 | tables = Tool(c).get_tables() 10 | events = tables['LoggedEvent'] 11 | assert 'started' in events, events 12 | assert 'appName' in events, events 13 | 14 | def cleanup(self, c) -> None: 15 | self.check(c) 16 | 17 | t = Tool(c) 18 | t.drop('ScanningPause') # not sure what is it, but seems to be some sort of helper table 19 | t.drop('SentryLogEntry') # some internal logging, contributes to tons of changes 20 | # todo there is also TimeLog, but it seems that they are also write only and consistent so don't impact diffs 21 | 22 | 23 | if __name__ == '__main__': 24 | Normaliser.main() 25 | -------------------------------------------------------------------------------- /src/bleanser/modules/skype_android.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from bleanser.core.modules.json import delkeys 4 | from bleanser.core.modules.sqlite import SqliteNormaliser, Tool 5 | 6 | 7 | class Normaliser(SqliteNormaliser): 8 | MULTIWAY = True 9 | PRUNE_DOMINATED = True 10 | 11 | def check(self, c) -> None: 12 | tables = Tool(c).get_tables() 13 | messages = tables['conversationsv14'] 14 | assert 'nsp_data' in messages, messages 15 | 16 | def cleanup(self, c) -> None: 17 | self.check(c) 18 | 19 | t = Tool(c) 20 | t.drop('conversationsv14_searchTerms_content') 21 | t.drop('conversationsv14_searchTerms_segments') 22 | t.drop('conversationsv14_searchTerms_segdir') 23 | 24 | t.drop('internaldata') # very volatile 25 | 26 | t.drop('telemetrycachev3') # volatile, nothing interesting here 27 | 28 | def _cleanup_jsons(s): 29 | if s is None: 30 | return None 31 | j = json.loads(s) 32 | delkeys( 33 | j, 34 | keys=[ 35 | 'fetchedDate', # from profilecachev8, very volatile 36 | 'up', # from miniprofilecachev8, very volatile 37 | ], 38 | ) 39 | return json.dumps(j) 40 | 41 | c.create_function("CLEANUP_JSONS", 1, _cleanup_jsons) 42 | list(c.execute('UPDATE profilecachev8 SET nsp_data = CLEANUP_JSONS(nsp_data)')) 43 | list(c.execute('UPDATE miniprofilecachev8 SET nsp_data = CLEANUP_JSONS(nsp_data)')) 44 | 45 | 46 | if __name__ == '__main__': 47 | Normaliser.main() 48 | -------------------------------------------------------------------------------- /src/bleanser/modules/sleepasandroid_android.py: -------------------------------------------------------------------------------- 1 | from bleanser.core.modules.sqlite import SqliteNormaliser, Tool 2 | 3 | 4 | class Normaliser(SqliteNormaliser): 5 | MULTIWAY = False # could use it, but no need really? 
6 | PRUNE_DOMINATED = True 7 | 8 | def check(self, c) -> None: 9 | tables = Tool(c).get_tables() 10 | assert 'noise' in tables, tables 11 | assert 'records' in tables, tables 12 | 13 | def cleanup(self, c) -> None: 14 | self.check(c) 15 | 16 | # if not finished it's gonna constantly change 17 | res = c.execute('DELETE FROM records WHERE finished = 0') 18 | assert res.rowcount <= 1, res.rowcount 19 | 20 | 21 | if __name__ == '__main__': 22 | Normaliser.main() 23 | -------------------------------------------------------------------------------- /src/bleanser/modules/smscalls.py: -------------------------------------------------------------------------------- 1 | from bleanser.core.modules.xml import Normaliser as XmlNormaliser 2 | 3 | 4 | class Normaliser(XmlNormaliser): 5 | MULTIWAY = True 6 | PRUNE_DOMINATED = True 7 | 8 | def cleanup(self, t): 9 | # volatile attributes 10 | del t.attrib['count'] 11 | del t.attrib['backup_date'] 12 | del t.attrib['backup_set'] 13 | return t 14 | 15 | 16 | if __name__ == '__main__': 17 | Normaliser.main() 18 | 19 | 20 | def test_smscalls() -> None: 21 | from bleanser.tests.common import skip_if_no_data 22 | 23 | skip_if_no_data() 24 | 25 | from bleanser.tests.common import TESTDATA, actions 26 | 27 | data = TESTDATA / 'smscalls' 28 | paths = sorted(data.glob('*.xml*')) 29 | 30 | res = actions(paths=paths, Normaliser=Normaliser) 31 | 32 | assert [p.name for p in res.remaining] == [ 33 | 'calls-20161211023623.xml', 34 | 'calls-20161218221620.xml', 35 | 'calls-20170308050001.xml', 36 | # 'calls-20170309065640.xml', 37 | 'calls-20170310063055.xml', 38 | # 'calls-20170311050001.xml', 39 | # 'calls-20170312050001.xml', 40 | # 'calls-20170313050001.xml', 41 | # 'calls-20170314051813.xml', 42 | 'calls-20170315050001.xml', 43 | 44 | # 'calls-20210901043042.xml', 45 | 'calls-20210902043044.xml', 46 | # 'calls-20210903043044.xml', 47 | # 'calls-20210904060930.xml', 48 | # 'calls-20210905043030.xml', 49 | # 'calls-20210906043031.xml', 50 | 'calls-20210907043032.xml', 51 | 'calls-20210908043032.xml', 52 | 53 | 'sms-20211008043028.xml', 54 | # 'sms-20211009043028.xml' 55 | 'sms-20211010043029.xml', 56 | # 'sms-20211011043029.xml', 57 | 'sms-20211012065557.xml', 58 | # 'sms-20211013043058.xml', 59 | # 'sms-20211014043058.xml', 60 | # 'sms-20211015043059.xml', 61 | # 'sms-20211016043059.xml', 62 | # 'sms-20211017043000.xml', 63 | # 'sms-20211018045758.xml', 64 | # 'sms-20211019043059.xml', 65 | # 'sms-20211020043100.xml', 66 | # 'sms-20211021043000.xml', 67 | # 'sms-20211022044756.xml', 68 | # 'sms-20211023043057.xml', 69 | # 'sms-20211024043057.xml', 70 | # 'sms-20211025043057.xml', 71 | # 'sms-20211026051803.xml', 72 | # 'sms-20211027043004.xml', 73 | # 'sms-20211028043004.xml', 74 | 'sms-20211029043004.xml', 75 | # 'sms-20211030043005.xml', 76 | # 'sms-20211031043005.xml', 77 | # 'sms-20211101043006.xml', 78 | # 'sms-20211102043006.xml', 79 | # 'sms-20211103043007.xml', 80 | # 'sms-20211104043007.xml', 81 | # 'sms-20211105102901.xml', 82 | # 'sms-20211106043002.xml', 83 | # 'sms-20211107043002.xml', 84 | # 'sms-20211108043003.xml', 85 | # 'sms-20211109043004.xml', 86 | 'sms-20211110043004.xml', 87 | ] # fmt: skip 88 | -------------------------------------------------------------------------------- /src/bleanser/modules/spotify.py: -------------------------------------------------------------------------------- 1 | from bleanser.core.modules.json import Json, JsonNormaliser, delkeys 2 | 3 | 4 | class Normaliser(JsonNormaliser): 5 | MULTIWAY = True 6 | PRUNE_DOMINATED = 
True 7 | 8 | def cleanup(self, j: Json) -> Json: 9 | delkeys( 10 | j, 11 | keys={ 12 | 'popularity', # flaky -- relative to other artists, not interesting 13 | 'album_type', # sometimes flaky between 'album' and 'compilation' 14 | ## flaky metadata (maybe not even worth backing up..) 15 | 'available_markets', 16 | 'images', 17 | 'total_episodes', 18 | 'preview_url', 19 | 'release_date', 20 | 'external_ids', 21 | ## 22 | 'snapshot_id', # present on playlists, basically hash 23 | }, 24 | ) 25 | 26 | if isinstance(j, list): 27 | # old format, I think this was just 'Liked' playlist 28 | return j 29 | 30 | ## 'flatten' to make it possible to properly diff 31 | playlists = j['playlists'] 32 | upd_playlists = [] 33 | for p in playlists: 34 | _pname = p['name'] 35 | if p['owner']['id'] == 'spotify': 36 | # these are typically autogenerated playlists like 37 | # - "This Is " artist playlists 38 | # - mix between two users 39 | # they change very often and no point keeping track of them 40 | continue 41 | pid = p['id'] 42 | j[f'playlist_{pid}_tracks'] = p['tracks'] 43 | upd_playlists.append(p) 44 | del p['tracks'] 45 | j['playlists'] = upd_playlists 46 | ## 47 | 48 | # TODO ugh. tbh, not sure what to do with recently_played -- api only allows recent 50? 49 | # so they are bound to change super often if you listen to music daily (+ you might even miss some tracks anyway) 50 | 51 | return j 52 | 53 | 54 | if __name__ == '__main__': 55 | Normaliser.main() 56 | -------------------------------------------------------------------------------- /src/bleanser/modules/spotifyexport.py: -------------------------------------------------------------------------------- 1 | from bleanser.core.modules.json import Json, JsonNormaliser, delkeys 2 | 3 | 4 | class Normaliser(JsonNormaliser): 5 | def cleanup(self, j: Json) -> None: 6 | ## these change for no reason, and probably no one cares about them 7 | delkeys( 8 | j, 9 | keys={ 10 | 'images', 11 | 'available_markets', 12 | 'popularity', 13 | 'preview_url', 14 | 'external_urls', 15 | 'total_episodes', 16 | }, 17 | ) 18 | ## 19 | 20 | # TODO hmm. it changes often... but then it's kind of a useful info.. 21 | # del j['recently_played'] 22 | 23 | 24 | if __name__ == '__main__': 25 | Normaliser.main() 26 | -------------------------------------------------------------------------------- /src/bleanser/modules/stackexchange.py: -------------------------------------------------------------------------------- 1 | from bleanser.core.modules.json import Json, JsonNormaliser, delkeys 2 | 3 | 4 | class Normaliser(JsonNormaliser): 5 | MULTIWAY = True 6 | PRUNE_DOMINATED = True 7 | 8 | def cleanup(self, j: Json) -> Json: 9 | delkeys( 10 | j, 11 | keys=[ 12 | ## these are change all the time, and I guess if you were interested in any 'real time' dynamics 13 | ## you wouldn't use periodic backups anyway, just write a proper polling tool 14 | ## especially considering they are cumulative, fine to prune out 15 | 'reputation', 16 | 'view_count', 17 | 'favorite_count', 18 | 'up_vote_count', 19 | 'down_vote_count', 20 | 'answer_count', 21 | 'score', 22 | ## 23 | ## 24 | 'reputation_change_week', 25 | 'reputation_change_month', 26 | 'reputation_change_quarter', 27 | 'reputation_change_year', 28 | 'profile_image', 29 | 'last_access_date', # last time user loggen in? very flaky 30 | ], 31 | ) 32 | 33 | ## 34 | # the json maps from 'domain' (e.g. 
math/english/apple) to the payload with various interesting data 35 | # so we wanna flatten it first 36 | nj = {} 37 | for domain, d in j.items(): 38 | for k, v in d.items(): 39 | nj[f'{domain}_{k}'] = v 40 | j = nj 41 | ## 42 | 43 | ## 44 | for k in list(j.keys()): 45 | if k.endswith('/privileges'): # useless crap, achievements/badges 46 | del j[k] 47 | ## 48 | return j 49 | 50 | 51 | if __name__ == '__main__': 52 | Normaliser.main() 53 | -------------------------------------------------------------------------------- /src/bleanser/modules/talon_android.py: -------------------------------------------------------------------------------- 1 | from bleanser.core.modules.sqlite import SqliteNormaliser, Tool 2 | 3 | 4 | class Normaliser(SqliteNormaliser): 5 | MULTIWAY = True 6 | PRUNE_DOMINATED = True 7 | 8 | def check(self, c) -> None: 9 | _tables = Tool(c).get_tables() 10 | # TODO add something later 11 | 12 | def cleanup(self, c) -> None: 13 | self.check(c) 14 | 15 | t = Tool(c) 16 | # for some reason flaking between en/en_US 17 | t.drop('android_metadata') 18 | 19 | 20 | if __name__ == '__main__': 21 | Normaliser.main() 22 | -------------------------------------------------------------------------------- /src/bleanser/modules/tiktok_android.py: -------------------------------------------------------------------------------- 1 | from bleanser.core.modules.sqlite import SqliteNormaliser, Tool 2 | 3 | 4 | class Normaliser(SqliteNormaliser): 5 | MULTIWAY = True 6 | PRUNE_DOMINATED = True 7 | 8 | ALLOWED_BLOBS = { 9 | ('msg', 'content_pb'), 10 | ('im_search_index_official_segments', '*'), 11 | ('im_search_index_official_segdir', '*'), 12 | ('im_search_index_official_docsize', '*'), 13 | ('im_search_index_official_stat', '*'), 14 | } 15 | 16 | def check(self, c) -> None: 17 | tables = Tool(c).get_tables() 18 | 19 | messages = tables['msg'] 20 | assert 'msg_uuid' in messages 21 | assert 'content' in messages 22 | 23 | def cleanup(self, c) -> None: 24 | self.check(c) 25 | 26 | 27 | if __name__ == '__main__': 28 | Normaliser.main() 29 | -------------------------------------------------------------------------------- /src/bleanser/modules/tinder_android.py: -------------------------------------------------------------------------------- 1 | from sqlite3 import Connection 2 | 3 | from bleanser.core.modules.sqlite import SqliteNormaliser, Tool 4 | 5 | 6 | class Normaliser(SqliteNormaliser): 7 | MULTIWAY = True 8 | PRUNE_DOMINATED = True 9 | 10 | def check(self, c: Connection) -> None: 11 | tool = Tool(c) 12 | tables = tool.get_tables() 13 | matches = tables['match'] 14 | assert 'person_id' in matches, matches 15 | 16 | messages = tables['message'] 17 | assert 'text' in messages, messages 18 | assert 'match_id' in messages, messages 19 | 20 | def cleanup(self, c: Connection) -> None: 21 | self.check(c) 22 | 23 | t = Tool(c) 24 | 25 | t.drop( 26 | 'instagram_broken', 27 | 'explore_attribution', 28 | # 29 | ## messages from Tinder itself 30 | 'inbox_message', 31 | 'inbox_message_images', 32 | 'inbox_message_text_formatting', 33 | ## 34 | ) 35 | 36 | # eh, don't think it impacts anyway 37 | # t.drop('contextual_match') 38 | # it contains some photos? dunno 39 | 40 | # some odd id that increases with no impact for other data 41 | t.drop_cols(table='profile_media', cols=['client_sequential_id']) 42 | 43 | t.drop_cols(table='match_seen_state', cols=['match_id', 'last_message_seen_id']) 44 | 45 | t.drop('match_your_turn_state') 46 | 47 | # TODO profile_descriptor?? 
blob containing presumably profile info, and sometimes jumps quite a bit 48 | 49 | # this one contributes to _a lot_ of changes, like 40% 50 | # and I guess if we properly wanted to track when app was activated, we'd need a different mechanism anyway 51 | t.drop('last_activity_date') 52 | 53 | # hmm what is match_harassing_message?? 54 | 55 | # TODO not sure about this? 56 | # t.drop_cols('match', cols=[ 57 | # 'last_activity_date', 58 | # ]) 59 | 60 | # TODO profile_descriptor changes quite a lot? not sure 61 | 62 | # match->last_activity_date -- hmmm changing quite a bit? is it interesting? not sure 63 | # 64 | # message->is_liked -- not sure if worth keeping... only for finding out the first change? 65 | # 66 | # match_read_receipt -- what is it?? 67 | # match_id last_seen_message_id seen_timestamp 68 | # seems that last last_seen_message_id can be restored from messages table... but seen_timestamp is unique? 69 | 70 | # NOTE: for 'extract' mode 71 | # match->is_blocked 72 | 73 | 74 | if __name__ == '__main__': 75 | Normaliser.main() 76 | 77 | 78 | def test_tinder() -> None: 79 | from bleanser.tests.common import skip_if_no_data 80 | 81 | skip_if_no_data() 82 | 83 | from bleanser.tests.common import TESTDATA, actions2 84 | 85 | res = actions2(path=TESTDATA / 'tinder_android', rglob='**/*.db*', Normaliser=Normaliser) 86 | 87 | assert res.remaining == [ 88 | '20210523193545/tinder-3.db', # keep, first in group 89 | # '20210916214349/tinder-3.db', # MOVE 90 | # '20210916223254/tinder-3.db', # MOVE 91 | '20210916232749/tinder-3.db', # keep, some likes changes etc 92 | '20210917004827/tinder-3.db', 93 | '20210917014719/tinder-3.db', 94 | # '20210917015444/tinder-3.db', 95 | # '20210917031235/tinder-3.db', # MOVE 96 | '20210917060029/tinder-3.db', 97 | 98 | 99 | '20211007060802/tinder-3.db', # keep, first in group 100 | # '20211007090109/tinder-3.db', 101 | # '20211007094056/tinder-3.db', 102 | # '20211007115318/tinder-3.db', 103 | # '20211007133114/tinder-3.db', 104 | # '20211007143940/tinder-3.db', 105 | # '20211007155908/tinder-3.db', 106 | '20211007165243/tinder-3.db', 107 | '20211007180708/tinder-3.db', # keep, bio changed 108 | 109 | '20211225050314/tinder-3.db', # keep: first in group 110 | # '20211225193930/tinder-3.db', 111 | # '20211226052237/tinder-3.db', 112 | # '20211226091116/tinder-3.db', 113 | # '20211226135158/tinder-3.db', 114 | # '20211227002918/tinder-3.db', 115 | # '20211227044403/tinder-3.db', 116 | '20211227145813/tinder-3.db', # keep: last in group 117 | ] # fmt: skip 118 | -------------------------------------------------------------------------------- /src/bleanser/modules/twitter_android.py: -------------------------------------------------------------------------------- 1 | """ 2 | Normalises data for official twitter Android app 3 | """ 4 | 5 | from bleanser.core.modules.sqlite import SqliteNormaliser, Tool 6 | 7 | 8 | class Normaliser(SqliteNormaliser): 9 | MULTIWAY = True 10 | PRUNE_DOMINATED = True 11 | 12 | def check(self, c) -> None: 13 | tables = Tool(c).get_tables() 14 | 15 | statuses = tables['statuses'] 16 | assert 'status_id' in statuses 17 | assert 'content' in statuses 18 | 19 | [(total_statuses,)] = c.execute('SELECT COUNT(*) FROM statuses') 20 | assert total_statuses > 10 # sanity check 21 | 22 | [(statuses_without_content,)] = c.execute('SELECT COUNT(*) FROM statuses WHERE content IS NULL') 23 | # another sanity check -- to make sure the content is actually stored in this column and not lost during migrations 24 | assert statuses_without_content == 
0 25 | 26 | _timeline = tables['timeline'] 27 | 28 | def cleanup(self, c) -> None: 29 | self.check(c) 30 | 31 | t = Tool(c) 32 | 33 | # some sort of crappy analytics -- A LOT of it 34 | # I actually suspect it's the bulk of this database? removing it makes cleanup considerably faster 35 | t.drop('feedback_action') 36 | t.drop('timeline_feedback_actions') 37 | 38 | t.drop('promoted_retry') 39 | 40 | t.drop('card_state') # only has a couple of rows which are always changing.. some policy crap 41 | 42 | t.drop('status_groups') # doesn't looks like anything interesting, contains read state? 43 | 44 | t.drop('retweets') # seems like it contains last retweet for each tweet or something.. doesn't actually have tweet data 45 | 46 | t.drop('tokens') # some internal thing 47 | 48 | t.drop_cols( 49 | 'statuses', 50 | cols=[ 51 | '_id', # internal id 52 | ## volatile 53 | 'favorite_count', 54 | 'retweet_count', 55 | 'view_count_info', 56 | 'reply_count', 57 | 'bookmark_count', 58 | 'quote_count', 59 | 'tweet_source', # sometimes NULL at first? 60 | 'flags', 61 | 'self_thread_id', 62 | 'edit_control', # no idea what it is 63 | 'unmention_info', # no idea, some binary crap (not even text) 64 | 'quick_promote_eligibility', 65 | 'quoted_status_permalink', 66 | 'conversation_control', 67 | ## 68 | # 69 | 'r_ent_content', # contains same data as 'content' 70 | # 71 | # cards contain some extra data embedded from the website (e.g. preview) 72 | # might be actually useful to extract data from it 73 | 'card', 74 | 'unified_card', 75 | ], 76 | ) 77 | 78 | # NOTE: in principle tweet data is all in statues table 79 | # but we need timeline to reconstruct some feeds (e.g. users own tweets) 80 | t.drop_cols( 81 | 'timeline', 82 | cols=[ 83 | '_id', # internal id 84 | ## volatile 85 | 'is_read', 86 | 'sort_index', 87 | 'timeline_chunk_id', 88 | 'updated_at', 89 | 'scribe_content', # some "for you" crap 90 | 'created_at', # internal created at, not tweet's 91 | 'feedback_action_prompts', 92 | 'social_context', 93 | 'is_linger_impressed', 94 | 'dismissed', 95 | ## 96 | ], 97 | ) 98 | 99 | c.execute(''' 100 | DELETE FROM timeline 101 | WHERE entity_group_id LIKE "%cursor%" 102 | OR entity_group_id LIKE "%who-to-follow%" 103 | OR entity_group_id LIKE "%trends%" 104 | OR entity_group_id LIKE "%semantic%" 105 | OR entity_group_id LIKE "%promoted%" 106 | OR entity_group_id LIKE "%home-conversation%" 107 | OR entity_group_id LIKE "%notification%" 108 | OR entity_id LIKE "%trends%" 109 | OR entity_id LIKE "%superhero%" 110 | ''') 111 | 112 | # after all decided to drop 'timeline' completely.. all actual data is in statuses table anyway 113 | # - the vast majority of volatile entrites in it are type == 17 (not sure what it is) 114 | # - it also contains non-user timelines (e.g. when you open someone's profile in twitter app) 115 | t.drop('timeline') 116 | 117 | t.drop('users') # they change all the time and probs not worth keeping all changes 118 | 119 | ## they are empty most of the time? sometimes contains an odd item for some reason 120 | t.drop('user_groups') 121 | t.drop('user_metadata') 122 | ## 123 | 124 | def remove_volatile_content(s): 125 | if s is None: 126 | return None 127 | xxx = s.find(bytes.fromhex('00695858583869306938306938496a')) 128 | if xxx == -1: 129 | return s 130 | else: 131 | return s[:xxx] 132 | # if b'movie trailer' in s: 133 | print(s.hex(), type(s)) 134 | return s 135 | 136 | # ugh... a few tweets sometimes have some binary changes?? 137 | # also this doesn't seem to solve everything sadly.. 
so for now commenting 138 | # c.create_function('REMOVE_VOLATILE_CONTENT', 1, remove_volatile_content) 139 | # list(c.execute('UPDATE statuses SET content = REMOVE_VOLATILE_CONTENT(content)')) 140 | 141 | # so it's a bit shit, but content shouldn't really change, and seems too hard to filter out these changes in binary blobs here 142 | # except edited tweets? but I have a feeling editing is controlled by timeline.updated or something 143 | # either way it would be so rare it will likely be caught collaterally by other data changes 144 | c.execute("UPDATE statuses SET content = X'BABABA'") 145 | 146 | 147 | if __name__ == '__main__': 148 | Normaliser.main() 149 | -------------------------------------------------------------------------------- /src/bleanser/modules/vk_android.py: -------------------------------------------------------------------------------- 1 | from bleanser.core.modules.sqlite import SqliteNormaliser, Tool 2 | 3 | 4 | class Normaliser(SqliteNormaliser): 5 | MULTIWAY = True 6 | PRUNE_DOMINATED = True 7 | 8 | # even though we drop some of these, currently columns are dropped via erasing the content, not altering table 9 | # so need to keep here too 10 | ALLOWED_BLOBS = { 11 | ('channel_messages', 'attach'), 12 | 13 | ('messages', 'avatar'), 14 | ('messages', 'attach'), 15 | ('messages', 'carousel'), 16 | ('messages', 'nested'), 17 | ('messages', 'keyboard_buttons'), 18 | 19 | ('users', 'avatar'), 20 | ('users', 'image_status'), 21 | ('contacts', 'avatar'), 22 | ('groups', 'avatar'), 23 | 24 | ('dialogs', 'bar_buttons'), 25 | ('dialogs', 'chat_settings_members_active'), 26 | ('dialogs', 'chat_settings_admins'), 27 | ('dialogs', 'chat_settings_avatar'), 28 | ('dialogs', 'draft_msg'), 29 | ('dialogs', 'expire_msg_vk_ids'), 30 | ('dialogs', 'group_call_participants'), 31 | ('dialogs', 'keyboard_buttons'), 32 | ('dialogs', 'pinned_msg_attaches'), 33 | ('dialogs', 'pinned_msg_nested'), 34 | ('dialogs', 'pinned_carousel'), 35 | ('dialogs', 'unread_mention_msg_vk_ids'), 36 | 37 | ('mutual_friends', 'mutual_friends_ids'), 38 | } # fmt: skip 39 | 40 | def is_vkim(self, c) -> bool: 41 | tables = Tool(c).get_tables() 42 | if 'messages' in tables: 43 | return True 44 | else: 45 | # otherwise must be vk.db 46 | return False 47 | 48 | def check(self, c) -> None: 49 | tables = Tool(c).get_tables() 50 | if self.is_vkim(c): 51 | msgs = tables['messages'] 52 | assert 'vk_id' in msgs, msgs 53 | assert 'time' in msgs, msgs 54 | 55 | dialogs = tables['dialogs'] 56 | assert 'id' in dialogs, dialogs 57 | else: 58 | users = tables['users'] 59 | assert 'uid' in users, users 60 | assert 'firstname' in users, users 61 | 62 | def cleanup_vk_db(self, c) -> None: 63 | t = Tool(c) 64 | t.drop(table='friends_hints_order') 65 | t.drop_cols( 66 | table='users', 67 | cols=[ 68 | # TODO hmm lately (202309), is_friend seems to be flaky for no reason? even where there are no status changes 69 | 'last_updated', 70 | 'photo_small', 71 | 'lists', # very flaky for some reason, sometimes just flips to 0?? 
72 | 'name_r', # seems derived from first/last name, and is very flaky 73 | ], 74 | ) 75 | 76 | def cleanup(self, c) -> None: 77 | self.check(c) # todo could also call 'check' after just in case 78 | 79 | if not self.is_vkim(c): 80 | self.cleanup_vk_db(c) 81 | return 82 | 83 | t = Tool(c) 84 | 85 | for table in [ 86 | 'peers_search_content', 87 | 'peers_search_segments', 88 | 'peers_search_segdir', 89 | 'peers_search_docsize', 90 | 'peers_search_stat', 91 | 'messages_search_segments', 92 | 'messages_search_segdir', 93 | 'messages_search_docsize', 94 | 'messages_search_stat', 95 | 'messages_search_content', 96 | # 97 | 'key_value', # nothing interesting here 98 | 'integer_generator', # lol 99 | # 100 | ## no data, just some internal tracking 101 | 'dialogs_history_count', 102 | 'dialogs_history_meta', 103 | 'dialog_weight', 104 | ## 105 | ]: 106 | t.drop(table=table) 107 | 108 | t.drop_cols( 109 | table='users', 110 | cols=[ 111 | 'avatar', # flaky and no point tracking really 112 | 'image_status', 113 | ## flaky timestamps 114 | 'sync_time_overall', 115 | 'sync_time_online', 116 | 'online_last_seen', 117 | ## 118 | 'online_app_id', 119 | 'online_type', 120 | ], 121 | ) 122 | 123 | t.drop_cols( 124 | table='contacts', 125 | cols=[ 126 | 'avatar', 127 | 'sync_time', # flaky 128 | 'last_seen_status', # flaky 129 | ], 130 | ) 131 | 132 | t.drop_cols( 133 | table='dialogs', 134 | cols=[ 135 | 'sort_id_server', 136 | 'sort_id_local', 137 | 'weight', 138 | 'read_till_in_msg_vk_id', 139 | 'read_till_out_msg_vk_id', 140 | 'last_msg_vk_id', 141 | 'read_till_in_msg_vk_id_local', 142 | 'read_till_in_msg_cnv_id', 143 | 'read_till_out_msg_cnv_id', 144 | 'last_msg_cnv_id', 145 | 'count_unread', 146 | 'count_unread_local', 147 | 'keyboard_visible', 148 | 'draft_msg', 149 | 'bar_name', 150 | 'bar_exists', 151 | 'bar_buttons', 152 | 'bar_text', 153 | 'bar_icon', 154 | ], 155 | ) 156 | 157 | t.drop_cols( 158 | table='messages', 159 | cols=[ 160 | ## seems flaky -- not sure why, hard to tell since it's a binary blob 161 | 'attach', 162 | 'nested', 163 | ## 164 | 'phase_id', # not sure what is it, some internal stuff 165 | ], 166 | ) 167 | 168 | t.drop_cols( 169 | table='groups', 170 | cols=[ 171 | 'avatar', 172 | 'sync_time', 173 | 'members_count', 174 | ], 175 | ) 176 | 177 | 178 | if __name__ == '__main__': 179 | Normaliser.main() 180 | -------------------------------------------------------------------------------- /src/bleanser/modules/xml_clean.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | from bleanser.core.modules.xml import * # noqa: F401, F403 4 | 5 | warnings.warn("Module 'bleanser.modules.xml_clean' is deprecated. 
Use 'bleanser.core.modules.xml_clean' instead.", DeprecationWarning) 6 | 7 | 8 | if __name__ == '__main__': 9 | Normaliser.main() # noqa: F405 10 | -------------------------------------------------------------------------------- /src/bleanser/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/karlicoss/bleanser/418c361ee05621af2b1553d40a3618f2cf98b323/src/bleanser/py.typed -------------------------------------------------------------------------------- /src/bleanser/tests/common.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | from contextlib import contextmanager 5 | from dataclasses import dataclass 6 | from pathlib import Path 7 | 8 | import pytest 9 | 10 | TESTDATA = Path(__file__).absolute().parent / 'testdata' 11 | 12 | 13 | def skip_if_no_data() -> None: 14 | if 'CI' in os.environ and not TESTDATA.exists(): 15 | pytest.skip('test only works on @karlicoss private data for now') 16 | 17 | 18 | @dataclass 19 | class Res: 20 | pruned: list[Path] 21 | remaining: list[Path] 22 | 23 | 24 | def actions(*, paths: list[Path], Normaliser, threads: int | None = None) -> Res: 25 | from bleanser.core.common import Keep, Prune 26 | from bleanser.core.processor import compute_instructions 27 | 28 | instructions = list(compute_instructions(paths, Normaliser=Normaliser, threads=threads)) 29 | pruned = [] 30 | remaining = [] 31 | for i in instructions: 32 | if isinstance(i, Prune): 33 | pruned.append(i.path) 34 | elif isinstance(i, Keep): 35 | remaining.append(i.path) 36 | else: 37 | raise RuntimeError(type(i)) 38 | return Res(pruned=pruned, remaining=remaining) 39 | 40 | 41 | @dataclass 42 | class Res2: 43 | pruned: list[str] 44 | remaining: list[str] 45 | 46 | 47 | def actions2(*, path: Path, rglob: str, Normaliser, threads: int | None = None) -> Res2: 48 | from bleanser.core.main import _get_paths 49 | 50 | pp = str(path) + os.sep + rglob 51 | paths = _get_paths(path=pp, glob=True, from_=None, to=None) 52 | res = actions(paths=paths, Normaliser=Normaliser, threads=threads) 53 | pruned = res.pruned 54 | remaining = res.remaining 55 | return Res2( 56 | pruned =[str(c.relative_to(path)) for c in pruned ], 57 | remaining=[str(c.relative_to(path)) for c in remaining], 58 | ) # fmt: skip 59 | 60 | 61 | @contextmanager 62 | def hack_attribute(Normaliser, key, value): 63 | prev = getattr(Normaliser, key) 64 | try: 65 | # TODO meh.. maybe instead instantiate an instance instead of class? 66 | setattr(Normaliser, key, value) 67 | yield 68 | finally: 69 | setattr(Normaliser, key, prev) 70 | -------------------------------------------------------------------------------- /src/bleanser/tests/test_binary.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pathlib import Path 4 | 5 | import pytest 6 | 7 | from bleanser.modules.binary import Normaliser 8 | from bleanser.tests.common import TESTDATA, actions, hack_attribute, skip_if_no_data 9 | 10 | # TODO ugh. how to make relative imports work? pytest doesn't like them... 11 | 12 | 13 | def via_fdupes(path: Path) -> list[str]: 14 | from subprocess import check_output 15 | 16 | lines = check_output(['fdupes', '-1', path]).decode('utf8').splitlines() 17 | to_delete = [] 18 | for line in lines: 19 | items = line.split() 20 | # meh... don't get why it's not processing them in order... 
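        # fdupes output isn't necessarily in filename order, so sort first;
        # keeping the lexicographically first and last file of each duplicate group
        # (and marking everything in between for deletion) is what the assertions
        # below compare against bleanser's own pruning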
21 | items = sorted(items) 22 | to_delete.extend(items[1:-1]) 23 | return sorted(to_delete) 24 | 25 | 26 | # TODO maybe add some sanity checks? 27 | # e.g. try guessing dates from filenames and making sure they are consistent with mtimes? 28 | # todo need to resort removing to a single command 29 | # and check 'remove' mode separately 30 | @pytest.mark.parametrize( 31 | 'data', 32 | [ 33 | TESTDATA / 'instapaper', 34 | TESTDATA / 'hypothesis_xz', 35 | ], 36 | ) 37 | def test_all(data: Path) -> None: 38 | skip_if_no_data() 39 | 40 | paths = sorted(data.glob('*.json*')) 41 | assert len(paths) > 20, paths # precondition 42 | 43 | with hack_attribute(Normaliser, '_DIFF_FILTER', None): 44 | res = actions(paths=paths, Normaliser=Normaliser) 45 | 46 | expected_deleted = [Path(p) for p in via_fdupes(path=data)] 47 | assert res.pruned == expected_deleted 48 | 49 | 50 | # FIXME hmm need to make sure --dry is the default (maybe add a cmdline test?) 51 | -------------------------------------------------------------------------------- /src/bleanser/tests/test_hypothesis.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | from bleanser.core.modules.json import JsonNormaliser as Normaliser 6 | from bleanser.tests.common import TESTDATA, actions, hack_attribute, skip_if_no_data 7 | 8 | data = TESTDATA / 'hypothesis' 9 | 10 | 11 | # total time about 5s? 12 | @pytest.mark.parametrize('num', range(10)) 13 | def test_normalise_one(tmp_path: Path, num: int) -> None: # noqa: ARG001 14 | skip_if_no_data() 15 | 16 | path = data / 'hypothesis_20210625T220028Z.json' 17 | n = Normaliser(original=path, base_tmp_dir=tmp_path) 18 | with n.do_normalise(): 19 | pass 20 | 21 | 22 | # TODO less verbose mode for tests? 23 | def test_all() -> None: 24 | skip_if_no_data() 25 | 26 | # todo share with main 27 | paths = sorted(data.glob('*.json')) 28 | assert len(paths) > 80, paths # precondition 29 | 30 | # 4 workers: 64 seconds 31 | # 4 workers, pool for asdict: 42 seconds.. 32 | # 2 workers: 81 seconds. 
hmmm 33 | with hack_attribute(Normaliser, key='PRUNE_DOMINATED', value=True): 34 | res = actions(paths=paths, Normaliser=Normaliser, threads=4) 35 | remaining = {p.name for p in res.remaining} 36 | assert 0 < len(remaining) < len(paths), remaining # sanity check 37 | 38 | assert { 39 | 'hypothesis_2017-11-21.json', 40 | 'hypothesis_2019-06-11.json', 41 | 'hypothesis_2019-08-18.json', 42 | 'hypothesis_20190923T003014Z.json', 43 | 'hypothesis_20191216T123012Z.json', 44 | 'hypothesis_20200325T140016Z.json', 45 | 'hypothesis_20200720T140043Z.json', 46 | 'hypothesis_20200828T123032Z.json', 47 | 'hypothesis_20201012T140035Z.json', 48 | 'hypothesis_20210223T213023Z.json', 49 | 'hypothesis_20210625T220028Z.json', 50 | }.issubset(remaining), remaining 51 | # issubset because concurrency might end up in leaving more files than the absolute minimum 52 | 53 | assert len(remaining) < 30, remaining 54 | 55 | 56 | # FIXME check move mode 57 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | minversion = 3.21 3 | # relies on the correct version of Python installed 4 | envlist = ruff,tests,mypy,mypy-hpi 5 | # https://github.com/tox-dev/tox/issues/20#issuecomment-247788333 6 | # hack to prevent .tox from crapping to the project directory 7 | toxworkdir = {env:TOXWORKDIR_BASE:}{toxinidir}/.tox 8 | 9 | [testenv] 10 | # TODO how to get package name from setuptools? 11 | package_name = "bleanser" 12 | pass_env = 13 | # useful for tests to know they are running under ci 14 | CI 15 | CI_* 16 | # respect user's cache dirs to prevent tox from crapping into project dir 17 | PYTHONPYCACHEPREFIX 18 | MYPY_CACHE_DIR 19 | RUFF_CACHE_DIR 20 | 21 | set_env = 22 | # do not add current working directory to pythonpath 23 | # generally this is more robust and safer, prevents weird issues later on 24 | PYTHONSAFEPATH=1 25 | 26 | # default is 'editable', in which tox builds wheel first for some reason? not sure if makes much sense 27 | package = uv-editable 28 | 29 | 30 | [testenv:ruff] 31 | skip_install = true 32 | dependency_groups = testing 33 | commands = 34 | {envpython} -m ruff check \ 35 | {posargs} 36 | 37 | 38 | [testenv:tests] 39 | dependency_groups = testing 40 | extras = 41 | extra 42 | json 43 | xml 44 | commands = 45 | # posargs allow test filtering, e.g. tox ... -- -k test_name 46 | {envpython} -m pytest \ 47 | --pyargs {[testenv]package_name} --ignore-glob 'src/bleanser/modules/hpi/*' \ 48 | {posargs} 49 | 50 | 51 | [testenv:mypy] 52 | dependency_groups = testing 53 | extras = 54 | extra 55 | json 56 | xml 57 | commands = 58 | {envpython} -m mypy --no-install-types \ 59 | # note: hpi modules are tested below 60 | -p {[testenv]package_name} --exclude 'hpi/*' \ 61 | # txt report is a bit more convenient to view on CI 62 | --txt-report .coverage.mypy \ 63 | --html-report .coverage.mypy \ 64 | {posargs} 65 | 66 | 67 | [testenv:mypy-hpi] 68 | dependency_groups = testing 69 | extras = 70 | extra 71 | json 72 | xml 73 | HPI 74 | commands = 75 | {envpython} -m mypy --no-install-types \ 76 | -p {[testenv]package_name}.modules.hpi \ 77 | # txt report is a bit more convenient to view on CI 78 | --txt-report .coverage.mypy-hpi \ 79 | --html-report .coverage.mypy-hpi \ 80 | {posargs} 81 | --------------------------------------------------------------------------------
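For reference, a minimal sketch (not part of the repository) of driving one of the normaliser modules programmatically, following the same pattern as the helpers in src/bleanser/tests/common.py; the backup directory below is hypothetical:

from pathlib import Path

from bleanser.core.common import Keep, Prune
from bleanser.core.processor import compute_instructions
from bleanser.modules.smscalls import Normaliser

# hypothetical folder with periodic sms/calls XML backups
paths = sorted(Path('/backups/smscalls').glob('*.xml*'))

# compute_instructions yields Keep/Prune decisions for the input snapshots
for instruction in compute_instructions(paths, Normaliser=Normaliser, threads=2):
    if isinstance(instruction, Prune):
        print('would prune:', instruction.path)
    elif isinstance(instruction, Keep):
        print('would keep: ', instruction.path)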