├── .ci ├── release ├── release-uv └── run ├── .github └── workflows │ └── main.yml ├── .gitignore ├── LICENSE ├── README.md ├── conftest.py ├── doall ├── doc └── options.md ├── mypy.ini ├── old ├── README.md ├── common.py ├── foursquare.py ├── jq_normaliser.py ├── json.py ├── json2.py └── reddit.py ├── pyproject.toml ├── pytest.ini ├── ruff.toml ├── scripts └── apt.sh ├── src └── bleanser │ ├── core │ ├── __init__.py │ ├── __main__.py │ ├── common.py │ ├── compat.py │ ├── ext │ │ ├── dummy_executor.py │ │ ├── logging.py │ │ └── sqlite_dumben.py │ ├── main.py │ ├── modules │ │ ├── extract.py │ │ ├── json.py │ │ ├── sqlite.py │ │ ├── tests │ │ │ └── sqlite.py │ │ └── xml.py │ ├── processor.py │ ├── sqlite.py │ └── utils.py │ ├── modules │ ├── antennapod_android.py │ ├── binary.py │ ├── bluemaestro.py │ ├── bumble_android.py │ ├── chrome.py │ ├── fbmessenger_android.py │ ├── firefox.py │ ├── foursquare.py │ ├── ghexport.py │ ├── goodreads.py │ ├── hinge_android.py │ ├── hpi │ │ ├── fbmessenger_android.py │ │ ├── twitter_android.py │ │ └── whatsapp_android.py │ ├── instagram_android.py │ ├── json_new.py │ ├── kobo.py │ ├── lastfm.py │ ├── monzo.py │ ├── pinboard.py │ ├── pocket.py │ ├── podcastaddict_android.py │ ├── reddit.py │ ├── rescuetime.py │ ├── rescuetime_android.py │ ├── skype_android.py │ ├── sleepasandroid_android.py │ ├── smscalls.py │ ├── spotify.py │ ├── spotifyexport.py │ ├── stackexchange.py │ ├── talon_android.py │ ├── tiktok_android.py │ ├── tinder_android.py │ ├── twitter_android.py │ ├── vk_android.py │ ├── whatsapp_android.py │ └── xml_clean.py │ ├── py.typed │ └── tests │ ├── common.py │ ├── test_binary.py │ └── test_hypothesis.py └── tox.ini /.ci/release: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | ''' 3 | Run [[file:scripts/release][scripts/release]] to deploy Python package onto [[https://pypi.org][PyPi]] and [[https://test.pypi.org][test PyPi]]. 4 | 5 | The script expects =TWINE_PASSWORD= environment variable to contain the [[https://pypi.org/help/#apitoken][PyPi token]] (not the password!). 6 | 7 | The script can be run manually. 8 | It's also running as =pypi= job in [[file:.github/workflows/main.yml][Github Actions config]]. Packages are deployed on: 9 | - every master commit, onto test pypi 10 | - every new tag, onto production pypi 11 | 12 | You'll need to set =TWINE_PASSWORD= and =TWINE_PASSWORD_TEST= in [[https://help.github.com/en/actions/configuring-and-managing-workflows/creating-and-storing-encrypted-secrets#creating-encrypted-secrets][secrets]] 13 | for Github Actions deployment to work. 
14 | ''' 15 | 16 | import os 17 | import sys 18 | from pathlib import Path 19 | from subprocess import check_call 20 | import shutil 21 | 22 | is_ci = os.environ.get('CI') is not None 23 | 24 | def main() -> None: 25 | import argparse 26 | p = argparse.ArgumentParser() 27 | p.add_argument('--test', action='store_true', help='use test pypi') 28 | args = p.parse_args() 29 | 30 | extra = [] 31 | if args.test: 32 | extra.extend(['--repository', 'testpypi']) 33 | 34 | root = Path(__file__).absolute().parent.parent 35 | os.chdir(root) # just in case 36 | 37 | if is_ci: 38 | # see https://github.com/actions/checkout/issues/217 39 | check_call('git fetch --prune --unshallow'.split()) 40 | 41 | dist = root / 'dist' 42 | if dist.exists(): 43 | shutil.rmtree(dist) 44 | 45 | check_call(['python3', '-m', 'build']) 46 | 47 | TP = 'TWINE_PASSWORD' 48 | password = os.environ.get(TP) 49 | if password is None: 50 | print(f"WARNING: no {TP} passed", file=sys.stderr) 51 | import pip_secrets 52 | password = pip_secrets.token_test if args.test else pip_secrets.token # meh 53 | 54 | check_call([ 55 | 'python3', '-m', 'twine', 56 | 'upload', *dist.iterdir(), 57 | *extra, 58 | ], env={ 59 | 'TWINE_USERNAME': '__token__', 60 | TP: password, 61 | **os.environ, 62 | }) 63 | 64 | 65 | if __name__ == '__main__': 66 | main() 67 | -------------------------------------------------------------------------------- /.ci/release-uv: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | ''' 3 | Deploys Python package onto [[https://pypi.org][PyPi]] or [[https://test.pypi.org][test PyPi]]. 4 | 5 | - running manually 6 | 7 | You'll need =UV_PUBLISH_TOKEN= env variable 8 | 9 | - running on Github Actions 10 | 11 | Instead of env variable, relies on configuring github as Trusted publisher (https://docs.pypi.org/trusted-publishers/) -- both for test and regular pypi 12 | 13 | It's running as =pypi= job in [[file:.github/workflows/main.yml][Github Actions config]]. 14 | Packages are deployed on: 15 | - every master commit, onto test pypi 16 | - every new tag, onto production pypi 17 | ''' 18 | 19 | UV_PUBLISH_TOKEN = 'UV_PUBLISH_TOKEN' 20 | 21 | import argparse 22 | import os 23 | import shutil 24 | from pathlib import Path 25 | from subprocess import check_call 26 | 27 | is_ci = os.environ.get('CI') is not None 28 | 29 | def main() -> None: 30 | p = argparse.ArgumentParser() 31 | p.add_argument('--use-test-pypi', action='store_true') 32 | args = p.parse_args() 33 | 34 | publish_url = ['--publish-url', 'https://test.pypi.org/legacy/'] if args.use_test_pypi else [] 35 | 36 | root = Path(__file__).absolute().parent.parent 37 | os.chdir(root) # just in case 38 | 39 | if is_ci: 40 | # see https://github.com/actions/checkout/issues/217 41 | check_call('git fetch --prune --unshallow'.split()) 42 | 43 | # TODO ok, for now uv won't remove dist dir if it already exists 44 | # https://github.com/astral-sh/uv/issues/10293 45 | dist = root / 'dist' 46 | if dist.exists(): 47 | shutil.rmtree(dist) 48 | 49 | # todo what is --force-pep517? 
50 | check_call(['uv', 'build']) 51 | 52 | if not is_ci: 53 | # CI relies on trusted publishers so doesn't need env variable 54 | assert UV_PUBLISH_TOKEN in os.environ, f'no {UV_PUBLISH_TOKEN} passed' 55 | 56 | check_call(['uv', 'publish', *publish_url]) 57 | 58 | 59 | if __name__ == '__main__': 60 | main() 61 | -------------------------------------------------------------------------------- /.ci/run: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eu 3 | 4 | cd "$(dirname "$0")" 5 | cd .. # git root 6 | 7 | if ! command -v sudo; then 8 | # CI or Docker sometimes doesn't have it, so useful to have a dummy 9 | function sudo { 10 | "$@" 11 | } 12 | fi 13 | 14 | # --parallel-live to show outputs while it's running 15 | tox_cmd='run-parallel --parallel-live' 16 | if [ -n "${CI-}" ]; then 17 | # install OS specific stuff here 18 | case "$OSTYPE" in 19 | darwin*) 20 | # macos 21 | brew install libmagic # for python-magic 22 | brew install diffutils # for GNU diff 23 | ;; 24 | cygwin* | msys* | win*) 25 | # windows 26 | # ugh. parallel stuff seems super flaky under windows, some random failures, "file used by other process" and crap like that 27 | tox_cmd='run' 28 | ;; 29 | *) 30 | # must be linux? 31 | : 32 | ;; 33 | esac 34 | fi 35 | 36 | # NOTE: expects uv installed 37 | uv tool run --with tox-uv tox $tox_cmd "$@" 38 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | # see https://github.com/karlicoss/pymplate for up-to-date reference 2 | 3 | name: CI 4 | on: 5 | push: 6 | branches: '*' 7 | tags: 'v[0-9]+.*' # only trigger on 'release' tags for PyPi 8 | # Ideally I would put this in the pypi job... but github syntax doesn't allow for regexes there :shrug: 9 | pull_request: # needed to trigger on others' PRs 10 | # Note that people who fork it need to go to "Actions" tab on their fork and click "I understand my workflows, go ahead and enable them". 11 | workflow_dispatch: # needed to trigger workflows manually 12 | # todo cron? 
13 | inputs: 14 | debug_enabled: 15 | type: boolean 16 | description: 'Run the build with tmate debugging enabled (https://github.com/marketplace/actions/debugging-with-tmate)' 17 | required: false 18 | default: false 19 | 20 | 21 | jobs: 22 | build: 23 | strategy: 24 | fail-fast: false 25 | matrix: 26 | platform: [ubuntu-latest, macos-latest] # , windows-latest] 27 | python-version: ['3.9', '3.10', '3.11', '3.12', '3.13'] 28 | exclude: [ 29 | # min and max version is enough for osx running (it's kinda slow) 30 | {platform: macos-latest, python-version: '3.10'}, 31 | {platform: macos-latest, python-version: '3.11'}, 32 | {platform: macos-latest, python-version: '3.12'}, 33 | ] 34 | runs-on: ${{ matrix.platform }} 35 | 36 | # useful for 'optional' pipelines 37 | # continue-on-error: ${{ matrix.platform == 'windows-latest' }} 38 | 39 | steps: 40 | # ugh https://github.com/actions/toolkit/blob/main/docs/commands.md#path-manipulation 41 | - run: echo "$HOME/.local/bin" >> $GITHUB_PATH 42 | 43 | - uses: actions/checkout@v4 44 | with: 45 | submodules: recursive 46 | fetch-depth: 0 # nicer to have all git history when debugging/for tests 47 | 48 | - uses: actions/setup-python@v5 49 | with: 50 | python-version: ${{ matrix.python-version }} 51 | 52 | - uses: astral-sh/setup-uv@v5 53 | with: 54 | enable-cache: false # we don't have lock files, so can't use them as cache key 55 | 56 | - uses: mxschmitt/action-tmate@v3 57 | if: ${{ github.event_name == 'workflow_dispatch' && inputs.debug_enabled }} 58 | 59 | # explicit bash command is necessary for Windows CI runner, otherwise it thinks it's cmd... 60 | - run: bash .ci/run 61 | 62 | - if: matrix.platform == 'ubuntu-latest' # no need to compute coverage for other platforms 63 | uses: actions/upload-artifact@v4 64 | with: 65 | include-hidden-files: true 66 | name: .coverage.mypy_${{ matrix.platform }}_${{ matrix.python-version }} 67 | path: .coverage.mypy/ 68 | 69 | 70 | pypi: 71 | runs-on: ubuntu-latest 72 | needs: [build] # add all other jobs here 73 | permissions: 74 | # necessary for Trusted Publishing 75 | id-token: write 76 | steps: 77 | # ugh https://github.com/actions/toolkit/blob/main/docs/commands.md#path-manipulation 78 | - run: echo "$HOME/.local/bin" >> $GITHUB_PATH 79 | 80 | - uses: actions/checkout@v4 81 | with: 82 | submodules: recursive 83 | 84 | - uses: actions/setup-python@v5 85 | with: 86 | python-version: '3.10' 87 | 88 | - uses: astral-sh/setup-uv@v5 89 | with: 90 | enable-cache: false # we don't have lock files, so can't use them as cache key 91 | 92 | - name: 'release to test pypi' 93 | # always deploy merged master to test pypi 94 | if: github.event_name != 'pull_request' && github.event.ref == 'refs/heads/master' 95 | run: .ci/release-uv --use-test-pypi 96 | 97 | - name: 'release to pypi' 98 | # always deploy tags to release pypi 99 | # NOTE: release tags are guarded by on: push: tags on the top 100 | if: github.event_name != 'pull_request' && startsWith(github.event.ref, 'refs/tags') 101 | run: .ci/release-uv 102 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.gitignore.io/api/python,emacs 3 | # Edit at https://www.gitignore.io/?templates=python,emacs 4 | 5 | ### Emacs ### 6 | # -*- mode: gitignore; -*- 7 | *~ 8 | \#*\# 9 | /.emacs.desktop 10 | /.emacs.desktop.lock 11 | *.elc 12 | auto-save-list 13 | tramp 14 | .\#* 15 | 16 | # Org-mode 17 | .org-id-locations 18 | 
*_archive 19 | 20 | # flymake-mode 21 | *_flymake.* 22 | 23 | # eshell files 24 | /eshell/history 25 | /eshell/lastdir 26 | 27 | # elpa packages 28 | /elpa/ 29 | 30 | # reftex files 31 | *.rel 32 | 33 | # AUCTeX auto folder 34 | /auto/ 35 | 36 | # cask packages 37 | .cask/ 38 | dist/ 39 | 40 | # Flycheck 41 | flycheck_*.el 42 | 43 | # server auth directory 44 | /server/ 45 | 46 | # projectiles files 47 | .projectile 48 | 49 | # directory configuration 50 | .dir-locals.el 51 | 52 | # network security 53 | /network-security.data 54 | 55 | 56 | ### Python ### 57 | # Byte-compiled / optimized / DLL files 58 | __pycache__/ 59 | *.py[cod] 60 | *$py.class 61 | 62 | # C extensions 63 | *.so 64 | 65 | # Distribution / packaging 66 | .Python 67 | build/ 68 | develop-eggs/ 69 | downloads/ 70 | eggs/ 71 | .eggs/ 72 | lib/ 73 | lib64/ 74 | parts/ 75 | sdist/ 76 | var/ 77 | wheels/ 78 | pip-wheel-metadata/ 79 | share/python-wheels/ 80 | *.egg-info/ 81 | .installed.cfg 82 | *.egg 83 | MANIFEST 84 | 85 | # PyInstaller 86 | # Usually these files are written by a python script from a template 87 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 88 | *.manifest 89 | *.spec 90 | 91 | # Installer logs 92 | pip-log.txt 93 | pip-delete-this-directory.txt 94 | 95 | # Unit test / coverage reports 96 | htmlcov/ 97 | .tox/ 98 | .nox/ 99 | .coverage 100 | .coverage.* 101 | .cache 102 | nosetests.xml 103 | coverage.xml 104 | *.cover 105 | .hypothesis/ 106 | .pytest_cache/ 107 | 108 | # Translations 109 | *.mo 110 | *.pot 111 | 112 | # Scrapy stuff: 113 | .scrapy 114 | 115 | # Sphinx documentation 116 | docs/_build/ 117 | 118 | # PyBuilder 119 | target/ 120 | 121 | # pyenv 122 | .python-version 123 | 124 | # pipenv 125 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 126 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 127 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 128 | # install all needed dependencies. 129 | #Pipfile.lock 130 | 131 | # celery beat schedule file 132 | celerybeat-schedule 133 | 134 | # SageMath parsed files 135 | *.sage.py 136 | 137 | # Spyder project settings 138 | .spyderproject 139 | .spyproject 140 | 141 | # Rope project settings 142 | .ropeproject 143 | 144 | # Mr Developer 145 | .mr.developer.cfg 146 | .project 147 | .pydevproject 148 | 149 | # mkdocs documentation 150 | /site 151 | 152 | # mypy 153 | .mypy_cache/ 154 | .dmypy.json 155 | dmypy.json 156 | 157 | # Pyre type checker 158 | .pyre/ 159 | 160 | # End of https://www.gitignore.io/api/python,emacs 161 | 162 | untracked/ 163 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Dmitrii Gerasimov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | # this is a hack to monkey patch pytest so it handles tests inside namespace packages without __init__.py properly 2 | # without it, pytest can't discover the package root for some reason 3 | # also see https://github.com/karlicoss/pytest_namespace_pkgs for more 4 | 5 | import os 6 | import pathlib 7 | from typing import Optional 8 | 9 | import _pytest.main 10 | import _pytest.pathlib 11 | 12 | # we consider all dirs in repo/ to be namespace packages 13 | root_dir = pathlib.Path(__file__).absolute().parent.resolve() / 'src' 14 | assert root_dir.exists(), root_dir 15 | 16 | # TODO assert it contains package name?? maybe get it via setuptools.. 17 | 18 | namespace_pkg_dirs = [str(d) for d in root_dir.iterdir() if d.is_dir()] 19 | 20 | # resolve_package_path is called from _pytest.pathlib.import_path 21 | # takes a full abs path to the test file and needs to return the path to the 'root' package on the filesystem 22 | resolve_pkg_path_orig = _pytest.pathlib.resolve_package_path 23 | 24 | 25 | def resolve_package_path(path: pathlib.Path) -> Optional[pathlib.Path]: 26 | result = path # search from the test file upwards 27 | for parent in result.parents: 28 | if str(parent) in namespace_pkg_dirs: 29 | return parent 30 | if os.name == 'nt': 31 | # ??? for some reason on windows it is trying to call this against conftest? but not on linux/osx 32 | if path.name == 'conftest.py': 33 | return resolve_pkg_path_orig(path) 34 | raise RuntimeError("Couldn't determine path for ", path) 35 | 36 | 37 | # NOTE: seems like it's not necessary anymore? 38 | # keeping it for now just in case 39 | # after https://github.com/pytest-dev/pytest/pull/13426 we should be able to remove the whole conftest 40 | # _pytest.pathlib.resolve_package_path = resolve_package_path 41 | 42 | 43 | # without patching, the orig function returns just a package name for some reason 44 | # (I think it's used as a sort of fallback) 45 | # so we need to point it at the absolute path properly 46 | # not sure what are the consequences.. maybe it wouldn't be able to run against installed packages? not sure.. 
47 | search_pypath_orig = _pytest.main.search_pypath
48 | 
49 | 
50 | def search_pypath(module_name: str) -> str:
51 |     mpath = root_dir / module_name.replace('.', os.sep)
52 |     if not mpath.is_dir():
53 |         mpath = mpath.with_suffix('.py')
54 |     assert mpath.exists(), mpath  # just in case
55 |     return str(mpath)
56 | 
57 | 
58 | _pytest.main.search_pypath = search_pypath
59 | 
--------------------------------------------------------------------------------
/doall:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | from pathlib import Path
3 | from itertools import islice
4 | from subprocess import check_call, run, PIPE
5 | 
6 | paths = list(sorted(Path('reddit').glob('*.json')))
7 | 
8 | def different(p1: Path, p2: Path, extract: bool) -> bool:
9 |     cmd = [
10 |         './jdiff', '--diff', *(['--extract'] if extract else []), str(p1), str(p2),
11 |     ]
12 |     print(' ' + ' '.join(cmd))
13 |     res = run(cmd, stdout=PIPE)
14 |     assert res.returncode <= 1
15 |     return res.returncode == 1
16 | 
17 | # TODO domination relationship can be tested via diff inclusion
18 | # TODO different normaliser for csv (e.g. lastfm)
19 | # TODO start erroring when there are enough of them, so it's not too annoying?
20 | # TODO or, maybe only error if the last one triggered. tha
21 | 
22 | from_ = 1644
23 | for i, before, after in islice(zip(range(10000000000000), paths, paths[1:]), from_, None):
24 |     print(f'comparing {i} {before.name} vs {after.name}: ')
25 |     extr_diff = different(before, after, extract=True)
26 |     cleanup_diff = different(before, after, extract=False)
27 |     # if there are differences, whatever, keep on going
28 |     if extr_diff == cleanup_diff:
29 |         print(' ok: both normalisers agree ' + ('different' if extr_diff else 'SAME'))
30 |         continue
31 |     print(' ERROR!!!!!')
32 | 
33 | 
34 | # if cleanup_diff:
35 | #     print(' OK: both normalised and cleaned up')
36 | # assert not cleanup_diff
37 | 
--------------------------------------------------------------------------------
/doc/options.md:
--------------------------------------------------------------------------------
1 | An explanation of the `--multiway`/`--prune-dominated` options, modified from [zulip chat](https://memex.zulipchat.com/#narrow/stream/279601-hpi/topic/bleanser/near/258276779)
2 | 
3 | Say you had a bunch of sqlite databases and mapped them onto text dumps using `normalise`. The idea is to figure out which dumps are redundant.
4 | 
5 | Say you've got dumps `A.sql` and `B.sql` -- and you diff them (like literally, [`diff`](https://man7.org/linux/man-pages/man1/diff.1.html))
6 | 
7 | You have the following cases
8 | 
9 | - they are exactly the same (`CmpResult.SAME`), so obviously it's safe to remove `A.sql`
10 | - `B.sql` is a superset of `A.sql` (this is `CmpResult.DOMINATES`). In general it's safe to remove `A.sql` in this case, but cause I'm paranoid it's controlled by `delete_dominated`
11 | - `B.sql` isn't a superset of `A.sql`, i.e. some items present in `A` are missing in `B` (this is `CmpResult.DIFFERENT`). In this case you wanna keep both `A` and `B`. In practice this happens when there is some retention in the database (like with browser history)
12 | - there is also a special value `CmpResult.ERROR`, which also means we want to keep both `A` and `B` (but it's nice to distinguish from `DIFFERENT`)
13 | 
14 | Now in the simplest case... you just go through all pairs of adjacent files, compute these `CmpResult`s, and end up with smth like this
15 | I'll use `<` for 'dominated', `=` for 'same', `!=` for 'different':
16 | 
17 | `A < B < C != D = E < G != H != I != J < K != L < M < N`
18 | 
19 | So in principle, you only need to keep files `C, G, H, I, K, N` and it will still give you a complete set of data when you merge it
20 | 
21 | Alternatively, you keep `A, C, D, G, H, I, J, K, L, N` if the `delete_dominated` flag is `False`
22 | 
23 | This is called 'two-way' comparison, cause you just consider pairs of adjacent files, so it would be `MULTIWAY = False`
24 | 
25 | Multiway comparison is easier to show with an example
26 | 
27 | Say we've got these sets of items
28 | 
29 | ```
30 | {A B C} # 0
31 | {B C D} # 1
32 | {C D E} # 2
33 | {X Y Z} # 3
34 | ```
35 | 
36 | If we do two-way comparisons, we'll keep them all because none of them fully contains the previous neighbour.
37 | 
38 | However you may notice that the union of `0` and `2` completely contains `1`. This is what 'multiway' mode does -- trying to find 'pivot' elements which contain the sets 'between' them.
39 | 
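To make the 'multiway' idea concrete, here is a minimal sketch of the containment check (an illustration only, not the actual bleanser implementation; the function name and the plain-set representation are made up for the example):

```python
def covered_by_pivots(pivot_a: set, pivot_b: set, between: list) -> bool:
    # the items sitting between two pivots are prunable if the union
    # of the pivots contains every one of them
    union = pivot_a | pivot_b
    return all(s <= union for s in between)


dumps = [
    {'A', 'B', 'C'},  # 0
    {'B', 'C', 'D'},  # 1
    {'C', 'D', 'E'},  # 2
    {'X', 'Y', 'Z'},  # 3
]

# pivots 0 and 2 together contain dump 1, so 1 is redundant and can be pruned
assert covered_by_pivots(dumps[0], dumps[2], dumps[1:2])
# pivots 1 and 3 don't contain dump 2 (it has 'E'), so 2 has to be kept
assert not covered_by_pivots(dumps[1], dumps[3], dumps[2:3])
```

The real implementation works on diffs of the normalised text dumps rather than Python sets, but the pruning criterion is the same idea.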
--------------------------------------------------------------------------------
/mypy.ini:
--------------------------------------------------------------------------------
1 | [mypy]
2 | pretty = True
3 | show_error_context = True
4 | show_column_numbers = True
5 | show_error_end = True
6 | 
7 | check_untyped_defs = True
8 | 
9 | # see https://mypy.readthedocs.io/en/stable/error_code_list2.html
10 | warn_redundant_casts = True
11 | strict_equality = True
12 | warn_unused_ignores = True
13 | enable_error_code = deprecated,redundant-expr,possibly-undefined,truthy-bool,truthy-iterable,ignore-without-code,unused-awaitable
14 | 
15 | # an example of suppressing
16 | # [mypy-my.config.repos.pdfannots.pdfannots]
17 | # ignore_errors = True
18 | 
--------------------------------------------------------------------------------
/old/README.md:
--------------------------------------------------------------------------------
1 | some old normalisers, might be useful for reference
2 | 
--------------------------------------------------------------------------------
/old/common.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 | from pathlib import Path
3 | from typing import NamedTuple
4 | 
5 | 
6 | class CmpResult(Enum):
7 |     DIFFERENT = 'different'
8 |     SAME = 'same'
9 |     DOMINATES = 'dominates'
10 |     ERROR = 'error'
11 | 
12 | 
13 | class Diff(NamedTuple):
14 |     cmp: CmpResult
15 |     diff: bytes
16 | 
17 | 
18 | class Relation(NamedTuple):
19 |     before: Path
20 |     diff: Diff
21 |     after: Path
22 | 
--------------------------------------------------------------------------------
/old/foursquare.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | from pathlib import Path
3 | 
4 | from jq_normaliser import JqNormaliser, Filter, pipe, jdel as d, jq_del_all
5 | 
6 | 
7 | def _normalise_coordinates():
8 |     return [
9 |         # TODO shit. take - into account??
10 |         '(.. | .lat?) |= (tostring | .[0:4])',
11 |         '(.. | .lng?) |= (tostring | .[0:4])',
12 |     ]
13 | 
14 | 
15 | 
16 | class FsqNormaliser(JqNormaliser):
17 |     def __init__(self, *args, **kwargs) -> None:
18 |         super().__init__(*args, **kwargs, logger_tag='fsq-normaliser', delete_dominated=True, keep_both=False)  # type: ignore
19 | 
20 |     # ok, this one can only delete items or do trivial rewrites
21 |     # if we map we might lose data here!
22 | def cleanup(self) -> Filter: 23 | return pipe( 24 | d('.[] | (.meta, .notifications)'), 25 | 26 | d('.[].response.checkins.items[] | (.isMayor, .venue, .likes, .sticker, .like, .ratedAt)'), 27 | 28 | jq_del_all( 29 | 'contact', 30 | ), 31 | jq_del_all( 32 | 'editableUntil', 33 | 'prefix', 34 | 'consumerId', 35 | ), 36 | jq_del_all( 37 | 'lastName', 38 | ), 39 | *_normalise_coordinates(), 40 | # TODO shit. again, we want to assert... 41 | ) 42 | # TODO shit. lat and lng jump randomly.. can we trim them? 43 | # return '.' 44 | # return 'sort_by(.date) | map(map_values(ascii_downcase))' 45 | 46 | def extract(self) -> Filter: 47 | return pipe( 48 | 'map_values(.response)', 49 | 'map_values(.checkins)', 50 | 'map_values(.items)', 51 | '.[]', 52 | 'map({id})', # venue: .venue.name })', just keep venue id?? 53 | *_normalise_coordinates(), 54 | # TODO not sure if we need to sort? 55 | ) 56 | 57 | 58 | 59 | def main(): 60 | norm = FsqNormaliser() 61 | norm.main() 62 | 63 | 64 | if __name__ == '__main__': 65 | main() 66 | -------------------------------------------------------------------------------- /old/json.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from __future__ import annotations 3 | 4 | from contextlib import contextmanager 5 | from itertools import tee 6 | import orjson as json 7 | from pathlib import Path 8 | from typing import Iterator, List 9 | 10 | from bleanser.core.common import logger 11 | from bleanser.core.utils import Json 12 | from bleanser.core.processor import BaseNormaliser 13 | 14 | 15 | from plumbum import local # type: ignore 16 | 17 | 18 | jq = local['jq'] 19 | 20 | 21 | # TODO hmm maybe I just want to use https://github.com/tomnomnom/gron ? 22 | # although would be tricky to chop off the indices... 23 | 24 | # we replace numbers with placeholders since otherwise it's too unstable 25 | # TODO ... not sure if it should be the default 26 | JQ_PATHS = ''' 27 | paths(scalars) as $p 28 | | [ ( [ $p[] | if type == "number" then "X" else tostring end ] | join(".") ) 29 | , ( getpath($p) | tojson ) 30 | ] 31 | | join(": ") 32 | ''' 33 | 34 | import hashlib 35 | 36 | from typing import Iterator, Tuple, Iterable 37 | JPath = str 38 | JVal = str 39 | JHash = str 40 | # TODO ugh. it's a bit too elaborate way to do structural diff, really... 41 | # TODO fuck. this is quite slow, but not sure what should I do about it... 42 | # how to make it work with process pool executor?? 43 | def _aspaths(js: Json) -> Tuple[JHash, Iterable[Tuple[JPath, JVal]]]: 44 | if isinstance(js, (str, int, float, bool, type(None))): 45 | # TODO json dumps? 46 | # TODO do root values really need hash? 47 | vhash = hashlib.md5(str(js).encode('utf8')).hexdigest()[:7] 48 | return (vhash, [('', str(js))]) 49 | 50 | sep = '.' # todo customize? 51 | 52 | # TODO ugh. not very iterative.. 53 | # I guess can't really be, because need information about all siblings before proceeding? 54 | if isinstance(js, list): 55 | ress = [] 56 | for i, c in enumerate(js): 57 | k = str(i) 58 | chash, cres = _aspaths(c) 59 | 60 | for p, v in cres: 61 | cp = chash 62 | ress.append((cp + ('' if len(p) == 0 else (sep + p)), v)) 63 | # TODO list shouldn't be hashed?? 64 | # TODO shit... could this be a problem for something like tags? 65 | return ('', ress) 66 | 67 | if isinstance(js, dict): 68 | # TODO or maybe two pass? then won't need to cache as much? 69 | # TODO could optimize and avoid combining the very top level hash? 
70 | ress = [] 71 | hd: dict[str, str] = {} 72 | for k, c in sorted(js.items()): 73 | cp = k 74 | 75 | chash, cres = _aspaths(c) 76 | hd[k] = chash 77 | 78 | for p, v in cres: 79 | ress.append((cp + ('' if len(p) == 0 else (sep + p)), v)) 80 | 81 | dhash = hashlib.md5(json.dumps(hd)).hexdigest()[:7] 82 | return (dhash, ress) 83 | 84 | raise RuntimeError(js, type(js)) 85 | 86 | 87 | def aspaths(js: Json) -> Iterator[str]: 88 | _, res = _aspaths(js=js) 89 | for k, v in res: 90 | yield k + ' : ' + v 91 | 92 | 93 | def test_aspaths() -> None: 94 | j = { 95 | 'root': [ 96 | dict(a=1,b=1), 97 | dict(a=1,b=0), 98 | dict(a=0,b=1), 99 | dict(a=0,b=0), 100 | dict(a=2,b=2), 101 | 102 | dict(a=1,b=0), 103 | dict(a=1,b=1), 104 | ], 105 | 'boop': {'beep': [123, 456]}, 106 | } 107 | paths = list(aspaths(j)) 108 | assert paths == [ 109 | 'boop.beep.202cb96 : 123', 110 | 'boop.beep.250cf8b : 456', 111 | 'root.824ad40.a : 1', 112 | 'root.824ad40.b : 1', 113 | 'root.8a5a377.a : 1', 114 | 'root.8a5a377.b : 0', 115 | 'root.23bbe1a.a : 0', 116 | 'root.23bbe1a.b : 1', 117 | 'root.213c309.a : 0', 118 | 'root.213c309.b : 0', 119 | 'root.8b165c4.a : 2', 120 | 'root.8b165c4.b : 2', 121 | 'root.8a5a377.a : 1', 122 | 'root.8a5a377.b : 0', 123 | 'root.824ad40.a : 1', 124 | 'root.824ad40.b : 1', 125 | ] 126 | 127 | 128 | 129 | def _aspaths_aux(js: Json) -> List[str]: 130 | return list(aspaths(js)) 131 | 132 | 133 | class JsonNormaliser(BaseNormaliser): 134 | # filter out additions; keep the rest 135 | DIFF_FILTER = '> ' 136 | 137 | MULTIWAY = True 138 | # TODO delete dominated 139 | 140 | def cleanup(self, j: Json) -> None: 141 | # TODO not sure if should modify in place? 142 | pass 143 | 144 | @contextmanager 145 | def do_cleanup(self, path: Path, *, wdir: Path) -> Iterator[Path]: 146 | # TODO call .unpacked 147 | 148 | # todo copy paste from SqliteNormaliser 149 | path = path.absolute().resolve() 150 | cleaned = wdir / Path(*path.parts[1:]) / (path.name + '-cleaned') 151 | cleaned.parent.mkdir(parents=True, exist_ok=True) 152 | 153 | with path.open('r') as fp: 154 | j = json.loads(fp.read()) 155 | self.cleanup(j) 156 | # todo sort keys? not sure... 157 | # TODO huh. jq version is almost order of magnitude slower??? 
158 | # js = json.dumps(j) # , indent=2, sort_keys=True) 159 | # cmd = jq['-r', JQ_PATHS] 160 | # jq_lines = (cmd << js )().splitlines() 161 | jq_lines = _aspaths_aux(j) 162 | # # move to top 163 | # from concurrent.futures import ProcessPoolExecutor as Pool 164 | # pool = Pool(8) 165 | # # 166 | # fut = pool.submit(_aspaths_aux, j) 167 | # jq_lines = fut.result() 168 | 169 | # TODO later 170 | cleanup_jq_dump = getattr(self, 'cleanup_jq_dump', None) 171 | if cleanup_jq_dump is not None: 172 | cleanup_jq_dump(jq_lines) 173 | with cleaned.open('w') as fp: 174 | for line in jq_lines: 175 | print(line, file=fp) 176 | yield cleaned 177 | 178 | 179 | 180 | def test_json_normaliser_1(tmp_path: Path) -> None: 181 | j = [ 182 | dict(a=1,b=1), 183 | dict(a=1,b=0), 184 | dict(a=0,b=1), 185 | dict(a=0,b=0), 186 | dict(a=2,b=2), 187 | 188 | dict(a=1,b=0), 189 | dict(a=1,b=1), 190 | ] 191 | i = tmp_path / 'input.json' 192 | i.write_text(json.dumps(j)) 193 | 194 | n = JsonNormaliser() 195 | with n.do_cleanup(i, wdir=tmp_path) as c: 196 | res = c.read_text() 197 | 198 | lines = res.splitlines() 199 | assert len(lines) == 14, lines 200 | 201 | lset = set(lines) 202 | # we want to keep these unique 'rows' 203 | assert len(lset) == 10, (lines, lset) 204 | 205 | 206 | def test_json_normaliser_2(tmp_path: Path) -> None: 207 | # TODO ok -- so we need to mark certain 'levels' as rolling instead? uggggh 208 | j = [ 209 | ['b', 1], 210 | ['b', 0], 211 | ['a', 1], 212 | ['a', 0], 213 | ['c', 2], 214 | 215 | ['b', 0], 216 | ['b', 1], 217 | ] 218 | i = tmp_path / 'input.json' 219 | i.write_text(json.dumps(j)) 220 | 221 | n = JsonNormaliser() 222 | with n.do_cleanup(i, wdir=tmp_path) as c: 223 | res = c.read_text() 224 | 225 | lines = res.splitlines() 226 | assert len(lines) == 14, lines 227 | 228 | lset = set(lines) 229 | # TODO right, this won't work now... because we don't want to hash the whole list... 230 | # assert len(lset) == 10, (lines, lset) 231 | 232 | 233 | # can work as generic json processor 234 | if __name__ == '__main__': 235 | from bleanser.core import main 236 | main(Normaliser=JsonNormaliser) 237 | 238 | # just for convenience 239 | from .utils import Json 240 | -------------------------------------------------------------------------------- /old/json2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from pathlib import Path 3 | import sys 4 | from subprocess import check_call 5 | 6 | import json 7 | 8 | # todo hmm, seems that there isn't that much perf difference, at least on hyperfine 9 | # although on the profile, when running with orjson, seems to finish faster?? 10 | # maybe double check later 11 | # import orjson as json 12 | 13 | 14 | # TODO warn about some data being cleaned, refer to the sources 15 | def pp_github(j): 16 | # todo later compare to jq somehow? but doubt it'd be faster 17 | from itertools import chain 18 | 19 | # TODO hmm 20 | # what should we do with repos :::: clones thing? 21 | # maybe we could check domination relationship in a more clever way somehow?... 22 | # e.g. here clones -> count ??? 
23 | # ': 0, 'uniques': 0, 'views': []}, 'clones': {'count': 29, 'uniques': 14, 'clones': [{'timestamp': '2021-11-29T00:00:00Z'| 'pull': True}, 'traffic': {'views': {'count': 0, 'uniques': 0, 'views': []}, 'clones': {'count': 27, 'uniques': 13, 'cl 24 | # , 'count': 3, 'uniques': 2}, {'timestamp': '2021-11-30T00:00:00Z', 'count': 2, 'uniques': 1}, {'timestamp': '2021-12-01T| ones': [{'timestamp': '2021-11-29T00:00:00Z', 'count': 1, 'uniques': 1}, {'timestamp': '2021-11-30T00:00:00Z', 'count': 25 | 26 | # TODO not sure what to do with it... 27 | # for x in j['repos']: 28 | # del x['traffic'] 29 | 30 | for x in chain(j['watched'], j['starred']): 31 | for key in [ 32 | 'watchers', 'stargazers_count', 'watchers_count', 33 | 34 | # updated_at -- seems that it's updated every time there is a star etc... 35 | 'updated_at', 36 | 'forks', 'forks_count', 37 | 38 | 'open_issues', 'open_issues_count', 39 | 40 | # eh, not sure about these... 41 | 'pushed_at', 42 | 'size', 43 | ]: 44 | del x[key] 45 | 46 | 47 | def pp_spotify(j): 48 | from bleanser.modules.spotifyexport import Normaliser 49 | n = Normaliser(path='meh') 50 | # todo method to delete multiple keys 51 | n.cleanup(j=j) 52 | 53 | 54 | # TODO need to unflatten playlists somehow 55 | # hmm basically any list-like thing is 'suspicious', because it kinda means denormalised struct 56 | pl2 = [] 57 | for x in j['playlists']: 58 | for t in x['tracks']: 59 | q = {k: v for k, v in x.items()} 60 | q['tracks'] = t 61 | pl2.append(q) 62 | j['playlists'] = pl2 63 | # hmm this is annoying... shared playlists are updating literally every day? 64 | 65 | 66 | def preprocess(*, j, name): 67 | # todo not sure how defensive should be? 68 | 69 | # todo not sure if there is a better way 70 | if '/github-events/' in name: 71 | pp_github(j) 72 | elif '/spotify/' in name: 73 | pp_spotify(j) 74 | 75 | 76 | def process(fo, *, name) -> None: 77 | data = fo.read() 78 | # todo orjson supports memoryview?? 79 | j = json.loads(data) 80 | # todo would be nice to close it here 81 | 82 | preprocess(j=j, name=name) 83 | 84 | if isinstance(j, list): 85 | res = {'': j} # meh 86 | else: 87 | assert isinstance(j, dict), j 88 | res = j 89 | 90 | for k, v in res.items(): 91 | if not isinstance(v, list): 92 | # something like 'profile' data in hypothesis could be a dict 93 | # something like 'notes' in rescuetime could be a scalar (str) 94 | v = [v] # meh 95 | assert isinstance(v, list), (k, v) 96 | for i in v: 97 | # todo dump json here for i; sort keys? 98 | print(f'{k} ::: {i}') 99 | print('ok') 100 | 101 | 102 | def compare(p1: str, p2: str): 103 | assert p1 != '-' and p2 != '-' 104 | # hacky way to compare 105 | def cc(p: str): 106 | if p.endswith('.xz'): 107 | cat = 'xzcat' 108 | else: 109 | cat = 'cat' 110 | # {cat} {p} | {__file__} - 111 | return f'{__file__} {p} | sort' 112 | c1 = cc(p1) 113 | c2 = cc(p2) 114 | # wrap = ' -c "windo set wrap" '# -- eh, not super convenient? 115 | wrap = '' 116 | # TODO pipefail? doesn't work well.. 
117 | cmd = f'vimdiff {wrap} <({c1}) <({c2})' 118 | check_call(cmd, shell=True, executable='/bin/bash') 119 | 120 | 121 | def main() -> None: 122 | import argparse 123 | p = argparse.ArgumentParser() 124 | p.add_argument('path1') 125 | p.add_argument('path2', nargs='?') 126 | p.add_argument('--first' , required=False, type=int) 127 | p.add_argument('--second', required=False, type=int) 128 | args = p.parse_args() 129 | 130 | p1 = args.path1 131 | p2 = args.path2 132 | 133 | # TODO compare performance fo handling compressed and uncompressed files 134 | from bleanser.core.kompress import CPath 135 | 136 | assert p1 is not None 137 | 138 | if p2 is not None: 139 | compare(p1=p1, p2=p2) 140 | return 141 | 142 | # handle single file 143 | if p1 == '-': 144 | process(fo=sys.stdin) 145 | return 146 | 147 | pp = Path(p1).absolute() 148 | 149 | if pp.is_dir(): 150 | files = list(sorted(pp.iterdir())) 151 | 152 | first = args.first; assert first is not None 153 | 154 | second = args.second 155 | if second is None: 156 | second = first + 1 157 | assert second < len(files), len(files) 158 | 159 | p1 = str(files[first ]) 160 | p2 = str(files[second]) 161 | compare(p1=p1, p2=p2) 162 | else: 163 | path = str(pp) 164 | with CPath(path).open() as fo: 165 | process(fo=fo, name=path) 166 | 167 | 168 | if __name__ == '__main__': 169 | main() 170 | -------------------------------------------------------------------------------- /old/reddit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from pathlib import Path 3 | 4 | from jq_normaliser import JqNormaliser, Filter, pipe, jdel as d, Filter2 5 | from jq_normaliser import CmpResult # eh, just to bring into scope for backup script 6 | 7 | from kython.kjq import del_all_kjson 8 | 9 | 10 | class RedditNormaliser(JqNormaliser): 11 | def __init__(self, *args, **kwargs) -> None: 12 | super().__init__(*args, **kwargs, logger_tag='reddit-normaliser', delete_dominated=False, keep_both=True) # type: ignore 13 | # TODO wonder if there are many dominated ? 14 | 15 | def cleanup(self) -> Filter: 16 | ignore_keys = ( 17 | 'allow_discovery', 18 | 'event_start', 'event_end', 'event_is_live', 19 | 'allowed_galleries', 20 | 'top_awarded_type', 21 | 'treatment_tags', 22 | # TODO 'edited'? 23 | 'collapsed', 'collapsed_reason', # todo potentially interesting? 
24 | 'is_crosspostable_subreddit', 25 | 'og_description', 'og_title', 26 | 'pref_no_profanity', 'pref_geopopular', 'pref_top_karma_subreddits', 27 | 'steward_report', 28 | 'is_video', 29 | 'rte_mode', 30 | 'accept_chats', 31 | 'accept_pms', 32 | 'treatment_tags', 33 | 'password_set', 34 | 'allow_polls', 35 | 'allow_chat_post_creation', 36 | 'is_chat_post_feature_enabled', 37 | 'linked_identities', 38 | 'upvote_ratio', 39 | 'icon_img', 40 | 'icon_size', 41 | 'icon_url', 42 | 'icon_name', 43 | 44 | 'thumbnail_height', 45 | 46 | 'crosspost_parent_list', 47 | 'primary_color', 48 | 'archived', 49 | 'suggested_sort', 50 | 'over_18', 51 | 'over18', 52 | 'allow_videos', 53 | 'allow_images', 54 | 'allow_videogifs', 55 | 56 | 'comment_score_hide_mins', 57 | 'wiki_enabled', 58 | 'suggested_sort', 59 | 'suggested_comment_sort', 60 | 'header_img', 61 | 'header_size', 62 | 'has_menu_widget', 63 | 'banner_background_color', 64 | 'banner_background_image', 65 | 'banner_img', 66 | 'banner_size', 67 | 'mobile_banner_image', 68 | 69 | 'community_icon', 70 | 'no_follow', 71 | 'submission_type', 72 | 'is_crosspostable', 73 | 74 | 'link_flair_enabled', 75 | 'link_flair_position', 76 | 'link_flair_css_class', 77 | 'link_flair_template_id', 78 | 'link_flair_text', 79 | 'link_flair_type', 80 | 'link_flair_richtext', 81 | 82 | 'post_hint', 83 | 'is_robot_indexable', 84 | 'content_categories', 85 | 86 | 'parent_whitelist_status', 87 | 'pwls', 88 | 'whitelist_status', 89 | 'wls', 90 | 'show_media', 91 | 'spoilers_enabled', 92 | 'collapse_deleted_comments', 93 | 'key_color', 94 | 'can_assign_user_flair', 95 | 'emojis_enabled', 96 | 'author_patreon_flair', 97 | "author_flair_richtext", 98 | 'author_flair_text', 99 | 'author_flair_background_color', 100 | 'author_flair_text_color', 101 | 'author_flair_type', 102 | 'author_flair_css_class', 103 | 'author_flair_template_id', 104 | 105 | "original_content_tag_enabled", 106 | 'emojis_custom_size', 107 | 108 | 'gilded', 109 | 'gildings', 110 | 'gid_1', 111 | 'gid_2', 112 | 'gid_3', 113 | 'media_metadata', 114 | 'can_assign_link_flair', 115 | 'advertiser_category', 116 | 'can_gild', 117 | 'user_reports', 118 | 'author', 119 | 'author_fullname', 120 | 'report_reasons', 121 | 'discussion_type', 122 | 'allow_live_comments', 123 | 'score_hidden', 124 | 125 | 'submit_link_label', 126 | 'submit_text_label', 127 | 'header_title', 128 | # TODO reuse it in reddit backup script? 129 | 130 | 'secure_media', 131 | 'domain', 132 | 133 | 'audience_target', 134 | 'free_form_reports', 135 | 136 | 'restrict_commenting', 137 | 'restrict_posting', 138 | 'show_media_preview', 139 | 140 | 'is_favorited', 141 | 'is_subscriber', 142 | 143 | 'oembed', 144 | 'media_embed', 145 | 'secure_media_embed', 146 | 'stickied', 147 | 'owner_id', 148 | 149 | 'all_awardings', 150 | 151 | 'total_awards_received', 152 | 153 | 'likes', 154 | 'send_replies', 155 | 'is_self', 156 | 157 | 'url', # ugh. changed from www.reddit.... to link without reddit domain 158 | '_comments', 159 | 160 | "user_flair_richtext", 161 | "user_flair_template_id", 162 | "user_flair_type", 163 | "user_flair_text_color", 164 | "associated_award", 165 | 166 | 'author_premium', 167 | 'new', 168 | 'awarders', 169 | 'hide_score', 170 | ) 171 | # TODO ugh, some issue with coins null vs 0?? 172 | 173 | # NOTE this step took _really_ long.... e.g. 20 secs vs 0.5 sec for the rest of steps 174 | # dq.append(jq_del_all(*ignore_keys)) 175 | 176 | dq = [] 177 | dq.append('. + if has("inbox") then {} else {"inbox": []} end') # ugh. 
filling default value 178 | dq.append(d('.saved[].link_url')) # weird, changes for no reason sometimes... 179 | sections = [ 180 | 'saved', 181 | 'comments', 182 | 'upvoted', 183 | 'downvoted', 184 | 'submissions', 185 | 'inbox', 186 | ] 187 | dq.extend([ 188 | d(f'''.{section}[] | ( 189 | .saved, .preview, .body_html, .score, .ups, .description_html, .subreddit_type, .subreddit_subscribers, .selftext_html, .num_comments, .num_crossposts, .thumbnail, .created, .media, 190 | .locked 191 | 192 | )''') for section in sections 193 | ]) 194 | dq.append( 195 | d('.multireddits[] | (.description_html, .created, .owner, .num_subscribers)') 196 | ) 197 | dq.append( 198 | d('''(.profile.subreddit, .subreddits[]) | ( 199 | .disable_contributor_requests 200 | )''') 201 | ) 202 | dq.append( 203 | d('''.profile | ( 204 | .created, 205 | .has_mail, 206 | .inbox_count, 207 | .can_create_subreddit, 208 | .five_follower_send_message, 209 | .features, 210 | .has_gold_subscription, 211 | .has_stripe_subscription, 212 | .has_paypal_subscription, 213 | .has_subscribed_to_premium, 214 | .has_android_subscription, 215 | .has_ios_subscription, 216 | .next_coin_drip_date, 217 | .seen_premium_ftux, 218 | .seen_premium_adblock_modal, 219 | .in_redesign_beta, 220 | .gold_expiration, 221 | .is_gold 222 | )'''), 223 | ) 224 | # del_preview = lambda s: ddel(f'.{s} | .[]') 225 | # dq.extend(del_preview(s) for s in sections) 226 | # TODO shit, that's gonna remove lots of subreddits 227 | # I should also check that result contains reasonable favorites?? 228 | # TODO not sure if it's necessary to sort.. | sort_by(.id) 229 | # dq.append('.subreddits | map(del(.subscribers, .videostream_links_count, .description_html))') # ddel('(.subreddits) | .)') # | del(.videostream_links_count) | del(.description_html) 230 | dq.extend([ 231 | d('.subreddits[] | (.created, .subscribers, .description, .description_html, .videostream_links_count, .submit_text, .submit_text_html)'), 232 | ]) 233 | return Filter2( 234 | jq=pipe(*dq), 235 | extra_del_all=ignore_keys, 236 | ) 237 | 238 | def extract(self) -> Filter: 239 | return pipe( 240 | # TODO FIXME this should be assertive on field existence 241 | 242 | # TODO ehh. dunno about link/comment karma.. it's fuzzy anyway? 243 | # maybe try removing it once and see the difference 244 | 245 | # hmm, created changes all the time for some reason starting from 20181124201020 246 | # https://www.reddit.com/r/redditdev/comments/29991t/whats_the_difference_between_created_and_created/ciiuk24/ 247 | # ok, it's broken 248 | '''.profile |= { 249 | id, 250 | created_utc, 251 | name, 252 | coins, 253 | comment_karma, 254 | link_karma, 255 | subreddit: .subreddit | {subscribers} 256 | }''', 257 | '.comments |= map({id, created_utc, body})', 258 | '.multireddits |= map({id, created_utc, name, subreddits: .subreddits | map_values(.display_name) })', 259 | '.saved |= map({id, created_utc, title, body, selftext})', 260 | '.submissions |= map({id, created_utc, title, selftext})', 261 | '.subreddits |= map({id, created_utc, title, display_name, public_description, subreddit_type})', 262 | '.upvoted |= map({id, created_utc, title, selftext})', 263 | '.downvoted |= map({id, created_utc, title, selftext})', 264 | 265 | '. + if has("inbox") then {} else {"inbox": []} end', # ugh. filling default value 266 | 267 | '.inbox |= map({id, created_utc, title, selftext, body})', 268 | ) 269 | 270 | # 2 styles of normalising: 271 | # first is extracting stuff we expect to see. 
this is nicer and gives the idea if something actually changed 272 | # second is cleaning up stuff that we don't need 273 | 274 | 275 | 276 | 277 | def main(): 278 | norm = RedditNormaliser() 279 | norm.main(glob='*.json.xz') 280 | 281 | 282 | if __name__ == '__main__': 283 | main() 284 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # see https://github.com/karlicoss/pymplate for up-to-date reference 2 | [project] 3 | dynamic = ["version"] # version is managed by build backend 4 | name = "bleanser" 5 | dependencies = [ 6 | "more-itertools" , 7 | "typing-extensions", 8 | "click" , # nicer cli 9 | "plumbum" , # nicer command composition/piping 10 | "kompress" , # for compressed files processing (TODO potentially could be optional if they don't use compressed files? but how to detect if they are compressed.. maybe via libmagic?) 11 | ] 12 | requires-python = ">=3.9" 13 | 14 | ## these need to be set if you're planning to upload to pypi 15 | # description = "TODO" 16 | license = {file = "LICENSE"} 17 | authors = [ 18 | {name = "Dima Gerasimov (@karlicoss)", email = "karlicoss@gmail.com"}, 19 | ] 20 | maintainers = [ 21 | {name = "Dima Gerasimov (@karlicoss)", email = "karlicoss@gmail.com"}, 22 | ] 23 | # keywords = [] 24 | # # see: http://pypi.python.org/pypi?%3Aaction=list_classifiers 25 | # classifiers = [ 26 | # ] 27 | 28 | 29 | [project.urls] 30 | Homepage = "https://github.com/karlicoss/bleanser" 31 | ## 32 | 33 | 34 | [project.optional-dependencies] 35 | extra = [ 36 | "python-magic", # more reliable mimetype detection -- requires extra binaries, so perhaps best to keep optional 37 | "logzero" , # nicer logging, but can work without it 38 | ] 39 | json = [ 40 | "orjson", # faster json processing (required if you use json-derived modules) 41 | ] 42 | xml = [ 43 | "lxml", # for handling xml files (required if you use xml-derived modules) 44 | ] 45 | zstd = [ 46 | "kompress[zstd]", 47 | ] 48 | HPI = [ # for bleanser.modules.hpi 49 | "HPI", 50 | ] 51 | 52 | [dependency-groups] 53 | testing = [ 54 | "pytest", 55 | "ruff", 56 | "mypy", 57 | "lxml", # for mypy html coverage 58 | 59 | "types-lxml", 60 | ] 61 | 62 | 63 | # workaround for error during uv publishing 64 | # see https://github.com/astral-sh/uv/issues/9513#issuecomment-2519527822 65 | [tool.setuptools] 66 | license-files = [] 67 | 68 | 69 | [build-system] 70 | requires = ["hatchling", "hatch-vcs"] 71 | build-backend = "hatchling.build" 72 | 73 | # unfortunately have to duplicate project name here atm, see https://github.com/pypa/hatch/issues/1894 74 | [tool.hatch.build.targets.wheel] 75 | packages = ["src/bleanser"] 76 | 77 | [tool.hatch.version] 78 | source = "vcs" 79 | 80 | [tool.hatch.version.raw-options] 81 | version_scheme = "python-simplified-semver" 82 | local_scheme = "dirty-tag" 83 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | # discover files that don't follow test_ naming. Useful to keep tests along with the source code 3 | python_files = *.py 4 | 5 | # this setting only impacts package/module naming under pytest, not the discovery 6 | consider_namespace_packages = true 7 | 8 | addopts = 9 | # prevent pytest cache from being created... 
it craps into project dir and I never use it anyway 10 | -p no:cacheprovider 11 | 12 | # -rap to print tests summary even when they are successful 13 | -rap 14 | --verbose 15 | 16 | # otherwise it won't discover doctests 17 | --doctest-modules 18 | 19 | # show all test durations (unless they are too short) 20 | --durations=0 21 | -------------------------------------------------------------------------------- /ruff.toml: -------------------------------------------------------------------------------- 1 | lint.extend-select = [ 2 | "F", # flakes rules -- default, but extend just in case 3 | "E", # pycodestyle -- default, but extend just in case 4 | "W", # various warnings 5 | 6 | "B", # 'bugbear' set -- various possible bugs 7 | "C4", # flake8-comprehensions -- unnecessary list/map/dict calls 8 | "COM", # trailing commas 9 | "EXE", # various checks wrt executable files 10 | "I", # sort imports 11 | "ICN", # various import conventions 12 | "FBT", # detect use of boolean arguments 13 | "FURB", # various rules 14 | "PERF", # various potential performance speedups 15 | "PD", # pandas rules 16 | "PIE", # 'misc' lints 17 | "PLC", # pylint convention rules 18 | "PLR", # pylint refactor rules 19 | "PLW", # pylint warnings 20 | "PT", # pytest stuff 21 | "PYI", # various type hinting rules 22 | "RET", # early returns 23 | "RUF", # various ruff-specific rules 24 | "TID", # various imports suggestions 25 | "TRY", # various exception handling rules 26 | "UP", # detect deprecated python stdlib stuff 27 | "FA", # suggest using from __future__ import annotations 28 | "PTH", # pathlib migration 29 | "ARG", # unused argument checks 30 | "A", # builtin shadowing 31 | "G", # logging stuff 32 | # "EM", # TODO hmm could be helpful to prevent duplicate err msg in traceback.. but kinda annoying 33 | 34 | # "ALL", # uncomment this to check for new rules! 35 | ] 36 | 37 | # Preserve types, even if a file imports `from __future__ import annotations` 38 | # we need this for cachew to work with HPI types on 3.9 39 | # can probably remove after 3.10? 40 | lint.pyupgrade.keep-runtime-typing = true 41 | 42 | lint.ignore = [ 43 | "D", # annoying nags about docstrings 44 | "N", # pep naming 45 | "TCH", # type checking rules, mostly just suggests moving imports under TYPE_CHECKING 46 | "S", # bandit (security checks) -- tends to be not very useful, lots of nitpicks 47 | "DTZ", # datetimes checks -- complaining about missing tz and mostly false positives 48 | "FIX", # complains about fixmes/todos -- annoying 49 | "TD", # complains about todo formatting -- too annoying 50 | "ANN", # missing type annotations? seems way to strict though 51 | 52 | ### too opinionated style checks 53 | "E501", # too long lines 54 | "E702", # Multiple statements on one line (semicolon) 55 | "E731", # assigning lambda instead of using def 56 | "E741", # Ambiguous variable name: `l` 57 | "E742", # Ambiguous class name: `O 58 | "E401", # Multiple imports on one line 59 | "F403", # import *` used; unable to detect undefined names 60 | ### 61 | 62 | ### 63 | "E722", # Do not use bare `except` ## Sometimes it's useful for defensive imports and that sort of thing.. 64 | "F811", # Redefinition of unused # this gets in the way of pytest fixtures (e.g. in cachew) 65 | 66 | ## might be nice .. but later and I don't wanna make it strict 67 | "E402", # Module level import not at top of file 68 | 69 | "RUF100", # unused noqa -- handle later 70 | "RUF012", # mutable class attrs should be annotated with ClassVar... 
ugh pretty annoying for user configs 71 | 72 | ### these are just nitpicky, we usually know better 73 | "PLR0911", # too many return statements 74 | "PLR0912", # too many branches 75 | "PLR0913", # too many function arguments 76 | "PLR0915", # too many statements 77 | "PLR1714", # consider merging multiple comparisons 78 | "PLR2044", # line with empty comment 79 | "PLR5501", # use elif instead of else if 80 | "PLR2004", # magic value in comparison -- super annoying in tests 81 | ### 82 | "PLR0402", # import X.Y as Y -- TODO maybe consider enabling it, but double check 83 | 84 | "B009", # calling gettattr with constant attribute -- this is useful to convince mypy 85 | "B010", # same as above, but setattr 86 | "B011", # complains about assert False 87 | "B017", # pytest.raises(Exception) 88 | "B023", # seems to result in false positives? 89 | "B028", # suggest using explicit stacklevel? TODO double check later, but not sure it's useful 90 | 91 | # complains about useless pass, but has sort of a false positive if the function has a docstring? 92 | # this is common for click entrypoints (e.g. in __main__), so disable 93 | "PIE790", 94 | 95 | # a bit too annoying, offers to convert for loops to list comprehension 96 | # , which may heart readability 97 | "PERF401", 98 | 99 | # suggests no using exception in for loops 100 | # we do use this technique a lot, plus in 3.11 happy path exception handling is "zero-cost" 101 | "PERF203", 102 | 103 | "RET504", # unnecessary assignment before returning -- that can be useful for readability 104 | "RET505", # unnecessary else after return -- can hurt readability 105 | 106 | "PLW0603", # global variable update.. we usually know why we are doing this 107 | "PLW2901", # for loop variable overwritten, usually this is intentional 108 | 109 | "PT011", # pytest raises should is too broad 110 | "PT012", # pytest raises should contain a single statement 111 | 112 | "COM812", # trailing comma missing -- mostly just being annoying with long multiline strings 113 | 114 | "PD901", # generic variable name df 115 | 116 | "TRY003", # suggests defining exception messages in exception class -- kinda annoying 117 | "TRY004", # prefer TypeError -- don't see the point 118 | "TRY201", # raise without specifying exception name -- sometimes hurts readability 119 | "TRY400", # TODO double check this, might be useful 120 | "TRY401", # redundant exception in logging.exception call? TODO double check, might result in excessive logging 121 | 122 | "PGH", # TODO force error code in mypy instead? although it also has blanket noqa rule 123 | 124 | "TID252", # Prefer absolute imports over relative imports from parent modules 125 | 126 | "UP038", # suggests using | (union) in isisntance checks.. 
but it results in slower code 127 | 128 | ## too annoying 129 | "T20", # just complains about prints and pprints 130 | "Q", # flake quotes, too annoying 131 | "C90", # some complexity checking 132 | "G004", # logging statement uses f string 133 | "ERA001", # commented out code 134 | "SLF001", # private member accessed 135 | "BLE001", # do not catch 'blind' Exception 136 | "INP001", # complains about implicit namespace packages 137 | "SIM", # some if statements crap 138 | "RSE102", # complains about missing parens in exceptions 139 | ## 140 | ] 141 | 142 | 143 | lint.flake8-builtins.builtins-allowed-modules = ["json", "logging", "xml"] 144 | 145 | 146 | lint.exclude = [ 147 | "old/**", 148 | ] 149 | -------------------------------------------------------------------------------- /scripts/apt.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | 4 | progs=( 5 | sqlite3 # [optional] sqlite processing 6 | vim # [optional] for vimdiff 7 | fdupes # [optional] duplicate detection tool, for tests 8 | ) 9 | 10 | apt-get update && apt-get --yes install ${progs[@]} 11 | -------------------------------------------------------------------------------- /src/bleanser/core/__init__.py: -------------------------------------------------------------------------------- 1 | from .common import logger 2 | 3 | __all__ = [ 4 | 'logger', 5 | ] 6 | -------------------------------------------------------------------------------- /src/bleanser/core/__main__.py: -------------------------------------------------------------------------------- 1 | # TODO hmm so we kind of need a specific Normaliser for bleanser, so calling 2 | # python3 -m bleanser.core (or just -m bleanser) doesn't make much sense 3 | # it could probs take in module name, and then call it? like python3 -m bleanser modules.xxx 4 | # but it's the same as calling python -m bleanser.modules.xxx 5 | # TODO maybe this thing could do module discovery or something? 6 | def main() -> None: 7 | pass 8 | 9 | 10 | if __name__ == '__main__': 11 | # FIXME warn if we're running this command? kinda confusing otherwise 12 | main() 13 | -------------------------------------------------------------------------------- /src/bleanser/core/common.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from collections.abc import Sequence 4 | from dataclasses import dataclass 5 | from pathlib import Path 6 | from typing import TYPE_CHECKING, Union 7 | 8 | from .ext.logging import LazyLogger 9 | 10 | logger = LazyLogger(__name__, level='debug') 11 | 12 | 13 | @dataclass 14 | class Group: 15 | items: Sequence[Path] 16 | """ 17 | All items in group are tied via 'domination' relationship 18 | Which might be either exact equality, or some sort of 'inclusion' relationship 19 | """ 20 | 21 | pivots: Sequence[Path] 22 | """ 23 | Pivots are the elements that 'define' group. 24 | In general the pivots contain all other elements in the group 25 | Sometimes pivots might be redundant, e.g. 
if we want to keep both boundaries of the group 26 | """ 27 | 28 | # TODO attach diff or something 29 | # cmp: CmpResult 30 | error: bool 31 | 32 | def __post_init__(self) -> None: 33 | sp = set(self.pivots) 34 | si = set(self.items) 35 | if len(self.items) != len(si): 36 | raise RuntimeError(f'duplicate items: {self}') 37 | if len(self.pivots) != len(sp): 38 | raise RuntimeError(f'duplicate pivots: {self}') 39 | # in theory could have more pivots, but shouldn't happen for now 40 | assert 1 <= len(sp) <= 2, sp 41 | if not (sp <= si): 42 | raise RuntimeError(f"pivots aren't fully contained in items: {self}") 43 | 44 | 45 | @dataclass 46 | class Instruction: 47 | path: Path 48 | group: Group 49 | """ 50 | 'Reason' why the path got a certain instruction 51 | """ 52 | 53 | 54 | @dataclass 55 | class Prune(Instruction): 56 | pass 57 | 58 | 59 | @dataclass 60 | class Keep(Instruction): 61 | pass 62 | 63 | 64 | ### helper to define paramertized tests in function's body 65 | from .utils import under_pytest 66 | 67 | if TYPE_CHECKING or under_pytest: 68 | import pytest 69 | 70 | parametrize = pytest.mark.parametrize 71 | else: 72 | parametrize = lambda *_args, **_kwargs: (lambda f: f) 73 | ### 74 | 75 | 76 | @dataclass 77 | class BaseMode: 78 | pass 79 | 80 | 81 | @dataclass 82 | class Dry(BaseMode): 83 | pass 84 | 85 | 86 | @dataclass 87 | class Move(BaseMode): 88 | path: Path 89 | 90 | def __post_init__(self) -> None: 91 | assert self.path.is_dir(), self.path 92 | 93 | 94 | @dataclass 95 | class Remove(BaseMode): 96 | pass 97 | 98 | 99 | Mode = Union[Dry, Move, Remove] 100 | 101 | 102 | def divide_by_size(*, buckets: int, paths: Sequence[Path]) -> Sequence[Sequence[Path]]: 103 | """ 104 | Divide paths into approximately equally sized groups, while preserving order 105 | """ 106 | res = [] 107 | with_size = [(p, p.stat().st_size) for p in paths] 108 | bucket_size = sum(sz for _, sz in with_size) / buckets 109 | 110 | group: list[Path] = [] 111 | group_size = 0 112 | 113 | def dump() -> None: 114 | nonlocal group_size, group 115 | 116 | if len(group) == 0: 117 | return 118 | 119 | res.append(group) 120 | # print(f"dumping group, size {group_size} {len(group)} {group[0]} {group[-1]}") 121 | 122 | group = [] 123 | group_size = 0 124 | 125 | for p, sz in with_size: 126 | if group_size >= bucket_size: 127 | dump() 128 | group.append(p) 129 | group_size += sz 130 | # last group always needs to be dumped 131 | dump() 132 | 133 | assert len(res) <= buckets 134 | while len(res) < buckets: # can be less if buckets > len(paths) 135 | res.append([]) 136 | 137 | flattened = [] 138 | for r in res: 139 | flattened.extend(r) 140 | assert paths == flattened, res # just a safety check 141 | 142 | return res 143 | -------------------------------------------------------------------------------- /src/bleanser/core/compat.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | if sys.version_info[:2] >= (3, 11): 4 | from typing import Never, Self, assert_never, assert_type # noqa: F401 5 | else: 6 | from typing_extensions import Never, Self, assert_never, assert_type # noqa: F401 7 | -------------------------------------------------------------------------------- /src/bleanser/core/ext/dummy_executor.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from concurrent.futures import Executor, Future 4 | 5 | # https://stackoverflow.com/a/10436851/706389 6 | from typing import Any 7 | 8 | 9 | 
class DummyExecutor(Executor): 10 | def __init__(self, max_workers: int | None = 1) -> None: 11 | self._shutdown = False 12 | self._max_workers = max_workers 13 | 14 | def submit(self, fn, *args, **kwargs): # type: ignore[override,unused-ignore] # todo type properly after 3.9 15 | if self._shutdown: 16 | raise RuntimeError('cannot schedule new futures after shutdown') 17 | 18 | f: Future[Any] = Future() 19 | try: 20 | result = fn(*args, **kwargs) 21 | except KeyboardInterrupt: 22 | raise 23 | except BaseException as e: 24 | f.set_exception(e) 25 | else: 26 | f.set_result(result) 27 | 28 | return f 29 | 30 | def shutdown(self, wait: bool = True, **kwargs) -> None: # noqa: FBT001,FBT002,ARG002 31 | self._shutdown = True 32 | -------------------------------------------------------------------------------- /src/bleanser/core/ext/logging.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Default logger is a bit meh, see 'test'/run this file for a demo 3 | TODO name 'klogging' to avoid possible conflict with default 'logging' module 4 | TODO shit. too late already? maybe use fallback & deprecate 5 | ''' 6 | 7 | 8 | def test() -> None: 9 | import logging 10 | import sys 11 | from typing import Callable 12 | 13 | M: Callable[[str], None] = lambda s: print(s, file=sys.stderr) 14 | 15 | M(" Logging module's defaults are not great...'") 16 | l = logging.getLogger('test_logger') 17 | # todo why is mypy unhappy about these??? 18 | l.error("For example, this should be logged as error. But it's not even formatted properly, doesn't have logger name or level") 19 | 20 | M(" The reason is that you need to remember to call basicConfig() first") 21 | l.error("OK, this is better. But the default format kinda sucks, I prefer having timestamps and the file/line number") 22 | 23 | M("") 24 | M(" With LazyLogger you get a reasonable logging format, colours and other neat things") 25 | 26 | ll = LazyLogger('test') # No need for basicConfig! 27 | ll.info("default level is INFO") 28 | ll.debug(".. so this shouldn't be displayed") 29 | ll.warning("warnings are easy to spot!") 30 | ll.exception(RuntimeError("exceptions as well")) 31 | 32 | 33 | import logging 34 | import os 35 | from typing import Optional, Union 36 | 37 | Level = int 38 | LevelIsh = Optional[Union[Level, str]] 39 | 40 | 41 | def mklevel(level: LevelIsh) -> Level: 42 | # todo put in some global file, like envvars.py 43 | glevel = os.environ.get('HPI_LOGS', None) 44 | if glevel is not None: 45 | level = glevel 46 | if level is None: 47 | return logging.NOTSET 48 | if isinstance(level, int): 49 | return level 50 | return getattr(logging, level.upper()) 51 | 52 | 53 | FORMAT = '{start}[%(levelname)-7s %(asctime)s %(name)s %(filename)s:%(lineno)d]{end} %(message)s' 54 | FORMAT_COLOR = FORMAT.format(start='%(color)s', end='%(end_color)s') 55 | FORMAT_NOCOLOR = FORMAT.format(start='', end='') 56 | DATEFMT = '%Y-%m-%d %H:%M:%S' 57 | 58 | 59 | def setup_logger(logger: logging.Logger, level: LevelIsh) -> None: 60 | lvl = mklevel(level) 61 | try: 62 | import logzero # type: ignore[import-untyped] 63 | except ModuleNotFoundError: 64 | import warnings 65 | 66 | warnings.warn("You might want to install 'logzero' for nice colored logs!") 67 | logger.setLevel(lvl) 68 | h = logging.StreamHandler() 69 | h.setLevel(lvl) 70 | h.setFormatter(logging.Formatter(fmt=FORMAT_NOCOLOR, datefmt=DATEFMT)) 71 | logger.addHandler(h) 72 | logger.propagate = False # ugh. otherwise it duplicates log messages? not sure about it.. 
73 | else: 74 | formatter = logzero.LogFormatter( 75 | fmt=FORMAT_COLOR, 76 | datefmt=DATEFMT, 77 | ) 78 | logzero.setup_logger(logger.name, level=lvl, formatter=formatter) 79 | 80 | 81 | class LazyLogger(logging.Logger): 82 | def __new__(cls, name: str, level: LevelIsh = 'INFO') -> 'LazyLogger': 83 | logger = logging.getLogger(name) 84 | 85 | # this is called prior to all _log calls so makes sense to do it here? 86 | def isEnabledFor_lazyinit(*args, logger=logger, orig=logger.isEnabledFor, **kwargs): 87 | att = 'lazylogger_init_done' 88 | if not getattr(logger, att, False): # init once, if necessary 89 | setup_logger(logger, level=level) 90 | setattr(logger, att, True) 91 | return orig(*args, **kwargs) 92 | 93 | logger.isEnabledFor = isEnabledFor_lazyinit # type: ignore[method-assign] 94 | return logger # type: ignore[return-value] 95 | 96 | 97 | if __name__ == '__main__': 98 | test() 99 | -------------------------------------------------------------------------------- /src/bleanser/core/ext/sqlite_dumben.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # A tool to 'dumb down' an sqlite database and convert into just data rows 3 | # Basically it strips all 4 | # - views 5 | # - indices 6 | # - triggers 7 | # - constraints 8 | # this is useful if you want to mess/cleanup the database, but don't want to trip over constraints/triggers 9 | # NOTE: handling everything as bytes since not sure I wanna mess with encoding here (esp. row data encoding) 10 | from __future__ import annotations 11 | 12 | import hashlib 13 | import os 14 | import re 15 | import shutil 16 | import sqlite3 17 | import subprocess 18 | import sys 19 | from pathlib import Path 20 | from subprocess import DEVNULL, check_call, check_output 21 | from tempfile import TemporaryDirectory 22 | 23 | Tables = dict[str, dict[str, str]] 24 | 25 | 26 | def _get_tables(db: Path) -> Tables: 27 | res: Tables = {} 28 | with sqlite3.connect(f'file:{db}?immutable=1', uri=True) as conn: 29 | tables = [] 30 | for row in conn.execute('SELECT name, type FROM sqlite_master'): 31 | (table, type_) = row 32 | if type_ in {'index', 'view', 'trigger'}: 33 | # todo log what kind of things we are filtering out? 34 | continue 35 | assert type_ == 'table', (table, type_) 36 | tables.append(table) 37 | 38 | for table in tables: 39 | schema: dict[str, str] = {} 40 | for row in conn.execute(f'PRAGMA table_info({table})'): 41 | col = row[1] 42 | type_ = row[2] 43 | schema[col] = type_ 44 | res[table] = schema 45 | return res 46 | 47 | 48 | def _sqlite(*cmd): 49 | return ['sqlite3', '-bail', *cmd] 50 | 51 | 52 | def _dumben_db(output_db: Path) -> None: 53 | # expected to operate on output_db directly 54 | assert output_db.exists(), output_db 55 | 56 | # hmm. CREATE TABLE syntax seems ridiculously complicated https://www.sqlite.org/lang_createtable.html 57 | # so seems pretty hopeless to sanitize off the constraints purely via sqlite? 58 | # the only easy win is making it single line 59 | # "UPDATE sqlite_master SET sql = replace(sql, char(10), ' ');" 60 | 61 | allow_writable_schema = [ 62 | # seems like some versions of sqlite (e.g. 
on osx don't allow writable schema without this pragma) 63 | # https://github.com/tekartik/sqflite/blob/master/sqflite_common_ffi/doc/custom_pragmas.md?plain=1 64 | "PRAGMA sqflite -- db_config_defensive_off", 65 | "PRAGMA writable_schema=ON", 66 | ] 67 | 68 | # first delete virtual tables -- they might render it impossible to do anything with database at all due to USING 69 | # e.g. fb messenger android msys database has this CREATE VIRTUAL TABLE msys_experiment_cache USING experiment_cache 70 | # either way virtual tables are basically views, no need to keep them 71 | with sqlite3.connect(output_db) as conn: 72 | for cmd in allow_writable_schema: 73 | conn.execute(cmd) 74 | conn.execute('DELETE FROM sqlite_master WHERE sql LIKE "%CREATE VIRTUAL TABLE%"') 75 | conn.close() 76 | 77 | tables = _get_tables(output_db) 78 | 79 | updates = [] 80 | for name, schema in tables.items(): 81 | simple_create = f'CREATE TABLE `{name}` (' + ', '.join(f'`{k}` {v}' for k, v in schema.items()) + ')' 82 | # TODO dunno if worth keeping autoincrement 83 | # without it, all columns with numerical id end up as NULL. although maybe for the best? 84 | upd = f'UPDATE sqlite_master SET sql = "{simple_create}" WHERE name = "{name}";' 85 | updates.append(upd) 86 | 87 | cmds = [ 88 | *allow_writable_schema, 89 | # drop table doesn't work for special sqlite_ tables 90 | # sqlite_sequence is something to do with autoincrement, ends up with some indices noise otherwise 91 | # sqlite_stat{1,2,3,4} is something to do with ANALYZE query 92 | 'DELETE FROM sqlite_master WHERE name = "sqlite_sequence" OR name LIKE "sqlite_stat%";', 93 | # 94 | 'DELETE FROM sqlite_master WHERE type IN ("view", "trigger", "index");', 95 | *updates, 96 | # 97 | # without vacuum, sometimes ended up with "rootpage disagrees with header error", from sqlite code seemed like it had something to do with autovacuum 98 | 'VACUUM', 99 | ] 100 | 101 | # need to set isolation level to None, otherwise VACUUM fails 102 | with sqlite3.connect(output_db, isolation_level=None) as conn: 103 | for cmd in cmds: 104 | conn.execute(cmd) 105 | conn.close() 106 | 107 | # make sure it's not corrupted 108 | # redirect output to DEVNULL, otherwise it's printing "ok" which is a bit annoying 109 | subprocess.check_call(_sqlite(output_db, 'PRAGMA integrity_check;'), stdout=DEVNULL) 110 | 111 | 112 | def run(*, db: Path, output: Path | None, output_as_db: bool) -> None: 113 | if output is not None: 114 | assert not output.exists(), output 115 | 116 | if output is None: 117 | assert output_as_db is False, "can't output to stdout as a binary database" 118 | 119 | if output_as_db: 120 | assert output is not None 121 | 122 | dumben_cache: Path | None = None 123 | _DUMBEN_CACHE_BASE = os.environ.get('SQLITE_DUMBEN_USE_CACHE') 124 | if _DUMBEN_CACHE_BASE is not None: 125 | DUMBEN_CACHE_BASE = Path(_DUMBEN_CACHE_BASE) 126 | DUMBEN_CACHE_BASE.mkdir(parents=True, exist_ok=True) 127 | 128 | fhash = hashlib.md5( 129 | # add code of sqlite_dumben just in case we change logic 130 | db.read_bytes() + Path(__file__).read_bytes() 131 | ).hexdigest() 132 | 133 | dumben_cache = DUMBEN_CACHE_BASE / fhash 134 | if dumben_cache.exists(): 135 | # TODO log it? 
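# cache hit: reuse the previously dumbened copy as-is -- fhash above mixes the input db bytes with this script's own source, so editing sqlite_dumben.py automatically invalidates stale cache entries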
136 | shutil.copy(dumben_cache, output) 137 | return 138 | 139 | # if we output as db, just operate on that target database directly 140 | shutil.copy(db, output) 141 | _dumben_db(output) 142 | 143 | if dumben_cache is not None: 144 | shutil.copy(output, dumben_cache) 145 | return 146 | 147 | # otherwise, need to create a temporary db to operate on -- and after that can dump it to sql 148 | # TODO need to be careful, if there are BLOBs in the database they may be dumped as empty strings 149 | with TemporaryDirectory() as td: 150 | tdir = Path(td) 151 | tdb = Path(tdir) / 'tmp.db' 152 | run(db=db, output=tdb, output_as_db=True) 153 | if output is not None: 154 | with output.open('w') as out: 155 | subprocess.run(_sqlite(tdb, '.dump'), check=True, stdout=out) 156 | else: 157 | subprocess.run(_sqlite(tdb, '.dump'), check=True, stdout=sys.stdout) 158 | 159 | 160 | def test_dumben(tmp_path: Path) -> None: 161 | # TODO would be nice to implement integration style test here straight away 162 | sql = ''' 163 | CREATE TABLE departments 164 | ( department_id INTEGER PRIMARY KEY AUTOINCREMENT, 165 | department_name VARCHAR 166 | ); 167 | 168 | CREATE TABLE employees 169 | ( employee_id INTEGER PRIMARY KEY AUTOINCREMENT, 170 | last_name VARCHAR NOT NULL, 171 | first_name VARCHAR, 172 | department_id INTEGER, 173 | CONSTRAINT fk_departments 174 | FOREIGN KEY (department_id) 175 | REFERENCES departments(department_id) 176 | ON DELETE CASCADE 177 | ); 178 | 179 | INSERT INTO departments VALUES (30, 'HR'); 180 | INSERT INTO departments VALUES (999, 'Sales'); 181 | 182 | INSERT INTO employees VALUES (10000, 'Smith', 'John', 30); 183 | INSERT INTO employees VALUES (10001, 'Anderson', 'Dave', 999); 184 | 185 | CREATE VIEW whatevs AS 186 | SELECT * FROM employees; 187 | ''' 188 | 189 | db = tmp_path / 'tmp.db' 190 | subprocess.run(_sqlite(db), input=sql.encode('utf8'), check=True) 191 | 192 | ## precondition -- check that db has multiline CREATE statements 193 | dbd = check_output(_sqlite(db, '.dump')).decode('utf8').splitlines() 194 | assert 'CREATE TABLE employees' in dbd 195 | assert ' CONSTRAINT fk_departments' in dbd 196 | ## 197 | 198 | ## precondition -- check that with foreign key it will indeed impact other tables 199 | check_call(_sqlite(db, 'PRAGMA foreign_keys=on; DELETE FROM departments WHERE department_id = 30;')) 200 | ecnt = int(check_output(_sqlite(db, 'SELECT COUNT(*) FROM employees')).decode('utf8').strip()) 201 | assert ecnt == 1, ecnt 202 | ## 203 | 204 | db.unlink() 205 | subprocess.run(_sqlite(db), input=sql.encode('utf8'), check=True) 206 | 207 | dumb_sql = tmp_path / 'dumb.sql' 208 | run(db=db, output=dumb_sql, output_as_db=False) 209 | dump = dumb_sql.read_text() 210 | dump_lines = dump.splitlines() 211 | 212 | crt = dump_lines[5] # meh but easiest 213 | # make sure it puts the statement on single line 214 | assert re.fullmatch(r'CREATE TABLE `employees` \(`employee_id` INTEGER,.*`department_id` INTEGER.*\);', crt) 215 | # make sure it strips off constraints 216 | assert 'AUTOINCREMENT' not in crt, crt 217 | assert 'CONSTRAINT' not in crt, crt 218 | 219 | assert 'CREATE VIEW' not in dump 220 | 221 | dumb_db = tmp_path / 'dumb.db' 222 | run(db=db, output=dumb_db, output_as_db=True) 223 | check_call(_sqlite(dumb_db, 'PRAGMA foreign_keys=on; DELETE FROM departments WHERE department_id = 30;')) 224 | ecnt = int(check_output(_sqlite(dumb_db, 'SELECT COUNT(*) FROM employees')).decode('utf8').strip()) 225 | assert ecnt == 2, ecnt 226 | 227 | 228 | def main() -> None: 229 | from argparse 
import ArgumentParser 230 | 231 | p = ArgumentParser() 232 | p.add_argument('--output-as-db', action='store_true') 233 | p.add_argument('--output', type=Path, required=False) 234 | p.add_argument('db', type=Path) 235 | args = p.parse_args() 236 | 237 | run(db=args.db, output=args.output, output_as_db=args.output_as_db) 238 | 239 | 240 | if __name__ == '__main__': 241 | main() 242 | 243 | 244 | # some possible inspiration for testing 245 | # - KoboReader-20211130.sqlite seems to have 246 | # CREATE TRIGGER kobo_plus_asset_cleanup 247 | # - fb messenger android is a good db to test on... lots of weird shit, e.g. transactions 248 | # - bumble android has search_message_removed trigger 249 | # - whatsapp android has loads of weird shit 250 | -------------------------------------------------------------------------------- /src/bleanser/core/main.py: -------------------------------------------------------------------------------- 1 | # not to confuse with __main__.py... meh 2 | from __future__ import annotations 3 | 4 | import os 5 | from glob import glob as do_glob 6 | from pathlib import Path 7 | from typing import cast 8 | 9 | import click 10 | 11 | from .common import Dry, Mode, Move, Remove, logger 12 | from .processor import ( 13 | BaseNormaliser, 14 | apply_instructions, 15 | bleanser_tmp_directory, 16 | compute_instructions, 17 | ) 18 | 19 | 20 | # TODO use context and default_map 21 | # https://click.palletsprojects.com/en/7.x/commands/#overriding-defaults 22 | def main(*, Normaliser: type[BaseNormaliser]) -> None: 23 | # meh.. by default the width is stupid, like 80 chars 24 | context_settings = { 25 | 'max_content_width': 120, 26 | 'show_default': True, 27 | } 28 | 29 | @click.group(context_settings=context_settings) 30 | def call_main() -> None: 31 | pass 32 | 33 | # meh... would be nice to use object but it gets casted to str by click?? 34 | _DEFAULT = '' 35 | 36 | @call_main.command(name='diff', short_help='cleanup two files and diff') 37 | @click.argument('path1', type=str) 38 | @click.argument('path2', default=_DEFAULT) 39 | @click.option('--glob', is_flag=True, default=False, help='Treat the path as glob (in the glob.glob sense)') 40 | @click.option('--vim', is_flag=True, default=False, help='Use vimdiff') 41 | @click.option('--difftool', type=str, help='Custom difftool to use') 42 | @click.option('--from', 'from_', type=int, default=None) 43 | @click.option('--to', type=int, default=None, help='non-inclusive, i.e. [from, to)') 44 | def diff(path1: str, path2: Path, *, glob: bool, from_: int | None, to: int | None, vim: bool, difftool: str) -> None: 45 | path1_: Path 46 | if glob: 47 | assert path2 is cast(Path, _DEFAULT), path2 48 | if to is None: 49 | assert from_ is not None 50 | to = from_ + 2 # by default just compare with the next adjacent element 51 | paths = _get_paths(path=path1, from_=from_, to=to, glob=glob) 52 | else: 53 | assert cast(str, path2) is not _DEFAULT 54 | assert from_ is None 55 | assert to is None 56 | path1_ = Path(path1) 57 | path2 = Path(path2) 58 | paths = [path1_, path2] 59 | 60 | from .processor import compute_diff 61 | 62 | # meh.. 
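# --vim is just shorthand for --difftool=vimdiff; the chosen tool is passed down via the DIFFTOOL environment variable (presumably picked up by compute_diff)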
63 | if vim: 64 | difftool = 'vimdiff' 65 | if difftool is not None: 66 | os.environ['DIFFTOOL'] = difftool 67 | 68 | for line in compute_diff(paths, Normaliser=Normaliser): 69 | print(line) 70 | 71 | @call_main.command(name='normalised', short_help='normalise file and dump to stdout') 72 | @click.argument('path', type=Path) 73 | @click.option('--stdout', is_flag=True, help='print normalised files to stdout instead of printing the path to it') 74 | def normalised(*, path: Path, stdout: bool) -> None: 75 | with bleanser_tmp_directory() as base_tmp_dir: 76 | n = Normaliser(original=path, base_tmp_dir=base_tmp_dir) 77 | with n.do_normalise() as cleaned: 78 | if stdout: 79 | print(cleaned.read_text()) 80 | else: 81 | click.secho(f'You can examine normalised file: {cleaned}', fg='green') 82 | click.pause(info="Press any key when you've finished") 83 | 84 | @call_main.command(name='prune', short_help='process & prune files') 85 | @click.argument('path', type=str) 86 | @click.option('--glob', is_flag=True, default=False, help='Treat the path as glob (in the glob.glob sense)') 87 | @click.option('--sort-by', type=click.Choice(['size', 'name']), default='name', help='how to sort input files') 88 | ## 89 | @click.option('--dry', is_flag=True, default=None, help='Do not prune the input files, just print what would happen after pruning.') 90 | @click.option('--remove', is_flag=True, default=None, help='Prune the input files by REMOVING them (be careful!)') 91 | @click.option('--move', type=Path, help='Prune the input files by MOVING them to the specified path. A bit safer than --remove mode.') 92 | ## 93 | @click.option('--yes', is_flag=True, default=False, help="Do not prompt before pruning files (useful for cron etc)") 94 | @click.option( 95 | '--threads', 96 | type=int, 97 | is_flag=False, 98 | flag_value=0, 99 | default=None, 100 | help="Number of threads (processes) to use. Without the flag won't use any, with the flag will try using all available, can also take a specific value. Passed down to PoolExecutor.", 101 | ) 102 | ## 103 | @click.option('--from', 'from_', type=int, default=None) 104 | @click.option('--to', type=int, default=None) 105 | ## 106 | @click.option('--multiway', is_flag=True, default=None, help='force "multiway" cleanup') 107 | @click.option('--prune-dominated', is_flag=True, default=None) 108 | def prune( 109 | *, 110 | path: str, 111 | sort_by: str, 112 | glob: bool, 113 | dry: bool, 114 | move: Path | None, 115 | remove: bool, 116 | threads: int | None, 117 | from_: int | None, 118 | to: int | None, 119 | multiway: bool | None, 120 | prune_dominated: bool | None, 121 | yes: bool, 122 | ) -> None: 123 | modes: list[Mode] = [] 124 | if dry is True: 125 | modes.append(Dry()) 126 | if move is not None: 127 | modes.append(Move(path=move)) 128 | if remove is True: 129 | modes.append(Remove()) 130 | if len(modes) == 0: 131 | modes.append(Dry()) 132 | assert len(modes) == 1, f'please specify exactly one of modes (got {modes})' 133 | [mode] = modes 134 | # TODO eh, would be nice to use some package for mutually exclusive args.. 135 | # e.g. 
https://stackoverflow.com/questions/37310718/mutually-exclusive-option-groups-in-python-click 136 | 137 | paths = _get_paths(path=path, glob=glob, from_=from_, to=to, sort_by=sort_by) 138 | 139 | if multiway is not None: 140 | Normaliser.MULTIWAY = multiway 141 | if prune_dominated is not None: 142 | Normaliser.PRUNE_DOMINATED = prune_dominated 143 | 144 | instructions = list(compute_instructions(paths, Normaliser=Normaliser, threads=threads)) 145 | # NOTE: for now, forcing list() to make sure instructions compute before path check 146 | # not strictly necessary 147 | for p in paths: 148 | # just in case, to make sure no one messed with files in the meantime 149 | assert p.exists(), p 150 | 151 | need_confirm = not yes 152 | apply_instructions(instructions, mode=mode, need_confirm=need_confirm) 153 | 154 | call_main() 155 | 156 | 157 | def _get_paths(*, path: str, from_: int | None, to: int | None, sort_by: str = "name", glob: bool = False) -> list[Path]: 158 | if not glob: 159 | pp = Path(path) 160 | assert pp.is_dir(), pp 161 | path = str(pp) + os.sep + '**' 162 | paths = [Path(p) for p in do_glob(path, recursive=True)] # noqa: PTH207 163 | paths = [p for p in paths if p.is_file()] 164 | if sort_by == "name": 165 | # assumes sort order is same as date order? guess it's reasonable 166 | paths = sorted(paths) 167 | else: 168 | paths = sorted(paths, key=lambda s: s.stat().st_size) 169 | 170 | if from_ is None: 171 | from_ = 0 172 | if to is None: 173 | to = len(paths) 174 | paths = paths[from_:to] 175 | assert len(paths) > 0 176 | 177 | logger.info('processing %d files (%s ... %s)', len(paths), paths[0], paths[-1]) 178 | return paths 179 | -------------------------------------------------------------------------------- /src/bleanser/core/modules/extract.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Iterator 2 | from contextlib import contextmanager 3 | from pathlib import Path 4 | from typing import Any 5 | 6 | from bleanser.core.processor import ( 7 | BaseNormaliser, 8 | Normalised, 9 | sort_file, 10 | unique_file_in_tempdir, 11 | ) 12 | 13 | 14 | class ExtractObjectsNormaliser(BaseNormaliser): 15 | """ 16 | This is meant to be overridden by a subclass 17 | 18 | extract_objects receives an input file, and should yield data/objects that when converted 19 | to a string, produces some comparable data/object to the normalised/cleaned output file 20 | 21 | possible things this could return is a unique key/id, or a tuple of (key, data), or a 22 | namedtuple/dataclass 23 | 24 | newlines are stripped from the string, so lines can be compared/diffed properly 25 | 26 | Its possible you could use a library or code from https://github.com/karlicoss/HPI 27 | in extract_objects, to use the DAL itself to parse the file https://beepb00p.xyz/exports.html#dal 28 | """ 29 | 30 | def extract_objects(self, path: Path) -> Iterator[Any]: 31 | raise NotImplementedError 32 | # when you subclass, you should do something like 33 | # with path.open('r') as f: 34 | # for object in some_library(f): 35 | # yield (object.id, object.key) 36 | 37 | def _emit_history(self, upath: Path, cleaned) -> None: 38 | """ 39 | calls extract_objects to extract lines from the unpacked path 40 | subclasses should override that to yield some kind of object 41 | out to here 42 | """ 43 | with cleaned.open("w") as f: 44 | for line in self.extract_objects(upath): 45 | # newlines may interfere with the diffing, use the repr of the string 46 | f.write(repr(str(line))) 47 | 
f.write("\n") 48 | 49 | @contextmanager 50 | def normalise(self, *, path: Path) -> Iterator[Normalised]: 51 | cleaned = unique_file_in_tempdir(input_filepath=path, dir=self.tmp_dir, suffix=path.suffix) 52 | 53 | self._emit_history(path, cleaned) 54 | sort_file(cleaned) 55 | 56 | yield cleaned 57 | 58 | 59 | if __name__ == "__main__": 60 | ExtractObjectsNormaliser.main() 61 | -------------------------------------------------------------------------------- /src/bleanser/core/modules/json.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Iterator 2 | from contextlib import contextmanager 3 | from pathlib import Path 4 | 5 | import orjson 6 | 7 | from bleanser.core.processor import ( 8 | BaseNormaliser, 9 | Normalised, 10 | sort_file, 11 | unique_file_in_tempdir, 12 | ) 13 | 14 | # imports for convenience -- they are used in other modules 15 | from bleanser.core.utils import Json, delkeys, patch_atoms # noqa: F401 16 | 17 | 18 | class JsonNormaliser(BaseNormaliser): 19 | PRUNE_DOMINATED = False 20 | 21 | def cleanup(self, j: Json) -> Json: 22 | ''' 23 | subclasses should override this function, to do the actual cleanup 24 | 25 | cleanup in this context means removing extra JSON keys which are not 26 | needed to produce a normalised representation for a file 27 | ''' 28 | return j 29 | 30 | @contextmanager 31 | def normalise(self, *, path: Path) -> Iterator[Normalised]: 32 | # TODO maybe, later implement some sort of class variable instead of hardcoding 33 | # note: deliberately keeping mime check inside do_cleanup, since it's executed in a parallel process 34 | # otherwise it essentially blocks waiting for all mimes to compute.. 35 | # TODO crap. annoying, sometimes mime determines as text/plain for no reason 36 | # I guess doesn't matter as much, json.loads is the ultimate check it's ineed json 37 | # mp = mime(upath) 38 | # assert mp in { 39 | # 'application/json', 40 | # }, mp 41 | 42 | j = orjson.loads(path.read_text()) 43 | j = self.cleanup(j) 44 | 45 | # create a tempfile to write flattened data to 46 | cleaned = unique_file_in_tempdir(input_filepath=path, dir=self.tmp_dir, suffix='.json') 47 | 48 | with cleaned.open('w') as fo: 49 | if isinstance(j, list): 50 | j = {'': j} # meh 51 | 52 | assert isinstance(j, dict), j 53 | for k, v in j.items(): 54 | if not isinstance(v, list): 55 | # something like 'profile' data in hypothesis could be a dict 56 | # something like 'notes' in rescuetime could be a scalar (str) 57 | v = [v] # meh 58 | assert isinstance(v, list), (k, v) 59 | for i in v: 60 | print(f'{k} ::: {orjson.dumps(i, option=orjson.OPT_SORT_KEYS).decode("utf8")}', file=fo) 61 | 62 | # todo meh... see Fileset._union 63 | # this gives it a bit of a speedup, just calls out to unix sort 64 | sort_file(cleaned) 65 | 66 | yield cleaned 67 | 68 | 69 | if __name__ == '__main__': 70 | JsonNormaliser.main() 71 | 72 | 73 | # TODO actually implement some artificial json test 74 | # 75 | def test_nonidempotence(tmp_path: Path) -> None: 76 | from bleanser.tests.common import actions, hack_attribute 77 | 78 | ''' 79 | Just demonstrates that multiway processing might be 80 | It's probably going to be very hard to fix, likely finding 'minimal' cover (at least in terms of partial ordering) is NP hard? 
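Concretely, for the inputs below ([], [a], [a, b], [b, c], [a, b, c]) the first pass keeps 0/2/4: 2.json survives only because item 'a' rolls over out of the later files. Pruning the survivors again then drops 2.json (it is fully contained in 4.json), so the result depends on how many passes are run -- i.e. the operation is not idempotent.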
81 | ''' 82 | 83 | # fmt: off 84 | sets = [ 85 | [], 86 | ['a'], 87 | ['a', 'b'], 88 | [ 'b', 'c'], 89 | ['a', 'b', 'c'], 90 | ] 91 | # fmt: on 92 | for i, s in enumerate(sets): 93 | p = tmp_path / f'{i}.json' 94 | p.write_text(orjson.dumps(s).decode('utf8')) 95 | 96 | with hack_attribute(JsonNormaliser, 'MULTIWAY', value=True), hack_attribute(JsonNormaliser, 'PRUNE_DOMINATED', value=True): 97 | paths = sorted(tmp_path.glob('*.json')) 98 | res = actions(paths=paths, Normaliser=JsonNormaliser) 99 | 100 | assert [p.name for p in res.remaining] == [ 101 | '0.json', # keeping as boundary 102 | '2.json', # keeping because item a has rolled over 103 | '4.json', # keeping as boundary 104 | ] 105 | 106 | paths = list(res.remaining) 107 | res = actions(paths=paths, Normaliser=JsonNormaliser) 108 | assert [p.name for p in res.remaining] == [ 109 | '0.json', 110 | # note: 2.json is removed because fully contained in 4.json 111 | '4.json', 112 | ] 113 | -------------------------------------------------------------------------------- /src/bleanser/core/modules/tests/sqlite.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import sqlite3 4 | from pathlib import Path 5 | 6 | from ...common import Keep, Prune 7 | from ...processor import compute_groups, groups_to_instructions 8 | from ..sqlite import SqliteNormaliser 9 | 10 | 11 | def _make_db(out: Path, values: list[bytes], *, bad: bool = False) -> Path: 12 | with sqlite3.connect(out) as conn: 13 | conn.execute('CREATE TABLE `test` (bbb BLOB)') 14 | conn.executemany( 15 | 'INSERT INTO `test` VALUES (?)', 16 | [(v,) for v in values], 17 | ) 18 | if bad: 19 | # the only way I figured to actually force BLOB column to contain text values 20 | conn.execute('CREATE TABLE `bad` (bbb BLOB)') 21 | conn.execute('INSERT INTO `bad` SELECT cast(bbb AS TEXT) FROM `test`') 22 | conn.execute('DROP TABLE `test`') 23 | conn.execute('ALTER TABLE `bad` RENAME TO `test`') 24 | conn.close() 25 | return out 26 | 27 | 28 | def test_sqlite_blobs_good(tmp_path: Path) -> None: 29 | """ 30 | In this case we have blob data in BLOB column -- so cleanup should work as expected 31 | """ 32 | 33 | class TestNormaliser(SqliteNormaliser): 34 | MULTIWAY = False 35 | PRUNE_DOMINATED = True 36 | 37 | db0 = _make_db(tmp_path / '0.db', [b'\x00\x01']) 38 | db1 = _make_db(tmp_path / '1.db', [b'\x00\x01', b'\x01\x02']) 39 | db2 = _make_db(tmp_path / '2.db', [b'\x00\x01', b'\x01\x02', b'\x02\x03']) 40 | db3 = _make_db(tmp_path / '3.db', [b'\x00\x01', b'\x01\x02', b'\x02\x03', b'\x03\x04']) 41 | dbs = [db0, db1, db2, db3] 42 | 43 | groups = list(compute_groups(dbs, Normaliser=TestNormaliser)) 44 | instructions = list(groups_to_instructions(groups)) 45 | 46 | assert [type(i) for i in instructions] == [ 47 | Keep, 48 | Prune, 49 | Prune, 50 | Keep, 51 | ] 52 | 53 | 54 | def test_sqlite_blobs_bad(tmp_path: Path) -> None: 55 | """ 56 | In this case we have text (!) data in BLOB column. 
57 | This will cause errors during cleanup so we'll keep all inputs (even though dbs are identical here) 58 | """ 59 | 60 | class TestNormaliser(SqliteNormaliser): 61 | MULTIWAY = False 62 | PRUNE_DOMINATED = True 63 | 64 | db0 = _make_db(tmp_path / '0.db', [b'\x00', b'\x01', b'\x02'], bad=True) 65 | db1 = _make_db(tmp_path / '1.db', [b'\x00', b'\x01', b'\x02'], bad=True) 66 | db2 = _make_db(tmp_path / '2.db', [b'\x00', b'\x01', b'\x02'], bad=True) 67 | db3 = _make_db(tmp_path / '3.db', [b'\x00', b'\x01', b'\x02'], bad=True) 68 | dbs = [db0, db1, db2, db3] 69 | 70 | groups = list(compute_groups(dbs, Normaliser=TestNormaliser)) 71 | instructions = list(groups_to_instructions(groups)) 72 | 73 | assert [type(i) for i in instructions] == [ 74 | Keep, 75 | Keep, 76 | Keep, 77 | Keep, 78 | ] 79 | 80 | 81 | def test_sqlite_blobs_allowed(tmp_path: Path) -> None: 82 | class TestNormaliser(SqliteNormaliser): 83 | MULTIWAY = False 84 | PRUNE_DOMINATED = True 85 | 86 | ALLOWED_BLOBS = {('test', 'bbb')} 87 | 88 | db0 = _make_db(tmp_path / '0.db', [b'\x00\x01'], bad=True) 89 | db1 = _make_db(tmp_path / '1.db', [b'\x00\x02'], bad=True) 90 | db2 = _make_db(tmp_path / '2.db', [b'\x00\x03'], bad=True) 91 | db3 = _make_db(tmp_path / '3.db', [b'\x00\x04'], bad=True) 92 | dbs = [db0, db1, db2, db3] 93 | 94 | groups = list(compute_groups(dbs, Normaliser=TestNormaliser)) 95 | instructions = list(groups_to_instructions(groups)) 96 | 97 | # this kinda demonstrates what happens if we're not careful and mess up ALLOWED_BLOBS 98 | # sqlite3 will end up dumping supposedly blob data as empty strings 99 | # and this will clean up files that shouldn't be cleaned up (in the files above all data is different!) 100 | assert [type(i) for i in instructions] == [ 101 | Keep, 102 | Prune, 103 | Prune, 104 | Keep, 105 | ] 106 | -------------------------------------------------------------------------------- /src/bleanser/core/modules/xml.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Iterator 2 | from contextlib import contextmanager 3 | from pathlib import Path 4 | 5 | from lxml import etree 6 | 7 | from bleanser.core.processor import ( 8 | BaseNormaliser, 9 | Normalised, 10 | sort_file, 11 | unique_file_in_tempdir, 12 | ) 13 | 14 | 15 | class Normaliser(BaseNormaliser): 16 | PRUNE_DOMINATED = False 17 | 18 | def cleanup(self, t: etree._Element) -> etree._Element: 19 | return t 20 | 21 | @contextmanager 22 | def normalise(self, *, path: Path) -> Iterator[Normalised]: 23 | # todo not sure if need to release some resources here... 24 | parser = etree.XMLParser(remove_blank_text=True) 25 | # TODO we seem to lose comments here... meh 26 | et = etree.fromstring(path.read_bytes(), parser=parser) 27 | # restore newlines just for the top level 28 | assert et.text is None, et.text 29 | et.text = '\n' 30 | for c in et: 31 | assert c.tail is None, c.tail 32 | c.tail = '\n' 33 | 34 | et = self.cleanup(et) 35 | 36 | cleaned = unique_file_in_tempdir(input_filepath=path, dir=self.tmp_dir, suffix='.xml') 37 | cleaned.write_text(etree.tostring(et, encoding="unicode")) 38 | 39 | # TODO what is the assumption about shape? 40 | # either list of xml entries 41 | # or top-level thing with children 42 | 43 | # todo meh... 
see Fileset._union 44 | # this gives it a bit of a speedup 45 | sort_file(cleaned) 46 | yield cleaned 47 | 48 | 49 | if __name__ == '__main__': 50 | Normaliser.main() 51 | 52 | 53 | def test_xml_simple(tmp_path: Path) -> None: 54 | from bleanser.tests.common import actions, hack_attribute 55 | 56 | f1 = tmp_path / 'f1' 57 | f2 = tmp_path / 'f2' 58 | f3 = tmp_path / 'f3' 59 | f4 = tmp_path / 'f4' 60 | 61 | # make sure it handles 62 | f1.write_text(''' 63 | 64 | text1 65 | text2 66 | 67 | ''') 68 | 69 | f2.write_text(''' 70 | 71 | text2 72 | text3 73 | text4 74 | 75 | ''') 76 | 77 | f3.write_text(''' 78 | 79 | text4 80 | text5 81 | 82 | ''') 83 | 84 | # note: we don't care about order 85 | f4.write_text(''' 86 | 87 | text5 88 | text4 89 | text3 90 | text2 91 | 92 | ''') 93 | 94 | paths123 = [f1, f2, f3] 95 | with hack_attribute(Normaliser, 'MULTIWAY', value=True), hack_attribute(Normaliser, 'PRUNE_DOMINATED', value=True): 96 | res123 = actions(paths=paths123, Normaliser=Normaliser) 97 | assert res123.remaining == paths123 98 | 99 | paths124 = [f1, f2, f4] 100 | with hack_attribute(Normaliser, 'MULTIWAY', value=True), hack_attribute(Normaliser, 'PRUNE_DOMINATED', value=True): 101 | res124 = actions(paths=paths124, Normaliser=Normaliser) 102 | assert res124.remaining == [ 103 | f1, 104 | f4, 105 | ] 106 | 107 | 108 | def test_xml_nested(tmp_path: Path) -> None: 109 | from bleanser.tests.common import actions, hack_attribute 110 | 111 | f1 = tmp_path / 'f1' 112 | f2 = tmp_path / 'f2' 113 | f3 = tmp_path / 'f3' 114 | # make sure we don't just sort all lines and treat them as set 115 | # this could happen if you just pretty print the whole structure and diff 116 | # TODO: tbh this is also a good test for 'simple' handling 117 | f1.write_text(''' 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | ''') 133 | f2.write_text(''' 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | ''') 149 | f3.write_text(''' 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | ''') 165 | 166 | paths = [f1, f2, f3] 167 | with hack_attribute(Normaliser, 'MULTIWAY', value=True), hack_attribute(Normaliser, 'PRUNE_DOMINATED', value=True): 168 | res = actions(paths=paths, Normaliser=Normaliser) 169 | assert res.remaining == [ 170 | f1, 171 | f2, 172 | f3, 173 | ] 174 | -------------------------------------------------------------------------------- /src/bleanser/core/sqlite.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | from bleanser.core.modules.sqlite import * 4 | 5 | warnings.warn( 6 | "Module 'bleanser.core.sqlite' is deprecated. Use 'bleanser.core.modules.sqlite' instead.", 7 | DeprecationWarning, 8 | ) 9 | 10 | 11 | if __name__ == '__main__': 12 | SqliteNormaliser.main() # noqa: F405 13 | -------------------------------------------------------------------------------- /src/bleanser/core/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING 4 | 5 | if not TYPE_CHECKING: 6 | from .compat import assert_never # noqa: F401 7 | 8 | 9 | from pathlib import Path 10 | 11 | 12 | def total_dir_size(d: Path) -> int: 13 | return sum(f.stat().st_size for f in d.glob('**/*') if f.is_file()) 14 | 15 | 16 | import sys 17 | 18 | under_pytest = 'pytest' in sys.modules 19 | ### ugh. pretty horrible... 
but 20 | # 'PYTEST_CURRENT_TEST' in os.environ 21 | # doesn't work before we're actually inside the test.. and it might be late for decorators, for instance 22 | ### 23 | 24 | 25 | import time 26 | 27 | 28 | class Timer: 29 | def __init__(self, *tags): 30 | self.tags = tags 31 | 32 | def __enter__(self): 33 | self.start = time.time() 34 | return self 35 | 36 | def __exit__(self, *args): 37 | self.end = time.time() 38 | delta = self.end - self.start 39 | print(f"{self.tags} TIME TAKEN: {delta:.1f}", file=sys.stderr) 40 | 41 | 42 | from functools import wraps 43 | 44 | 45 | def timing(f): 46 | @wraps(f) 47 | def wrapped(*args, **kwargs): 48 | with Timer(f.__name__): 49 | return f(*args, **kwargs) 50 | 51 | return wrapped 52 | 53 | 54 | # make it lazy, otherwise it might crash on module import (e.g. on Windows) 55 | # ideally would be nice to fix it properly https://github.com/ahupp/python-magic#windows 56 | import warnings 57 | from functools import lru_cache 58 | from typing import Callable 59 | 60 | 61 | @lru_cache(1) 62 | def _magic() -> Callable[[Path], str | None]: 63 | try: 64 | import magic 65 | except Exception as e: 66 | # logger.exception(e) 67 | defensive_msg: str | None = None 68 | if isinstance(e, ModuleNotFoundError) and e.name == 'magic': 69 | defensive_msg = "python-magic is not detected. It's recommended for better file type detection (pip3 install --user python-magic). See https://github.com/ahupp/python-magic#installation" 70 | elif isinstance(e, ImportError): 71 | emsg = getattr(e, 'msg', '') # make mypy happy 72 | if 'failed to find libmagic' in emsg: # probably the actual library is missing?... 73 | defensive_msg = "couldn't import magic. See https://github.com/ahupp/python-magic#installation" 74 | if defensive_msg is not None: 75 | warnings.warn(defensive_msg) 76 | return lambda path: None # stub # noqa: ARG005 77 | else: 78 | raise e 79 | else: 80 | mm = magic.Magic(mime=True) 81 | return lambda path: mm.from_file(str(path)) 82 | 83 | 84 | def mime(path: Path) -> str | None: 85 | # next, libmagic, it might access the file, so a bit slower 86 | magic = _magic() 87 | return magic(path) 88 | 89 | 90 | from typing import Any 91 | 92 | Json = Any 93 | 94 | 95 | from collections.abc import Collection 96 | 97 | 98 | def delkeys(j: Json, *, keys: str | Collection[str]) -> None: 99 | if isinstance(keys, str): 100 | keys = {keys} # meh 101 | 102 | # todo if primitive, don't do anything 103 | if isinstance(j, (int, float, bool, type(None), str)): 104 | return 105 | elif isinstance(j, list): 106 | for v in j: 107 | delkeys(v, keys=keys) 108 | elif isinstance(j, dict): 109 | for key in keys: 110 | j.pop(key, None) 111 | for v in j.values(): 112 | delkeys(v, keys=keys) 113 | else: 114 | raise RuntimeError(type(j)) 115 | 116 | 117 | def patch_atoms(j: Json, *, patch): 118 | if isinstance(j, (int, float, bool, type(None), str)): 119 | return patch(j) 120 | elif isinstance(j, list): 121 | for i in range(len(j)): 122 | j[i] = patch_atoms(j[i], patch=patch) 123 | return j 124 | elif isinstance(j, dict): 125 | for k in list(j.keys()): 126 | j[k] = patch_atoms(j[k], patch=patch) 127 | return j 128 | else: 129 | raise RuntimeError(type(j)) 130 | -------------------------------------------------------------------------------- /src/bleanser/modules/antennapod_android.py: -------------------------------------------------------------------------------- 1 | from bleanser.core.modules.sqlite import SqliteNormaliser, Tool 2 | 3 | 4 | class Normaliser(SqliteNormaliser): 5 | MULTIWAY = True 6 | 
PRUNE_DOMINATED = True 7 | 8 | def check(self, c) -> None: 9 | tables = Tool(c).get_tables() 10 | assert 'Feeds' in tables, tables 11 | eps = tables['FeedItems'] 12 | assert 'link' in eps 13 | assert 'read' in eps 14 | 15 | # should be safe to use multiway because of these vvv 16 | media = tables['FeedMedia'] 17 | assert 'played_duration' in media 18 | assert 'last_played_time' in media 19 | 20 | def cleanup(self, c) -> None: 21 | self.check(c) 22 | 23 | t = Tool(c) 24 | # often changing, no point keeping 25 | t.drop_cols( 26 | table='Feeds', 27 | cols=[ 28 | 'last_update', 29 | 'last_update_failed', 30 | 'image_url', # volatile 31 | 'minimal_duration_filter', 32 | ], 33 | ) 34 | 35 | t.drop_cols( 36 | table='FeedMedia', 37 | cols=[ 38 | 'download_url', # sometimes change, especially tracking links -- probs not worth keeping anyway 39 | 'filesize', # no idea why would it change, but it does sometimes 40 | ], 41 | ) 42 | 43 | t.drop_cols( 44 | table='FeedItems', 45 | cols=[ 46 | 'title', # useful feed, but volatile so best to ignore 47 | 'content_encoded', # no idea what is it but volatile 48 | 'description', # often changing, no need to keep 49 | 'image_url', # volatile 50 | ], 51 | ) 52 | 53 | t.drop('Queue') 54 | 55 | 56 | if __name__ == '__main__': 57 | Normaliser.main() 58 | -------------------------------------------------------------------------------- /src/bleanser/modules/binary.py: -------------------------------------------------------------------------------- 1 | """ 2 | Format-agnostic, clean up as literal file diffs 3 | """ 4 | # TODO probably should give it a better name... 5 | 6 | from bleanser.core.processor import BaseNormaliser 7 | 8 | 9 | class Normaliser(BaseNormaliser): 10 | # TODO need to be careful about using it... 11 | # for non-structured data might mess it up by accident if it's weirdly ordered 12 | pass 13 | 14 | 15 | if __name__ == '__main__': 16 | Normaliser.main() 17 | -------------------------------------------------------------------------------- /src/bleanser/modules/bluemaestro.py: -------------------------------------------------------------------------------- 1 | from bleanser.core.modules.sqlite import SqliteNormaliser, Tool 2 | 3 | 4 | class Normaliser(SqliteNormaliser): 5 | # multiway is useful at the very least for old db format, it only kept rolling 6K points or something in the db 6 | MULTIWAY = True 7 | PRUNE_DOMINATED = True 8 | 9 | def check(self, c) -> None: 10 | tool = Tool(c) 11 | tables = tool.get_tables() 12 | info_tables = [x for x in tables if x.endswith('_info')] 13 | if len(info_tables) == 0: 14 | # old db format 15 | data = tables['data'] 16 | assert 'Time' in data, data 17 | assert 'Temperature' in data, data 18 | else: 19 | # TODO hmm how to add some proper check here without too much duplication? 20 | pass 21 | 22 | def cleanup(self, c) -> None: 23 | self.check(c) 24 | tool = Tool(c) 25 | 26 | tables = tool.get_tables() 27 | info_tables = [x for x in tables if x.endswith('_info')] 28 | if len(info_tables) == 0: 29 | # old db format 30 | # log_index doesn't correspond to anything real, there are timestamps 31 | tool.drop_cols(table='data', cols=['log_index']) 32 | # changes every time db is exported, no point 33 | tool.drop_cols(table='info', cols=['last_download', 'last_pointer']) 34 | else: 35 | for info_table in info_tables: 36 | # possible to have multiple info tables, e.g. 
if you have multiple devices 37 | 38 | device, _ = info_table.split('_') 39 | 40 | ## get rid of downloadUnix -- it's changing after export and redundant info 41 | [[ut]] = list(c.execute(f'SELECT downloadUnix FROM {device}_info')) 42 | last_logs = [t for t in tables if t.endswith('log')] 43 | if len(last_logs) == 0: 44 | # seems like no data yet 45 | return 46 | last_log = max(last_logs) 47 | if last_log == f'{device}_{ut}_log': 48 | # TODO annoying that it needs to be defensive... 49 | # for some dbs it actually does happen, e.g. around 20211102085345 50 | tool.drop_cols(table=f'{device}_info', cols=['downloadUnix']) 51 | 52 | 53 | if __name__ == '__main__': 54 | Normaliser.main() 55 | 56 | 57 | # TODO think I've had jdoe or something with example databases.. 58 | def test_bluemaestro() -> None: 59 | from bleanser.tests.common import skip_if_no_data 60 | 61 | skip_if_no_data() 62 | 63 | from bleanser.tests.common import TESTDATA, actions2 64 | 65 | res = actions2(path=TESTDATA / 'bluemaestro', rglob='**/*.db*', Normaliser=Normaliser) 66 | 67 | assert res.remaining == [ 68 | '20180720.db', 69 | # '20180724.db', # move 70 | '20180728.db', 71 | # '20180730.db', # move 72 | '20180731.db', 73 | 74 | '20190723100032.db', # keep, everything changed 75 | # TODO need to investigate, some values have changed a bit, like 1st digit after decimal point 76 | # even timestamps changed sometimes (e.g. just last second) 77 | # hpi bluemaestro module has something for handling this, I think 78 | '20190724101707.db', 79 | # same as above 80 | '20190727104723.db', 81 | 82 | '20200208225936.db', # keep, everything changed (several months diff) 83 | # '20201209083427/bmgateway.db', # move, completely dominated by the next 84 | # '20210131102917/bmgateway.db', # move, completely dominated by the next 85 | # '20210207183947/bmgateway.db', # move, completely dominated by the next 86 | '20210216211844/bmgateway.db', # keep, errored because couldn't find last _log item 87 | '20211103234924/bmgateway.db', # same, previous errored 88 | '20211106191208/bmgateway.db', 89 | ] # fmt: skip 90 | -------------------------------------------------------------------------------- /src/bleanser/modules/bumble_android.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from bleanser.core.modules.json import delkeys 4 | from bleanser.core.modules.sqlite import SqliteNormaliser, Tool 5 | 6 | 7 | class Normaliser(SqliteNormaliser): 8 | MULTIWAY = True 9 | PRUNE_DOMINATED = True 10 | 11 | def check(self, c) -> None: 12 | tables = Tool(c).get_tables() 13 | 14 | # fmt: off 15 | message = tables['message'] 16 | conv_info = tables['conversation_info'] 17 | 18 | assert 'id' in message 19 | assert 'conversation_id' in message 20 | assert 'payload' in message 21 | assert 'created_timestamp' in message 22 | 23 | assert 'user_id' in conv_info 24 | assert 'user_name' in conv_info 25 | # fmt: on 26 | 27 | def cleanup(self, c) -> None: 28 | self.check(c) 29 | 30 | t = Tool(c) 31 | t.drop('search_fts_segments') 32 | t.drop('search_fts_segdir') 33 | t.drop('search_fts_docsize') 34 | t.drop('search_fts_content') 35 | t.drop('search_fts_stat') 36 | t.drop('message_read_info') 37 | 38 | t.drop_cols('conversation_info', cols=[ 39 | 'user_image_url', 40 | 'photo_url', 41 | 'last_seen_message_id', 42 | 'covid_preferences', 43 | 44 | 'chat_input_settings', 45 | 46 | 'match_status', # ?? either NULL or -1 or some weird hash thing?? 
47 | 48 | 'sending_multimedia_enabled', 49 | 'disabled_multimedia_explanation', 50 | ]) # fmt: skip 51 | # for extract: photo_id can be a bit volatile 52 | 53 | # mm, user photos are a bit annoying, urls are flaky 54 | def _cleanup_jsons(s): 55 | if s is None: 56 | return None 57 | j = json.loads(s) 58 | delkeys( 59 | j, 60 | keys=[ 61 | 'url', # for conversation_info.user_photos & message.payload 62 | 'expiration_timestamp', # for message.payload 63 | ], 64 | ) 65 | return json.dumps(j) 66 | 67 | c.create_function("CLEANUP_JSONS", 1, _cleanup_jsons) 68 | list(c.execute('UPDATE conversation_info SET user_photos = CLEANUP_JSONS(user_photos)')) 69 | list(c.execute('UPDATE message SET payload = CLEANUP_JSONS(payload)')) 70 | 71 | 72 | if __name__ == '__main__': 73 | Normaliser.main() 74 | -------------------------------------------------------------------------------- /src/bleanser/modules/chrome.py: -------------------------------------------------------------------------------- 1 | from bleanser.core.modules.sqlite import SqliteNormaliser, Tool 2 | 3 | 4 | class Normaliser(SqliteNormaliser): 5 | MULTIWAY = True 6 | PRUNE_DOMINATED = True 7 | 8 | ALLOWED_BLOBS = { 9 | ('downloads', 'hash'), 10 | ('typed_url_sync_metadata', 'value'), 11 | } 12 | 13 | def check(self, c) -> None: 14 | tables = Tool(c).get_tables() 15 | # fmt: off 16 | v = tables['visits'] 17 | assert 'visit_time' in v, v 18 | assert 'url' in v, v # note: url is an int id 19 | 20 | u = tables['urls'] 21 | assert 'url' in u, u 22 | assert 'title' in u, u 23 | # fmt: on 24 | 25 | def cleanup(self, c) -> None: 26 | self.check(c) 27 | 28 | t = Tool(c) 29 | t.drop_cols( 30 | 'urls', 31 | cols=[ 32 | # TODO similar issue to firefox -- titles sometimes jump because of notifications (e.g. twitter) 33 | # maybe could sanitize it? 34 | # cleans up like 15% databases if I wipe it completely? 35 | # the annoying thing is that sqlite doesn't have support for regex... 36 | # 'title', 37 | # 38 | # aggregates, no need for them 39 | 'visit_count', 40 | 'typed_count', 41 | 'last_visit_time', 42 | ], 43 | ) 44 | t.drop_cols( 45 | 'segment_usage', 46 | cols=['visit_count'], 47 | ) 48 | c.execute('DELETE FROM meta WHERE key IN ("typed_url_model_type_state", "early_expiration_threshold")') 49 | 50 | # hmm, not sure -- it might change? 
51 | # cleans up about 10% files 52 | # t.drop_cols( 53 | # 'visits', 54 | # cols=['visit_duration'], 55 | # ) 56 | 57 | 58 | if __name__ == '__main__': 59 | Normaliser.main() 60 | -------------------------------------------------------------------------------- /src/bleanser/modules/firefox.py: -------------------------------------------------------------------------------- 1 | from sqlite3 import Connection 2 | 3 | from bleanser.core.modules.sqlite import SqliteNormaliser, Tool 4 | 5 | 6 | class Normaliser(SqliteNormaliser): 7 | MULTIWAY = True 8 | PRUNE_DOMINATED = True 9 | 10 | def is_old_firefox(self, c: Connection) -> bool: 11 | tool = Tool(c) 12 | tables = tool.get_tables() 13 | if 'bookmarks' in tables: 14 | return True 15 | if 'moz_bookmarks' in tables: 16 | return False 17 | raise RuntimeError(f"Unexpected schema {tables}") 18 | 19 | def check(self, c: Connection) -> None: 20 | tool = Tool(c) 21 | tables = tool.get_tables() 22 | # fmt: off 23 | if self.is_old_firefox(c): 24 | v = tables['visits'] 25 | assert 'history_guid' in v, v 26 | assert 'date' in v, v 27 | 28 | h = tables['history'] 29 | assert 'url' in h, h 30 | assert 'guid' in h, h 31 | else: 32 | b = tables['moz_bookmarks'] 33 | assert 'dateAdded' in b, b 34 | assert 'guid' in b, b 35 | 36 | h = tables['moz_historyvisits'] 37 | assert 'place_id' in h, h 38 | assert 'visit_date' in h, h 39 | 40 | p = tables['moz_places'] 41 | assert 'url' in p, p 42 | assert 'id' in p, p 43 | # fmt: on 44 | 45 | def cleanup(self, c: Connection) -> None: 46 | self.check(c) 47 | 48 | if self.is_old_firefox(c): 49 | self.cleanup_old(c) 50 | return 51 | 52 | # otherwise, assume new db format 53 | 54 | tool = Tool(c) 55 | [(visits_before,)] = c.execute('SELECT count(*) FROM moz_historyvisits') 56 | tool.drop_cols( 57 | table='moz_places', 58 | cols=[ 59 | # aggregates, changing all the time 60 | 'frecency', 61 | 'recalc_frecency', 62 | 'alt_frecency', 63 | 'recalc_alt_frecency', 64 | 'last_visit_date', 65 | 'visit_count', 66 | # ugh... sometimes changes because of notifications, e.g. twitter/youtube?, or during page load 67 | 'hidden', 68 | 'typed', 69 | 'title', 70 | 'description', 71 | 'preview_image_url', 72 | 'foreign_count', # just some internal refcount thing... https://bugzilla.mozilla.org/show_bug.cgi?id=1017502 73 | ## mobile only 74 | 'visit_count_local', 75 | 'last_visit_date_local', 76 | 'last_visit_date_remote', 77 | 'sync_status', 78 | 'sync_change_counter', 79 | ## 80 | ## ? maybe mobile only 81 | 'visit_count_remote', 82 | ## 83 | ], 84 | ) 85 | # ugh. sometimes changes for no reason... 86 | # and anyway, for history the historyvisits table refers place_id (this table's actual id) 87 | # also use update instead delete because phone db used to have UNIQUE constraint... 88 | c.execute('UPDATE moz_places SET guid=id') 89 | tool.drop_cols( 90 | table='moz_bookmarks', 91 | cols=['lastModified'], # changing all the time for no reason? 92 | # todo hmm dateAdded might change when e.g. firefox reinstalls and it adds default bookmarks 93 | # probably not worth the trouble 94 | ) 95 | tool.drop('moz_meta') 96 | tool.drop('moz_origins') # prefix/host/frequency -- not interesting 97 | # tool.drop('moz_annos') # not sure -- contains downloads data? 
might be volatile 98 | 99 | tool.drop_cols( 100 | 'moz_inputhistory', 101 | cols=[ 102 | 'use_count', # eh, some floating point that changes all the time 103 | ], 104 | ) 105 | 106 | tool.drop_cols( 107 | 'moz_bookmarks_synced', 108 | cols=[ 109 | 'id', # id always changes, and they have guid instead 110 | 'serverModified', # changes without any actual changes to bookmark? 111 | ], 112 | ) 113 | 114 | ## fenix 115 | tool.drop_cols( 116 | 'moz_bookmarks_synced_structure', 117 | cols=[ 118 | # I think it's the position in bookmarks list, doesn't matter 119 | 'position', 120 | ], 121 | ) 122 | tool.drop('moz_places_metadata_search_queries') 123 | 124 | tool.drop_cols( 125 | 'moz_places_metadata', 126 | cols=[ 127 | ## volatile 128 | 'updated_at', 129 | 'total_view_time', 130 | 'typing_time', 131 | 'key_presses', 132 | 'scrolling_time', 133 | 'scrolling_distance', 134 | ## 135 | ], 136 | ) 137 | ## 138 | 139 | # TODO do we still need it? 140 | # sanity check just in case... can remove after we get rid of triggers properly... 141 | [(visits_after,)] = c.execute('SELECT count(*) FROM moz_historyvisits') 142 | assert visits_before == visits_after, (visits_before, visits_after) 143 | 144 | def cleanup_old(self, c) -> None: 145 | tool = Tool(c) 146 | 147 | # TODO could be pretty useful + really marginal benefits form cleaning it up, like 5% of databases maybe 148 | # tool.drop('searchhistory') 149 | 150 | tool.drop('thumbnails') 151 | tool.drop('favicons') 152 | 153 | # doesn't really have anything interesting? ... 154 | # just some image urls and maybe titles... likely no one cares about them 155 | tool.drop('page_metadata') 156 | 157 | tool.drop_cols( 158 | 'bookmarks', 159 | # we don't care about these 160 | cols=[ 161 | 'position', 162 | 'localVersion', 163 | 'syncVersion', 164 | 'modified', # also seems to depend on bookmark position 165 | 'guid', # sort of a hash and changes with position changes too? 166 | ], 167 | ) 168 | tool.drop_cols( 169 | 'clients', 170 | cols=['last_modified'], 171 | ) 172 | tool.drop_cols( 173 | 'history', 174 | cols=[ 175 | # aggregates, changing all the time 176 | 'visits', 177 | 'visits_local', 178 | 'visits_remote', 179 | ## 180 | # hmm, this seems to be last date.. actual dates are in 'visits' 181 | 'date', 182 | 'date_local', 183 | 'date_remote', 184 | ## 185 | 'title', 186 | # ugh. changes dynamically. e.g. (1) on twitter/telegram notifications 187 | # could update in some elaborate manner. idk 188 | 'modified', # ? changes for no apparent reason, probs because of the corresponding aggregates 189 | ], 190 | ) 191 | 192 | tool.drop_cols( 193 | 'remote_devices', 194 | cols=[ 195 | # probs only the presence of devices is interesting.. 196 | # changing all the time for no reason 197 | '_id', 198 | 'modified', 199 | 'last_access_time', 200 | 'created', # yes, this also changed all the time 201 | ], 202 | ) 203 | 204 | # FIXME hmm... 205 | # on the one hand, kind of interesting info.. 206 | # on the other, they change A LOT, so we'll miss most of tab snapshots anyway... 207 | # also newer databases don't have tab information anyway.. so I guess for now best to clean them up.. 
208 | tool.drop('tabs') 209 | # tool.drop_cols( 210 | # 'tabs', 211 | # cols=['_id', 'favicon', 'position',], 212 | # ) 213 | 214 | 215 | if __name__ == '__main__': 216 | Normaliser.main() 217 | 218 | 219 | # TODO need to make sure we test 'rolling' visits 220 | # these look like they are completely cumulative in terms of history 221 | def test_fenix() -> None: 222 | from bleanser.tests.common import skip_if_no_data 223 | 224 | skip_if_no_data() 225 | 226 | from bleanser.tests.common import TESTDATA, actions2 227 | 228 | res = actions2(path=TESTDATA / 'fenix', rglob='**/*.sqlite*', Normaliser=Normaliser) 229 | assert res.remaining == [ 230 | # eh, too lazy to document the reason for keeping them... 231 | # many of them are just bookmark changes 232 | '20210327103953/places.sqlite', 233 | '20210408155753/places.sqlite', 234 | '20210419092604/places.sqlite', 235 | '20210514081246/places.sqlite', 236 | # '20210517094437/places.sqlite', # move 237 | # '20210517175309/places.sqlite', # move 238 | # '20210520132446/places.sqlite', # move 239 | # '20210522092831/places.sqlite', # move 240 | # '20210524152154/places.sqlite', # move 241 | # '20210526075434/places.sqlite', # move 242 | # '20210527062123/places.sqlite', # move 243 | # '20210530172804/places.sqlite', # move 244 | # '20210601165208/places.sqlite', # move 245 | # '20210602192530/places.sqlite', # move 246 | # '20210603032923/places.sqlite', # move 247 | '20210603144405/places.sqlite', 248 | '20210623234309/places.sqlite', 249 | '20210717141629/places.sqlite', 250 | ] 251 | -------------------------------------------------------------------------------- /src/bleanser/modules/foursquare.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from collections.abc import Iterator 4 | from typing import Any 5 | 6 | from bleanser.core.modules.json import Json, JsonNormaliser, delkeys 7 | 8 | TARGET = object() 9 | 10 | 11 | def _check_and_extract(x, schema) -> Iterator[Any]: 12 | if schema is TARGET: 13 | yield x 14 | return 15 | if type(schema) == type: # noqa: E721 16 | assert isinstance(x, schema), x 17 | return 18 | if type(schema) == list: # noqa: E721 19 | [sch] = schema 20 | assert isinstance(x, list), x 21 | for i in x: 22 | yield from _check_and_extract(x=i, schema=sch) 23 | return 24 | 25 | assert type(schema) == dict, schema # noqa: E721 26 | assert isinstance(x, dict), x 27 | 28 | xk = x.keys() 29 | sk = schema.keys() 30 | assert xk == sk, (sk, xk) 31 | for k in xk: 32 | yield from _check_and_extract(x=x[k], schema=schema[k]) 33 | 34 | 35 | def check_and_extract(x, schema) -> Any: 36 | [res] = list(_check_and_extract(x=x, schema=schema)) 37 | return res 38 | 39 | 40 | # TODO move to some generic helper 41 | SCHEMA = { 42 | 'meta': { 43 | 'code': int, 44 | 'requestId': str, 45 | }, 46 | 'notifications': [ 47 | { 48 | 'item': { 49 | 'unreadCount': int, 50 | }, 51 | 'type': str, 52 | }, 53 | ], 54 | 'response': { 55 | 'checkins': { 56 | 'count': int, 57 | 'items': TARGET, 58 | } 59 | }, 60 | } 61 | 62 | 63 | class Normaliser(JsonNormaliser): 64 | PRUNE_DOMINATED = True 65 | # hmm, I guess makes sense to make MULTIWAY = False considering it seems to be cumulative... kinda safer this way 66 | # on the otherhand useful to keep multiway for renamed venues? ugh 67 | MULTIWAY = True 68 | 69 | def cleanup(self, j: Json) -> Json: 70 | # ok, a bit nasty -- foursquare export seems to be a list of some sort of responses.. 
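# each element is assumed to look roughly like this (cf. SCHEMA above):
#   {"meta": {...}, "notifications": [...], "response": {"checkins": {"count": ..., "items": [...]}}}
# and the only part worth keeping is response.checkins.items, which is pulled out of every element below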
71 | assert isinstance(j, list) 72 | 73 | res = [] 74 | for d in j: 75 | l = check_and_extract(x=d, schema=SCHEMA) 76 | assert isinstance(l, list) 77 | res.extend(l) 78 | 79 | for c in res: 80 | # some id that might change, probs useless 81 | v = c.get('venue', None) 82 | if v is not None: 83 | v['contact'].pop('facebook', None) # don't care 84 | v['contact'].pop('instagram', None) # don't care 85 | v.pop('verified', None) # don't care 86 | v.pop('delivery', None) # eh, we don't care about what venue uses for delivery 87 | 88 | # todo would be nice to support compose keys for delkeys.. 89 | # e.g. ('venue', 'contact', 'facebook') 90 | delkeys( 91 | c, 92 | keys={ 93 | ## these are just always changing, nothing we can do about it 94 | 'checkinsCount', 95 | 'usersCount', 96 | 'tipCount', 97 | ## 98 | 'sticker', # very volatile, some crap that 4sq sets on places 99 | # ugh. lat/lng are volatile, varying after 4th digit after dot for some reason 100 | 'lat', 101 | 'lng', # TODO instead round to 4th digit or something?? 102 | }, 103 | ) 104 | 105 | return res 106 | 107 | 108 | if __name__ == '__main__': 109 | Normaliser.main() 110 | -------------------------------------------------------------------------------- /src/bleanser/modules/ghexport.py: -------------------------------------------------------------------------------- 1 | from bleanser.core.modules.json import Json, JsonNormaliser 2 | 3 | 4 | class Normaliser(JsonNormaliser): 5 | PRUNE_DOMINATED = True 6 | MULTIWAY = True 7 | 8 | def cleanup(self, j: Json) -> Json: 9 | if isinstance(j, list): 10 | # old format -- I think only contained events log or something 11 | return j 12 | 13 | profile = j.get('profile') 14 | if profile is not None: 15 | profile.pop('disk_usage', None) 16 | profile.pop('updated_at', None) # I think it updates at any github activity, so pretty pointless 17 | profile.pop('followers', None) # pretty volatile, so not worth keeping + reflected in "followers" field anyway 18 | 19 | volatile = [ 20 | 'stargazers_count', 21 | 'watchers', 22 | 'watchers_count', 23 | 'forks', 24 | 'forks_count', 25 | 'open_issues', 26 | 'open_issues_count', 27 | ] 28 | 29 | for what in ['repos', 'watched', 'starred', 'subscriptions']: 30 | thing = j.get(what) 31 | if thing is None: 32 | continue 33 | for r in thing: 34 | # these are gonna be super flaky, so just ignore from diff 35 | # for our own repos they are duplicated in events anyway 36 | for k in [ 37 | *volatile, 38 | 'updated_at', 39 | 'pushed_at', 40 | 'size', 41 | ]: 42 | r.pop(k, None) 43 | 44 | repo_name = r["full_name"] 45 | if repo_name == 'emacs-straight/advice-patch': 46 | r.pop('description') 47 | # changes every day automatically 48 | # TODO move to private overlay? 49 | 50 | for r in j['repos']: 51 | repo_name = r["full_name"] 52 | 53 | for k in volatile: 54 | v = r.get(k) 55 | if v is None: 56 | continue 57 | r[k] = r[k] // 10 * 10 # round up to nearest multiple of 10 so there are less diffs 58 | 59 | ## need to 'flatten' traffic, otherwise it can't properly figure out diffs 60 | ## TODO possible to make generic, e.g. 
hint the normaliser that we need to flatten .repos.traffic.clones field 61 | traffic = r.get('traffic') 62 | if traffic is None: 63 | continue 64 | for key in ['clones', 'views']: 65 | xxx = traffic[key] 66 | xxx.pop('count') # aggregate 67 | xxx.pop('uniques') # aggregate 68 | assert xxx.keys() == {key} 69 | # NOTE: we ignore first and last traffic entry since timestamps are aligned to the closest day 70 | # so they are always going to be kinda flaky 71 | for c in xxx[key][1:-1]: 72 | ts = c['timestamp'] 73 | j[f'{repo_name}_traffic_{key}_{ts}'] = c 74 | xxx.pop(key) 75 | for key in ['popular/paths', 'popular/referrers']: 76 | # TODO hmm these are still quite flaky? they collect stats over last two weeks so can change a lot.. 77 | j[f'{repo_name}_traffic_{key}'] = traffic[key] 78 | traffic.pop(key) 79 | 80 | # TODO should probably prefer in place cleanup to make consistent with sqlite? not sure 81 | return j 82 | 83 | 84 | if __name__ == '__main__': 85 | Normaliser.main() 86 | -------------------------------------------------------------------------------- /src/bleanser/modules/goodreads.py: -------------------------------------------------------------------------------- 1 | from bleanser.core.modules.xml import Normaliser as XmlNormaliser 2 | 3 | 4 | class Normaliser(XmlNormaliser): 5 | MULTIWAY = True 6 | PRUNE_DOMINATED = True 7 | 8 | def cleanup(self, t): 9 | for key in [ 10 | 'average_rating', 11 | 'text_reviews_count', 12 | 'ratings_count', 13 | 'book/description', # volatile 14 | ]: 15 | for x in t.findall('.//' + key): 16 | x.getparent().remove(x) 17 | return t 18 | 19 | 20 | if __name__ == '__main__': 21 | Normaliser.main() 22 | -------------------------------------------------------------------------------- /src/bleanser/modules/hinge_android.py: -------------------------------------------------------------------------------- 1 | from bleanser.core.modules.sqlite import SqliteNormaliser, Tool 2 | 3 | 4 | class Normaliser(SqliteNormaliser): 5 | MULTIWAY = True 6 | PRUNE_DOMINATED = True 7 | 8 | ALLOWED_BLOBS = { 9 | # hopefully should be fine, all the metadata seems to be present in the table 10 | ('chat_messages', 'serialized'), 11 | ('channels', 'serialized'), 12 | } 13 | 14 | def check(self, c) -> None: 15 | tables = Tool(c).get_tables() 16 | msgs = tables['chat_messages'] 17 | # TODO hmm, maybe 'created' just means created in the db? 18 | assert 'sent' in msgs, msgs 19 | assert 'body' in msgs, msgs 20 | assert 'messageId' in msgs, msgs 21 | profiles = tables['profiles'] 22 | assert 'userId' in profiles, profiles 23 | # not sure if really useful at all but whatever 24 | channels = tables['channels'] 25 | assert 'subjectId' in channels, channels 26 | 27 | def cleanup(self, c) -> None: 28 | self.check(c) # todo could also call 'check' after just in case 29 | t = Tool(c) 30 | # seems that e.g. liked_content has some retention, so will need multiway 31 | 32 | # TODO not sure if it can be useful at all?? it contains something like 'Today' etc... 33 | # it generates tons of changes.. so I'd rather drop it I guess 34 | t.drop_cols(table='profiles', cols=['lastActiveStatus', 'lastActiveStatusId']) 35 | 36 | # not sure what's the point of updated col here, it just changes for all entries at the same time 37 | t.drop_cols(table='channels', cols=['updated', 'serialized']) 38 | 39 | # eh, not sure, they appear to be modified without actual changes to other cols? 
40 | # fmt: off 41 | t.drop_cols(table='profiles' , cols=['created', 'updated', 'hidden']) 42 | t.drop_cols(table='answers' , cols=['created', 'modified']) 43 | t.drop_cols(table='player_media' , cols=['created']) 44 | t.drop_cols(table='subject_media', cols=['created']) 45 | # fmt: on 46 | 47 | # instagram urls change all the time (they contain some sort of token) 48 | # and expire quickly anyway.. so just easier to cleanup 49 | c.execute('UPDATE subject_media SET photoUrl="", thumbnailUrl="", videoUrl="" WHERE source = "instagram"') 50 | # todo width,height are changing all the time for some reason for subject_media 51 | 52 | # TODO pending_ratings?? 53 | 54 | ## 55 | t.drop(table='metrics') 56 | # TODO WTF?? they are collecting some network stats and putting in the db? e.g. metered/vpn/etc 57 | t.drop(table='networks') 58 | 59 | t.drop(table='preference_choices') # search prefrences -- change all the time and not interesting 60 | t.drop(table='pending_ratings') # flaky, seems like contains intermediate state 61 | 62 | ## clean up unnecessary profile/media data 63 | # seems 3 - seems like if there is a conversation with user, so worth keeping 64 | # state 1 - seems like 'liked', probs not worth tracking 65 | # state 11 is possibly 'seen', so not super interesting 66 | delete_profiles = 'FROM profiles WHERE state in (1, 11)' 67 | for tbl in ['subject_media', 'answers']: 68 | c.execute(f'DELETE FROM {tbl} WHERE userId IN (SELECT userId {delete_profiles})') 69 | # delete orphans too 70 | c.execute(f'DELETE FROM {tbl} WHERE userId NOT IN (SELECT userId FROM profiles)') 71 | c.execute(f'DELETE {delete_profiles}') 72 | ## 73 | 74 | ## id seems to be very unstable, as if they are resequenced all the time... 75 | remove_ids = [ 76 | 'answers', 77 | 'player_media', 78 | 'basic_choices', 79 | 'branding', 80 | 'channels', 81 | 'surveys', 82 | 'subject_media', 83 | 'liked_content', 84 | ] 85 | for table in remove_ids: 86 | t.drop_cols(table=table, cols=['id']) 87 | 88 | t.drop(table='standouts_content') # things are flaky here, even urls are changing between databases -- likely they are expiring 89 | 90 | t.drop_cols(table='surveys', cols=['receivedByHinge']) 91 | t.drop_cols(table='call_prompt_packs', cols=['position']) 92 | # player_media are user pics? might be useful.. 
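# (note that only the volatile 'position' ordering column is dropped below --
#  the media rows themselves are kept)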
93 | t.drop_cols(table='player_media', cols=['position']) 94 | t.drop_cols(table='subject_media', cols=['position']) 95 | t.drop_cols(table='products', cols=['lastApiUpdate', 'lastStoreUpdate']) 96 | ## 97 | 98 | 99 | if __name__ == '__main__': 100 | Normaliser.main() 101 | -------------------------------------------------------------------------------- /src/bleanser/modules/hpi/fbmessenger_android.py: -------------------------------------------------------------------------------- 1 | # NOTE: this is experimental for now, best to use the corresponding module bleanser.modules.* instead 2 | import os 3 | from collections.abc import Iterator 4 | from pathlib import Path 5 | from typing import Any 6 | 7 | from my.core.cfg import tmp_config 8 | 9 | from bleanser.core.modules.extract import ExtractObjectsNormaliser 10 | 11 | ## disable cache, otherwise it's gonna flush it all the time 12 | # TODO this should be in some sort of common module 13 | os.environ["CACHEW_DISABLE"] = "*" 14 | os.environ.pop("ENLIGHTEN_ENABLE", None) 15 | os.environ["LOGGING_LEVEL_my_fbmessenger_android"] = "WARNING" 16 | ## 17 | 18 | import my.fbmessenger.android as module 19 | 20 | 21 | class Normaliser(ExtractObjectsNormaliser): 22 | MULTIWAY = True 23 | PRUNE_DOMINATED = True 24 | 25 | def extract_objects(self, path: Path) -> Iterator[Any]: 26 | class config: 27 | class fbmessenger: 28 | class android: 29 | export_path = path 30 | # TODO facebook_id?? 31 | 32 | with tmp_config(modules=module.__name__, config=config): 33 | assert len(module.inputs()) == 1 # sanity check to make sure tmp_config worked as expected 34 | for m in module.messages(): 35 | yield "message", m 36 | for c in module.contacts(): 37 | yield "contact", c 38 | 39 | 40 | if __name__ == "__main__": 41 | Normaliser.main() 42 | -------------------------------------------------------------------------------- /src/bleanser/modules/hpi/twitter_android.py: -------------------------------------------------------------------------------- 1 | # NOTE: this is experimental for now, best to use the corresponding module bleanser.modules.* instead 2 | import os 3 | from collections.abc import Iterator 4 | from pathlib import Path 5 | from typing import Any 6 | 7 | from my.core.cfg import tmp_config 8 | 9 | from bleanser.core.modules.extract import ExtractObjectsNormaliser 10 | 11 | ## disable cache, otherwise it's gonna flush it all the time 12 | # TODO this should be in some sort of common module 13 | os.environ['CACHEW_DISABLE'] = '*' 14 | os.environ.pop('ENLIGHTEN_ENABLE', None) 15 | os.environ['LOGGING_LEVEL_my_twitter_android'] = 'WARNING' 16 | ## 17 | 18 | import my.twitter.android as twitter_android 19 | 20 | 21 | class Normaliser(ExtractObjectsNormaliser): 22 | MULTIWAY = True 23 | PRUNE_DOMINATED = True 24 | 25 | def extract_objects(self, path: Path) -> Iterator[Any]: 26 | class config: 27 | class twitter: 28 | class android: 29 | export_path = path 30 | 31 | with tmp_config(modules=twitter_android.__name__, config=config): 32 | assert len(twitter_android.inputs()) == 1 # sanity check to make sure tmp_config worked as expected 33 | for x in twitter_android.bookmarks(): 34 | yield 'bookmark', x 35 | for x in twitter_android.likes(): 36 | yield 'like', x 37 | for x in twitter_android.tweets(): 38 | yield 'tweet', x 39 | 40 | 41 | if __name__ == '__main__': 42 | Normaliser.main() 43 | -------------------------------------------------------------------------------- /src/bleanser/modules/hpi/whatsapp_android.py: 
-------------------------------------------------------------------------------- 1 | # NOTE: this is experimental for now, best to use the corresponding module bleanser.modules.* instead 2 | import os 3 | from collections.abc import Iterator 4 | from pathlib import Path 5 | from typing import Any 6 | 7 | from my.core.cfg import tmp_config 8 | 9 | from bleanser.core.modules.extract import ExtractObjectsNormaliser 10 | 11 | ## disable cache, otherwise it's gonna flush it all the time 12 | # TODO this should be in some sort of common module 13 | os.environ["CACHEW_DISABLE"] = "*" 14 | os.environ.pop("ENLIGHTEN_ENABLE", None) 15 | os.environ["LOGGING_LEVEL_my_whatsapp_android"] = "WARNING" 16 | ## 17 | 18 | import my.whatsapp.android as module 19 | 20 | 21 | class Normaliser(ExtractObjectsNormaliser): 22 | MULTIWAY = True 23 | PRUNE_DOMINATED = True 24 | 25 | def extract_objects(self, path: Path) -> Iterator[Any]: 26 | class config: 27 | class whatsapp: 28 | class android: 29 | export_path = path 30 | # TODO my_user_id? 31 | 32 | with tmp_config(modules=module.__name__, config=config): 33 | assert len(module.inputs()) == 1 # sanity check to make sure tmp_config worked as expected 34 | yield from module.entities() 35 | 36 | 37 | if __name__ == "__main__": 38 | Normaliser.main() 39 | -------------------------------------------------------------------------------- /src/bleanser/modules/instagram_android.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from bleanser.core.modules.json import delkeys, patch_atoms 4 | from bleanser.core.modules.sqlite import SqliteNormaliser, Tool 5 | 6 | 7 | def _patch_volatile_urls(x): 8 | # these contain some sort of hashes and change all the time 9 | if not isinstance(x, str): 10 | return x 11 | if 'fbcdn.net' in x: 12 | return "" 13 | if 'cdninstagram' in x: 14 | return "" 15 | return x 16 | 17 | 18 | def _cleanup_jsons(s): 19 | if s is None: 20 | return None 21 | 22 | if isinstance(s, bytes): 23 | j = json.loads(s.decode('utf8')) 24 | else: 25 | # hmm normally it's bytes, but on odd occasions (old databases??) was str? odd 26 | j = json.loads(s) 27 | 28 | # TODO thread_v2_id -- might be useful for some other processing? 29 | delkeys(j, keys=[ 30 | ## messages db 31 | 'user', # eh. super volatile fields inside it... even full name changes all the time for no reason? 32 | 'is_replied_to_msg_taken_down', 33 | 'hscroll_share', # some reaction bullshit 34 | 'account_badges', 35 | ## 36 | 37 | ## threads db 38 | 'recipients', # same as 'user' in messages db.. pretty volatile 39 | 'has_older_thread_messages_on_server', 40 | 'interop_user_type', 41 | 'transparency_product_enabled', 42 | 'notification_preview_controls', 43 | 'thread_context_items', # some volatile follower counts? 
44 | 'snippet', 45 | 'theme', 46 | 'ig_thread_capabilities', 47 | 'ai_agent_social_signal_message_count', 48 | 'has_groups_xac_ineligible_user', 49 | ## 50 | 51 | 'is_group_xac_calling_eligible', 52 | 'processed_business_suggestion', 53 | 54 | 'url_expiration_timestamp_us', 55 | 'is_eligible_for_igd_stacks', 56 | 'profile_pic_url', # volatile 57 | 'all_media_count', 58 | 'displayed_action_button_type', 59 | 'is_epd', 60 | 'liked_clips_count', 61 | 'reel_media_seen_timestamp', 62 | 'latest_besties_reel_media', 63 | 'latest_fanclub_reel_media', 64 | 'latest_reel_media', 65 | 66 | 'follow_friction_type', 67 | 'playable_url_info', 68 | 'preview_url_info', 69 | 'muting', 70 | 'biz_thread_throttling_state', 71 | 'badge_count', 72 | 'follower_count', 73 | 'following_count', 74 | 75 | 'last_seen_at', 76 | 77 | 'client_context', # seems to be same as client_item_id -- volatile 78 | 79 | 'feed_post_reshare_disabled', 80 | 81 | 'is_sent_by_viewer', # very volatile for no reason?? 82 | 83 | 'followed_by', 84 | 'account_type', # sometimes changes between 1 and 2? 85 | 'fan_club_info', # seems like page description 86 | 87 | 'is_business', 88 | 'is_following_current_user', 89 | 'is_interest_account', 90 | 'wa_addressable', 91 | 92 | 'inviter', # thread inviter? volatile 93 | 94 | # seems like fields in it appear and disappear for no reason without any actual status changes 95 | 'friendship_status', 96 | 97 | 'hide_in_thread', 98 | 'forward_score', 99 | 100 | ## I think these are properties of messages.user json blob 101 | 'paid_partnership_info', 102 | 'biz_user_inbox_state', 103 | 'has_exclusive_feed_content', 104 | 'has_encrypted_backup', 105 | 'is_using_unified_inbox_for_direct', 106 | 'personal_account_ads_page_id', 107 | 'personal_account_ads_page_name', 108 | 'show_account_transparency_details', 109 | 'organic_tracking_token', 110 | 'should_show_category', 111 | 'fundraiser_tag', 112 | ## 113 | 114 | 'unseen_count', 115 | 'send_attribution', 116 | 'send_silently', 117 | 'smart_suggestion', 118 | 'idempotence_token', 119 | 120 | ## threads.recipients properties 121 | 'can_coauthor_posts', 122 | 'can_coauthor_posts_with_music', 123 | ## 124 | 125 | 'visual_messages_newest_cursor', 126 | 'thread_messages_oldest_cursor', 127 | ]) # fmt: skip 128 | j = patch_atoms(j, patch=_patch_volatile_urls) 129 | return json.dumps(j, sort_keys=True).encode('utf8') 130 | 131 | 132 | class Normaliser(SqliteNormaliser): 133 | MULTIWAY = True 134 | PRUNE_DOMINATED = True 135 | 136 | def check(self, c) -> None: 137 | tables = Tool(c).get_tables() 138 | msgs = tables['messages'] 139 | assert 'timestamp' in msgs 140 | assert 'text' in msgs 141 | 142 | _threads = tables['threads'] 143 | 144 | def cleanup(self, c) -> None: 145 | self.check(c) 146 | 147 | t = Tool(c) 148 | t.drop('session') # super volatile 149 | 150 | for tbl in ['messages', 'threads']: 151 | t.drop_cols( 152 | tbl, 153 | cols=[ 154 | # changes all the time without changing content 155 | '_id', 156 | # 157 | # kinda volatile, seems to change some time after it's inserted? 
158 | # doesn't seem used in any indexes etc 159 | 'client_item_id', 160 | ], 161 | ) 162 | 163 | t.drop_cols('threads', cols=['last_activity_time']) 164 | 165 | # so message/thread_info tables also contain a json field with raw data, and it's very volatile 166 | # to clean it up, tried using this at first: 167 | # SELECT _id, message_type, message, json_remove(message, (SELECT DISTINCT(fullkey) FROM messages, json_tree(message) WHERE atom LIKE '%cdninstagram%')) FROM messages ORDER BY message_type 168 | # it was promising, but it seems that it's not possible to pass multiple arguments from a scalar subquery 169 | # it only ended up removing the first key 170 | c.create_function("CLEANUP_JSONS", 1, _cleanup_jsons) 171 | queries = [ 172 | 'UPDATE messages SET message = CLEANUP_JSONS(message)', 173 | 'UPDATE threads SET thread_info = CLEANUP_JSONS(thread_info)', 174 | ] 175 | for query in queries: 176 | list(c.execute(query)) 177 | # a bit insane and experimental... but worked surprisingly smoothly and fast? 178 | ## 179 | 180 | 181 | if __name__ == '__main__': 182 | Normaliser.main() 183 | -------------------------------------------------------------------------------- /src/bleanser/modules/json_new.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | from bleanser.core.modules.json import * # noqa: F403, F401 4 | 5 | warnings.warn("Module 'bleanser.modules.json_new' is deprecated. Use 'bleanser.core.modules.json' instead.", DeprecationWarning) 6 | 7 | 8 | if __name__ == '__main__': 9 | JsonNormaliser.main() # noqa: F405 10 | -------------------------------------------------------------------------------- /src/bleanser/modules/kobo.py: -------------------------------------------------------------------------------- 1 | from sqlite3 import Connection 2 | 3 | from bleanser.core.modules.sqlite import SqliteNormaliser, Tool 4 | 5 | 6 | class Normaliser(SqliteNormaliser): 7 | # events are only snapshots, so probs makes sense 8 | MULTIWAY = True 9 | PRUNE_DOMINATED = True 10 | 11 | def check(self, c: Connection) -> None: 12 | tool = Tool(c) 13 | tables = tool.get_tables() 14 | assert 'content' in tables, tables 15 | bm = tables['Bookmark'] 16 | # fmt: off 17 | assert 'ExtraAnnotationData' in bm, bm 18 | assert 'BookmarkID' in bm, bm 19 | assert 'DateCreated' in bm, bm 20 | # fmt: on 21 | assert 'BookAuthors' in tables, tables 22 | 23 | def cleanup(self, c: Connection) -> None: 24 | self.check(c) 25 | 26 | tool = Tool(c) 27 | 28 | tool.fix_bad_blob_column(table='Activity', column='Data') 29 | tool.fix_bad_blob_column(table='Event', column='ExtraData') 30 | tool.fix_bad_blob_column(table='Bookmark', column='ExtraAnnotationData') 31 | 32 | tool.drop('content') # some cached book data? so not very interesting when it changes.. 33 | tool.drop('content_keys') # just some image meta 34 | tool.drop('volume_shortcovers') # just some hashes 35 | tool.drop('volume_tabs') # some hashes 36 | tool.drop('KoboPlusAssets') # some builtin faqs/manuals etc 37 | tool.drop('KoboPlusAssetGroup') # some builtin faqs/manuals etc 38 | tool.drop('Tab') # shop tabs 39 | tool.drop('Achievement') 40 | # TODO DbVersion? 41 | # TODO version in user table? 42 | 43 | tool.drop_cols(table='Event', cols=['Checksum']) 44 | 45 | ## these are changing all the time 46 | # TODO not sure about RecentBook? 
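# (untested: if RecentBook entries turn out to be just as noisy, "RecentBook" could simply be
#  added to the Type list below -- see also the TODO right after this block)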
47 | c.execute(''' 48 | DELETE FROM Activity 49 | WHERE Type IN ( 50 | "Recommendations", 51 | "TopPicksTab", 52 | "Top50" 53 | ) 54 | ''') 55 | ## 56 | # TODO hmm maybe drop all RecentBook from Activity? although doesn't help all that much 57 | 58 | c.execute(''' 59 | DELETE FROM AnalyticsEvents 60 | WHERE Type IN ( 61 | "PluggedIn", 62 | "BatteryLevelAtSync" 63 | )''') 64 | 65 | ## this changes all the time (Shelf only contains some meta entries, this isn't actual book access time) 66 | c.execute('UPDATE Shelf SET _SyncTime = NULL, LastAccessed = NULL, LastModified = NULL WHERE Id = "ReadingList"') 67 | ## 68 | 69 | tool.drop_cols( 70 | table='user', 71 | cols=[ 72 | 'SyncContinuationToken', 73 | 'KoboAccessToken', 74 | 'KoboAccessTokenExpiry', 75 | 'AuthToken', 76 | 'RefreshToken', 77 | 'Loyalty', 78 | 'PrivacyPermissions', # not very interesting, contains this stuff https://github.com/shadow81627/scrapey/blob/6dc2a7bba7f5adf2e3335c68e30208c71cfb5c2d/cookies.json#L950 79 | ], 80 | ) 81 | tool.drop_cols( 82 | table='Bookmark', 83 | cols=[ 84 | # TODO UserID?? 85 | # TODO ugh. DateCreated sometimes rounds to nearest second? wtf... 86 | # 87 | 'SyncTime', 88 | 'Version', # not sure what it is, but sometimes changing? 89 | # 90 | 'StartContainerChildIndex', 91 | 'EndContainerChildIndex', # ???? 92 | # 93 | 'StartContainerPath', 94 | 'EndContainerPath', 95 | ], 96 | ) 97 | # TODO Event table -- not sure... it trackes event counts, so needs to be cumulative or something? 98 | # yep, they def seem to messing up a lot 99 | # TODO Activity -- dates changing all the time... not sure 100 | 101 | 102 | if __name__ == '__main__': 103 | Normaliser.main() 104 | -------------------------------------------------------------------------------- /src/bleanser/modules/lastfm.py: -------------------------------------------------------------------------------- 1 | from bleanser.core.modules.json import Json, JsonNormaliser 2 | 3 | 4 | class Normaliser(JsonNormaliser): 5 | MULTIWAY = True 6 | PRUNE_DOMINATED = True 7 | 8 | def cleanup(self, j: Json) -> Json: 9 | # ugh sometimes case changes for no reason 10 | for x in j: 11 | for k, v in list(x.items()): 12 | if isinstance(v, str): 13 | # defensive, there was a date (around 2019-01-16) when dates glitched and were ints... 14 | x[k] = v.lower() 15 | return j 16 | # todo would be nice to use jq for that... e.g. 
older filter was 17 | # 'sort_by(.date) | map(map_values(ascii_downcase?))' 18 | 19 | 20 | if __name__ == '__main__': 21 | Normaliser.main() 22 | 23 | 24 | def test_lastfm() -> None: 25 | """ 26 | This test also highlights how multiway cleanup is more efficient than twoway 27 | """ 28 | from bleanser.tests.common import skip_if_no_data 29 | 30 | skip_if_no_data() 31 | 32 | from bleanser.tests.common import TESTDATA, actions, hack_attribute 33 | 34 | data = TESTDATA / 'lastfm' 35 | paths = sorted(data.glob('*.json')) 36 | 37 | with hack_attribute(Normaliser, key='MULTIWAY', value=False): 38 | res = actions(paths=paths, Normaliser=Normaliser) 39 | assert [p.name for p in res.pruned] == [ 40 | 'lastfm_20211107T011431Z.json', # fully contained in lastfm_20211127T011459Z 41 | ] 42 | 43 | with hack_attribute(Normaliser, key='MULTIWAY', value=True): 44 | res = actions(paths=paths, Normaliser=Normaliser) 45 | assert [p.name for p in res.remaining] == [ 46 | 'lastfm_2017-08-29.json', # keeping : initial: X + a 47 | 48 | # disappeared (a), and a bunch of items added (Y) 49 | # (a) ::: {"album": "", "artist": "pusha t/haim/q-tip/stromae/lorde", "date": "1503868125", "name": "meltdown (\u0438\u0437 \u0444\u0438\u043b\u044c\u043c\u0430 \u00ab\u0433\u043e\u043b\u043e\u0434\u043d\u044b\u0435 \u0438\u0433\u0440\u044b: \u0441\u043e\u0439\u043a\u0430-\u043f\u0435\u0440\u0435\u0441\u043c\u0435\u0448\u043d\u0438\u0446\u0430\u00bb. \u0447\u0430\u0441\u0442\u044c i)"} 50 | # 'lastfm_2017-09-01.json', # removing: X + Y 51 | 52 | # bunch of items were added (Z + b) 53 | 'lastfm_2017-09-19.json', # keeping : X + Y + Z + b 54 | 55 | # but b disappeared in this: so the previous item is the last pivot 56 | # (b) ::: {"album": "", "artist": "denny berthiaume", "date": "1505649846", "name": "moon river"} 57 | # 'lastfm_2017-09-22.json', # removing: X + Y + Z + Q 58 | 59 | 'lastfm_2017-10-31.json', # keeping : last item in group 60 | 61 | # this item is only present in this file: 62 | # ::: {"album": "departed glories", "artist": "biosphere", "date": "1635619124", "name": "than is the mater"} 63 | 'lastfm_20211031T001458Z.json', 64 | 65 | # this item is only present in this file: 66 | # > ::: {"album": "2010", "artist": "earl sweatshirt", "date": "1638578097", "name": "2010"} 67 | 'lastfm_20211204T011641Z.json', 68 | 69 | # last item 70 | 'lastfm_20220103T011522Z.json', 71 | ] # fmt: skip 72 | -------------------------------------------------------------------------------- /src/bleanser/modules/monzo.py: -------------------------------------------------------------------------------- 1 | from bleanser.core.modules.json import Json, JsonNormaliser, delkeys 2 | 3 | 4 | class Normaliser(JsonNormaliser): 5 | MULTIWAY = True 6 | PRUNE_DOMINATED = True 7 | 8 | def cleanup(self, j: Json) -> Json: 9 | delkeys( 10 | j, 11 | keys=[ 12 | 'account_balance', # obvs flaky 13 | 'suggested_tags', 14 | 'website', 15 | # 16 | 'address', 17 | 'formatted', 18 | 'logo', 19 | # 20 | ## flaky and useless 21 | 'mastercard_lifecycle_id', 22 | 'mastercard_clearing_message_id', 23 | 'token_transaction_identifier', 24 | 'tab_id', 25 | ## 26 | # 27 | 'settled', 28 | 'updated', 29 | 'amount_is_pending', 30 | # 31 | 'payee_id', # odd but sometimes flaky 32 | 'can_add_to_tab', 33 | ], 34 | ) 35 | 36 | if isinstance(j, list): 37 | # old format, only transactions for one account 38 | return j 39 | 40 | # flatten out transactions 41 | for account, d in list(j.items()): 42 | transactions = d['data']['transactions'] 43 | j[f'{account}_transactions'] = 
transactions 44 | del d['data']['transactions'] 45 | return j 46 | 47 | 48 | if __name__ == '__main__': 49 | Normaliser.main() 50 | -------------------------------------------------------------------------------- /src/bleanser/modules/pinboard.py: -------------------------------------------------------------------------------- 1 | from bleanser.core.modules.json import JsonNormaliser 2 | 3 | 4 | class Normaliser(JsonNormaliser): 5 | MULTIWAY = True 6 | PRUNE_DOMINATED = True 7 | 8 | 9 | if __name__ == '__main__': 10 | Normaliser.main() 11 | 12 | 13 | # TODO pinboard: tag summaries might be flaky 14 | # might be worth doing 15 | # if isinstance(j, dict): 16 | # del j['tags'] 17 | 18 | 19 | def test_pinboard() -> None: 20 | from bleanser.tests.common import skip_if_no_data 21 | 22 | skip_if_no_data() 23 | 24 | from bleanser.tests.common import TESTDATA, actions 25 | 26 | data = TESTDATA / 'pinboard' 27 | 28 | paths = sorted(data.glob('*.json')) 29 | 30 | res = actions(paths=paths, Normaliser=Normaliser) 31 | 32 | # note: some items duplicate in pinboard... 33 | # e.g. in bookmarks_2019-08-06.json.xz 34 | # ::: {"description": "Visual Leak Detector - Enhanced Memory Leak Detection for Visual C++ - CodeProject", "extended": "", "hash": "ef6dcf9d2987ea1f4919b31024c33662", "href": "http://www.codeproject.com/KB/applications/visualleakdetector.aspx", "meta": "8341db79448607b145078e00e69c8003", "shared": "yes", "tags": "debugging cpp", "time": "2014-02-09T01:02:57Z", "toread": "no"} 35 | 36 | assert [p.name for p in res.remaining] == [ 37 | 38 | 'bookmarks_2019-08-06.json' , # first in group 39 | # fully contained in the next 40 | # 'bookmarks_2019-08-07.json' , : MOVE 41 | 42 | # has to keep the next because for example this bookmark is flaky: 43 | # rg 'An Easy Explaination Of First And Follow Sets' | sort 44 | # bookmarks_2019-08-07.json:{"href":"http:\/\/www.jambe.co.nz\/UNI\/FirstAndFollowSets.html","description":"An Easy Explaination Of First And Follow Sets","extended":"","meta":"c68c6b649d587543bae12367e6fce8ec","hash":"3688a0bcfb0ee9f7cb7fbda43aabe131","time":"2014-02-09T01:03:03Z","shared":"yes","toread":"no","tags":"cs parsing"}, 45 | # bookmarks_20190924T010105Z.json:{"href":"http:\/\/www.jambe.co.nz\/UNI\/FirstAndFollowSets.html","description":"An Easy Explaination Of First And Follow Sets","extended":"","meta":"c68c6b649d587543bae12367e6fce8ec","hash":"3688a0bcfb0ee9f7cb7fbda43aabe131","time":"2014-02-09T01:03:03Z","shared":"yes","toread":"no","tags":"cs parsing"}, 46 | # bookmarks_20190929T124250Z.json: "description": "An Easy Explaination Of First And Follow Sets", 47 | # pinboard_20201231T011022Z.json: "description": "An Easy Explaination Of First And Follow Sets", 48 | # pinboard_20210220T011105Z.json: "description": "An Easy Explaination Of First And Follow Sets", 49 | # pinboard_20210221T011013Z.json: "description": "An Easy Explaination Of First And Follow Sets", 50 | # pinboard_20220103T011019Z.json: "description": "An Easy Explaination Of First And Follow Sets", 51 | 'bookmarks_20190924T010105Z.json', #: will keep 52 | 53 | # there is a whole bunch of flaky bookmarks like that ^ in pinboard, so won't bother annotating the rest 54 | 55 | # 'bookmarks_20190925T010106Z.json', : MOVE 56 | 'bookmarks_20190929T010107Z.json', #: will keep 57 | 'bookmarks_20190929T124250Z.json', #: will keep 58 | # 'bookmarks_20190930T010107Z.json', : MOVE 59 | # 'bookmarks_20191015T010107Z.json', : MOVE 60 | # 'bookmarks_20191016T010107Z.json', : MOVE 61 | # 'bookmarks_20191122T010108Z.json', : 
MOVE 62 | # 'bookmarks_20191123T010107Z.json', : MOVE 63 | 'bookmarks_20191205T010108Z.json', #: will keep 64 | # 'bookmarks_20191206T010107Z.json', : MOVE 65 | # 'bookmarks_20191207T010107Z.json', : MOVE 66 | 'pinboard_20200501T011005Z.json' , #: will keep 67 | # 'pinboard_20200502T011005Z.json' , : MOVE 68 | 'pinboard_20200614T011006Z.json' , #: will keep 69 | 'pinboard_20200615T001107Z.json' , #: will keep 70 | # 'pinboard_20200616T001008Z.json' , : MOVE 71 | # 'pinboard_20200812T001014Z.json' , : MOVE 72 | # 'pinboard_20200813T001016Z.json' , : MOVE 73 | # 'pinboard_20200814T001018Z.json' , : MOVE 74 | # 'pinboard_20200815T001017Z.json' , : MOVE 75 | 'pinboard_20200826T001017Z.json' , #: will keep 76 | # 'pinboard_20200827T001019Z.json' , : MOVE 77 | 'pinboard_20201230T011025Z.json' , #: will keep 78 | # 'pinboard_20201231T011022Z.json' , : MOVE 79 | # 'pinboard_20210220T011105Z.json' , : MOVE 80 | # 'pinboard_20210221T011013Z.json' , : MOVE 81 | 'pinboard_20220103T011019Z.json' , #: will keep 82 | ] # fmt: skip 83 | -------------------------------------------------------------------------------- /src/bleanser/modules/pocket.py: -------------------------------------------------------------------------------- 1 | from bleanser.core.modules.json import Json, JsonNormaliser 2 | 3 | 4 | class Normaliser(JsonNormaliser): 5 | MULTIWAY = True 6 | PRUNE_DOMINATED = True 7 | 8 | def cleanup(self, j: Json) -> Json: 9 | del j['since'] # flaky 10 | return j 11 | 12 | 13 | if __name__ == '__main__': 14 | Normaliser.main() 15 | -------------------------------------------------------------------------------- /src/bleanser/modules/podcastaddict_android.py: -------------------------------------------------------------------------------- 1 | from bleanser.core.modules.sqlite import SqliteNormaliser, Tool 2 | 3 | 4 | class Normaliser(SqliteNormaliser): 5 | MULTIWAY = True 6 | PRUNE_DOMINATED = True 7 | 8 | ALLOWED_BLOBS = { 9 | ('fts_virtual_episode_segments', 'block'), 10 | ('fts_virtual_episode_segdir', 'root'), 11 | ('fts_virtual_episode_docsize', 'size'), 12 | ('fts_virtual_episode_stat', 'value'), 13 | } 14 | 15 | # TODO this would be useful as a base class method 16 | # could be called before cleanup/extract etc 17 | def check(self, c) -> None: 18 | tables = Tool(c).get_tables() 19 | assert 'podcasts' in tables, tables 20 | eps = tables['episodes'] 21 | # to make sure it's safe to use multiway/prune dominated: 22 | assert 'playbackDate' in eps 23 | assert 'position_to_resume' in eps 24 | 25 | def cleanup(self, c) -> None: 26 | self.check(c) 27 | 28 | t = Tool(c) 29 | ## often changing, no point keeping 30 | t.drop_cols( 31 | table='episodes', 32 | cols=[ 33 | 'thumbnail_id', 34 | 'new_status', 35 | 'downloaded_status_int', 36 | 'thumbsRating', 37 | ], 38 | ) 39 | 40 | # no point tracking podcasts we're not following 41 | c.execute('DELETE FROM podcasts WHERE subscribed_status = 0') 42 | 43 | t.drop_cols( 44 | table='podcasts', 45 | cols=[ 46 | ## volatile at times, a bit annoying 47 | 'author', 48 | 'description', 49 | ## 50 | 'last_modified', 51 | 'etag', # ?? sometimes contains quoted last_modified or something.. 
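# ^ presumably the feed's HTTP ETag, i.e. a server-side validator (which would explain why it
#   sometimes mirrors last_modified) -- so it churns on nearly every poll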
52 | 'rating', 53 | 'reviews', 54 | 'iTunesID', 55 | 'latest_publication_date', 56 | 'averageDuration', 57 | 'frequency', 58 | 'episodesNb', 59 | 'subscribers', 60 | 'thumbnail_id', 61 | 'update_date', 62 | 'update_status', 63 | 'filter_chapter_excluded_keywords', 64 | 'category', 65 | 'explicit', 66 | 'server_id', 67 | ], 68 | ) 69 | ## 70 | 71 | ## changing often and likely not interesting 72 | t.drop('ad_campaign') 73 | t.drop('bitmaps') 74 | t.drop('blocking_services') 75 | t.drop('content_policy_violation') 76 | t.drop('fts_virtual_episode_stat') 77 | t.drop('fts_virtual_episode_docsize') 78 | t.drop('fts_virtual_episode_segments') 79 | t.drop('fts_virtual_episode_segdir') 80 | t.drop('ordered_list') # just some random numbers, always changing 81 | t.drop('statistics') # just some random numbers, mostly empty 82 | t.drop('radio_search_results') 83 | t.drop('topics') # some random topic names.. at some point just disappeared 84 | t.drop('iha') # no idea what is it, contains one entry sometimes; volatile 85 | 86 | ## probably unnecessary? 87 | # tool.drop('chapters') 88 | # tool.drop('teams') 89 | # tool.drop('topics') 90 | # tool.drop('relatedPodcasts') 91 | # tool.drop('content_policy_violation') # lol 92 | ## 93 | 94 | 95 | if __name__ == '__main__': 96 | Normaliser.main() 97 | 98 | 99 | def test_podcastaddict() -> None: 100 | from bleanser.tests.common import skip_if_no_data 101 | 102 | skip_if_no_data() 103 | 104 | from bleanser.tests.common import TESTDATA, actions2 105 | 106 | res = actions2(path=TESTDATA / 'podcastaddict_android', rglob='**/*.db*', Normaliser=Normaliser) 107 | assert res.remaining == [ 108 | '20180106220736/podcastAddict.db', 109 | '20190227212300/podcastAddict.db', 110 | '20200217195816/podcastAddict.db', 111 | 112 | '20200406041500/podcastAddict.db', 113 | # '20210306070017/podcastAddict.db', 114 | # '20210306070020/podcastAddict.db', 115 | '20210306140046/podcastAddict.db', 116 | 117 | # keep: episode position changed 118 | '20210306165958/podcastAddict.db', 119 | 120 | # '20210509141916/podcastAddict.db', 121 | # '20210510070001/podcastAddict.db', 122 | # '20210511185801/podcastAddict.db', 123 | '20210513164819/podcastAddict.db', 124 | # some podcast lengths changed... might be useful 125 | '20210517000609/podcastAddict.db', 126 | # '20211226145720/podcastAddict.db', 127 | # '20211226172310/podcastAddict.db', 128 | # some podcast authors changed... dunno if useful but whatever 129 | '20211228010151/podcastAddict.db', 130 | ] # fmt: skip 131 | -------------------------------------------------------------------------------- /src/bleanser/modules/reddit.py: -------------------------------------------------------------------------------- 1 | from itertools import chain 2 | 3 | from bleanser.core.modules.json import Json, JsonNormaliser, delkeys 4 | 5 | REDDIT_IGNORE_KEYS = { 6 | ## TODO hmm maybe do something about these 7 | ## might be useful to keep 8 | 'subreddit_subscribers', 9 | 'subscribers', 10 | 'ups', 11 | 'score', 12 | 'num_comments', 13 | 'upvote_ratio', 14 | ### 15 | 16 | ## TODO ?? 17 | 'pwls', # TODO what is it?? 18 | 'likes', # todo? 19 | 'wls', # TODO ??? 
20 | ## 21 | 22 | '_comments', 23 | 'accept_chats', 24 | 'accept_pms', 25 | 'advertiser_category', 26 | 'all_awardings', 27 | 'allow_chat_post_creation', 28 | 'allow_discovery', 29 | 'allow_galleries', 30 | 'allow_images', 31 | 'allow_live_comments', 32 | 'allow_polls', 33 | 'allow_videogifs', 34 | 'allow_videos', 35 | 'allowed_galleries', 36 | 'archived', 37 | 'associated_award', 38 | 'audience_target', 39 | 'author_flair_background_color', 40 | 'author_flair_css_class', 41 | 'author_flair_richtext', 42 | 'author_flair_template_id', 43 | 'author_flair_text', 44 | 'author_flair_text_color', 45 | 'author_flair_type', 46 | 'author_patreon_flair', 47 | 'author_premium', 48 | 'awarders', 49 | 'banner_background_color', 50 | 'banner_background_image', 51 | 'banner_img', 52 | 'banner_size', 53 | 'can_assign_link_flair', 54 | 'can_assign_user_flair', 55 | 'can_gild', 56 | 'collapse_deleted_comments', 57 | 'collapsed', 'collapsed_reason', # todo potentially interesting? 58 | 'comment_score_hide_mins', 59 | 'community_icon', 60 | 'content_categories', 61 | 'crosspost_parent_list', 62 | 'dash_url', 63 | 'discussion_type', 64 | 'emojis_custom_size', 65 | 'emojis_enabled', 66 | 'event_start', 'event_end', 'event_is_live', 67 | 'free_form_reports', 68 | 'gid_1', 69 | 'gid_2', 70 | 'gid_3', 71 | 'gilded', 72 | 'gildings', 73 | 'has_menu_widget', 74 | 'header_img', 75 | 'header_size', 76 | 'header_title', 77 | 'hide_score', 78 | 'hls_url', 79 | 'icon_img', 80 | 'icon_name', 81 | 'icon_size', 82 | 'icon_url', 83 | 'is_chat_post_feature_enabled', 84 | 'is_crosspostable', 85 | 'is_crosspostable_subreddit', 86 | 'is_robot_indexable', 87 | 'is_self', 88 | 'is_video', 89 | 'key_color', 90 | 'link_flair_css_class', 91 | 'link_flair_enabled', 92 | 'link_flair_position', 93 | 'link_flair_richtext', 94 | 'link_flair_template_id', 95 | 'link_flair_text', 96 | 'link_flair_type', 97 | 'linked_identities', 98 | 'media_embed', 99 | 'media_metadata', 100 | 'mobile_banner_image', 101 | 'new', 102 | 'no_follow', 103 | 'oembed', 104 | 'og_description', 'og_title', 105 | 'original_content_tag_enabled', 106 | 'over18', 107 | 'over_18', 108 | 'owner_id', 109 | 'parent_whitelist_status', # some ads thing 110 | 'password_set', 111 | 'post_hint', 112 | 'pref_no_profanity', 'pref_geopopular', 'pref_top_karma_subreddits', 113 | 'primary_color', 114 | 'report_reasons', 115 | 'restrict_commenting', 116 | 'restrict_posting', 117 | 'rte_mode', 118 | 'score_hidden', 119 | 'secure_media', 120 | 'secure_media_embed', 121 | 'send_replies', 122 | 'show_media', 123 | 'show_media_preview', 124 | 'spoilers_enabled', 125 | 'steward_report', 126 | 'stickied', 127 | 'submission_type', 128 | 'submit_link_label', 129 | 'submit_text_label', 130 | 'suggested_comment_sort', 131 | 'suggested_sort', 132 | 'thumbnail', 133 | 'thumbnail_height', 134 | 'thumbnail_width', 135 | 'top_awarded_type', 136 | 'total_awards_received', 137 | 'treatment_tags', 138 | 'user_flair_richtext', 139 | 'user_flair_template_id', 140 | 'user_flair_text_color', 141 | 'user_flair_type', 142 | 'user_reports', 143 | 'videostream_links_count', 144 | 'whitelist_status', # some ads thing 145 | 'wiki_enabled', 146 | 'snoovatar_img', 147 | 'snoovatar_size', 148 | 'allow_talks', 149 | 150 | ## very flaky 151 | 'link_flair_background_color', 152 | 'link_flair_text_color', 153 | 'call_to_action', # sometimes null, sometimes not present? 
154 | ## 155 | ## 156 | 157 | ## nothing interesting, some subreddit settings 158 | 'allowed_media_in_comments', 159 | 'comment_contribution_settings', 160 | 'should_archive_posts', 161 | ## 162 | 163 | 'awardee_karma', # sometimes goes to 0 for no reason 164 | 165 | # TODO ?? 166 | # 'likes', 167 | # 'url', # ugh. changed from www.reddit.... to link without reddit domain 168 | # 'is_favorited', 169 | # 'is_subscriber', 170 | # 'domain', 171 | # should_archive_posts -- not sure? 172 | # 173 | # 174 | # subreddit_type: public/restricted -- actually quite useful info! 175 | # profile -> link_karma, comment_karma -- probs useful to keep 176 | # 177 | # TODO maybe, num_crossposts? have only seen once so far 178 | } # fmt: skip 179 | 180 | 181 | class Normaliser(JsonNormaliser): 182 | # NOTE: we don't want to prune dominated/use multiway in reddit, because that way we lose timestamps for changes!!! 183 | PRUNE_DOMINATED = False 184 | 185 | def cleanup(self, j: Json) -> Json: 186 | delkeys(j, keys=REDDIT_IGNORE_KEYS) 187 | 188 | # hmm, 'created' changes all the time for some reason starting from 20181124201020 189 | # https://www.reddit.com/r/redditdev/comments/29991t/whats_the_difference_between_created_and_created/ciiuk24/ 190 | # ok, it's broken, should use created_utc instead 191 | for v in j.values(): 192 | if not isinstance(v, list): 193 | continue 194 | for i in v: 195 | if 'created_utc' in i: 196 | i.pop('created', None) 197 | 198 | i.pop('subreddit_type', None) 199 | 200 | ## karma is flaky, goes up and down even without actual votes 201 | ## so make it a bit smoother 202 | profile = j['profile'] 203 | for kf in ['link_karma', 'total_karma']: 204 | k = profile.get(kf) 205 | if k is not None: 206 | profile[kf] = k // 10 * 10 207 | # ugh, total karma is flaking between two values for me consistently 208 | # but removing it completely only gets rid of 10% of files? 
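# (i.e. floor to the nearest 10: e.g. 8431 and 8437 both become 8430, so small flaps
#  in karma no longer produce a diff)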
209 | ## 210 | 211 | for u in chain(j['upvoted'], j['downvoted']): 212 | ## not sure what it is, but flaky from "" to null 213 | u.pop('category', None) 214 | 215 | ## very flaky, often goes from gfycat.com to null 216 | media = u.get('media') 217 | if media is not None: 218 | media.pop('type', None) 219 | if media is None or len(media) == 0: 220 | u.pop('media', None) 221 | 222 | # gallery_data is sometimes flaking to none 223 | 224 | for s in j['subreddits']: 225 | # volatile when we've got enough subreddits -- not worth keeping 226 | s.pop('description', None) 227 | s.pop('public_description', None) 228 | s.pop('public_description_html', None) 229 | s.pop('submit_text', None) 230 | s.pop('submit_text_html', None) 231 | s.pop('disable_contributor_requests', None) 232 | 233 | return j 234 | 235 | 236 | if __name__ == '__main__': 237 | Normaliser.main() 238 | 239 | 240 | def test_reddit_1() -> None: 241 | from bleanser.tests.common import skip_if_no_data 242 | 243 | skip_if_no_data() 244 | 245 | from bleanser.tests.common import TESTDATA, actions 246 | # TODO add a test for multiway 247 | 248 | data = TESTDATA / 'reddit' 249 | paths = sorted(data.glob('*.json*')) 250 | 251 | res = actions(paths=paths, Normaliser=Normaliser) 252 | 253 | assert [p.name for p in res.remaining] == [ 254 | 'reddit_20211227T164130Z.json', # first in group 255 | 'reddit_20211227T170106Z.json', # saved item rolled over 256 | 'reddit_20211227T171058Z.json', # some saved items rolled over 257 | 258 | 'reddit_20211227T173058Z.json', # keeping boundary 259 | 'reddit_20211230T034059Z.json', # some items rolled over 260 | 'reddit_20211230T035056Z.json', # some things legit disappeared due to api limits 261 | 262 | 'reddit_20220102T132059Z.json', # boundary for the next one 263 | 'reddit_20220102T142057Z.json', # author changed (likely deleted?) 
264 | 'reddit_20220102T164059Z.json', # last in group 265 | ] # fmt: skip 266 | 267 | 268 | def test_reddit_2() -> None: 269 | from bleanser.tests.common import skip_if_no_data 270 | 271 | skip_if_no_data() 272 | 273 | from bleanser.tests.common import TESTDATA, actions 274 | 275 | data = TESTDATA / 'reddit2' 276 | paths = sorted(data.glob('*.json*')) 277 | 278 | res = actions(paths=paths, Normaliser=Normaliser) 279 | # note: fieles appear to be spaced out by 20 mins instead of 10 (backup frequency) 280 | # this is ok, because I temporarily moved every other file away in the absence of bleanser 281 | assert [p.name for p in res.remaining] == [ 282 | 'reddit_20210803T121056Z.json', 283 | 284 | # ^v -- identical 285 | 286 | 'reddit_20210803T213053Z.json', 287 | 288 | # here: some saved items rolled over 289 | 'reddit_20210803T215050Z.json', 290 | 291 | 'reddit_20210804T213055Z.json', 292 | ] # fmt: skip 293 | -------------------------------------------------------------------------------- /src/bleanser/modules/rescuetime.py: -------------------------------------------------------------------------------- 1 | from bleanser.core.modules.json import JsonNormaliser 2 | 3 | 4 | class Normaliser(JsonNormaliser): 5 | MULTIWAY = True 6 | PRUNE_DOMINATED = True 7 | 8 | 9 | if __name__ == '__main__': 10 | Normaliser.main() 11 | 12 | 13 | def test_rescuetime() -> None: 14 | from bleanser.tests.common import skip_if_no_data 15 | 16 | skip_if_no_data() 17 | 18 | from bleanser.tests.common import TESTDATA, actions2 19 | 20 | res = actions2(path=TESTDATA / 'rescuetime', rglob='*.json*', Normaliser=Normaliser) 21 | assert res.remaining == [ 22 | 'rescuetime_2018-01-02.json.xz', 23 | 'rescuetime_2018-01-04.json.xz', 24 | 'rescuetime_2018-01-07.json.xz', 25 | 'rescuetime_2018-01-10.json.xz', 26 | 'rescuetime_2018-01-11.json.xz', 27 | # 28 | # todo these should be present in the result for the following group 29 | # not sure how to properly test? 30 | # maybe just grep... 
after applying instructions 31 | # Entry(dt=datetime.datetime(2020, 2, 19, 0, 55), duration_s=9, activity='mobile - com.android.launcher3'), 32 | # Entry(dt=datetime.datetime(2020, 2, 19, 0, 55), duration_s=9, activity='mobile - com.termux'), 33 | 'rescuetime_20200204T010205Z.json', 34 | 'rescuetime_20200219T010207Z.json', 35 | 'rescuetime_20200305T010206Z.json', 36 | # 37 | 'rescuetime_20211209T011109Z.json.xz', 38 | 'rescuetime_20211218T011116Z.json.xz', 39 | 'rescuetime_20211220T011110Z.json.xz', 40 | 'rescuetime_20211224T011109Z.json.xz', 41 | ] 42 | -------------------------------------------------------------------------------- /src/bleanser/modules/rescuetime_android.py: -------------------------------------------------------------------------------- 1 | from bleanser.core.modules.sqlite import SqliteNormaliser, Tool 2 | 3 | 4 | class Normaliser(SqliteNormaliser): 5 | MULTIWAY = True 6 | PRUNE_DOMINATED = True 7 | 8 | def check(self, c) -> None: 9 | tables = Tool(c).get_tables() 10 | events = tables['LoggedEvent'] 11 | assert 'started' in events, events 12 | assert 'appName' in events, events 13 | 14 | def cleanup(self, c) -> None: 15 | self.check(c) 16 | 17 | t = Tool(c) 18 | t.drop('ScanningPause') # not sure what is it, but seems to be some sort of helper table 19 | t.drop('SentryLogEntry') # some internal logging, contributes to tons of changes 20 | # todo there is also TimeLog, but it seems that they are also write only and consistent so don't impact diffs 21 | 22 | 23 | if __name__ == '__main__': 24 | Normaliser.main() 25 | -------------------------------------------------------------------------------- /src/bleanser/modules/skype_android.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from bleanser.core.modules.json import delkeys 4 | from bleanser.core.modules.sqlite import SqliteNormaliser, Tool 5 | 6 | 7 | class Normaliser(SqliteNormaliser): 8 | MULTIWAY = True 9 | PRUNE_DOMINATED = True 10 | 11 | def check(self, c) -> None: 12 | tables = Tool(c).get_tables() 13 | messages = tables['conversationsv14'] 14 | assert 'nsp_data' in messages, messages 15 | 16 | def cleanup(self, c) -> None: 17 | self.check(c) 18 | 19 | t = Tool(c) 20 | t.drop('conversationsv14_searchTerms_content') 21 | t.drop('conversationsv14_searchTerms_segments') 22 | t.drop('conversationsv14_searchTerms_segdir') 23 | 24 | t.drop('internaldata') # very volatile 25 | 26 | t.drop('telemetrycachev3') # volatile, nothing interesting here 27 | 28 | def _cleanup_jsons(s): 29 | if s is None: 30 | return None 31 | j = json.loads(s) 32 | delkeys( 33 | j, 34 | keys=[ 35 | 'fetchedDate', # from profilecachev8, very volatile 36 | 'up', # from miniprofilecachev8, very volatile 37 | ], 38 | ) 39 | return json.dumps(j) 40 | 41 | c.create_function("CLEANUP_JSONS", 1, _cleanup_jsons) 42 | list(c.execute('UPDATE profilecachev8 SET nsp_data = CLEANUP_JSONS(nsp_data)')) 43 | list(c.execute('UPDATE miniprofilecachev8 SET nsp_data = CLEANUP_JSONS(nsp_data)')) 44 | 45 | 46 | if __name__ == '__main__': 47 | Normaliser.main() 48 | -------------------------------------------------------------------------------- /src/bleanser/modules/sleepasandroid_android.py: -------------------------------------------------------------------------------- 1 | from bleanser.core.modules.sqlite import SqliteNormaliser, Tool 2 | 3 | 4 | class Normaliser(SqliteNormaliser): 5 | MULTIWAY = False # could use it, but no need really? 
6 | PRUNE_DOMINATED = True 7 | 8 | def check(self, c) -> None: 9 | tables = Tool(c).get_tables() 10 | assert 'noise' in tables, tables 11 | assert 'records' in tables, tables 12 | 13 | def cleanup(self, c) -> None: 14 | self.check(c) 15 | 16 | # if not finished it's gonna constantly change 17 | res = c.execute('DELETE FROM records WHERE finished = 0') 18 | assert res.rowcount <= 1, res.rowcount 19 | 20 | 21 | if __name__ == '__main__': 22 | Normaliser.main() 23 | -------------------------------------------------------------------------------- /src/bleanser/modules/smscalls.py: -------------------------------------------------------------------------------- 1 | from bleanser.core.modules.xml import Normaliser as XmlNormaliser 2 | 3 | 4 | class Normaliser(XmlNormaliser): 5 | MULTIWAY = True 6 | PRUNE_DOMINATED = True 7 | 8 | def cleanup(self, t): 9 | # volatile attributes 10 | del t.attrib['count'] 11 | del t.attrib['backup_date'] 12 | del t.attrib['backup_set'] 13 | return t 14 | 15 | 16 | if __name__ == '__main__': 17 | Normaliser.main() 18 | 19 | 20 | def test_smscalls() -> None: 21 | from bleanser.tests.common import skip_if_no_data 22 | 23 | skip_if_no_data() 24 | 25 | from bleanser.tests.common import TESTDATA, actions 26 | 27 | data = TESTDATA / 'smscalls' 28 | paths = sorted(data.glob('*.xml*')) 29 | 30 | res = actions(paths=paths, Normaliser=Normaliser) 31 | 32 | assert [p.name for p in res.remaining] == [ 33 | 'calls-20161211023623.xml', 34 | 'calls-20161218221620.xml', 35 | 'calls-20170308050001.xml', 36 | # 'calls-20170309065640.xml', 37 | 'calls-20170310063055.xml', 38 | # 'calls-20170311050001.xml', 39 | # 'calls-20170312050001.xml', 40 | # 'calls-20170313050001.xml', 41 | # 'calls-20170314051813.xml', 42 | 'calls-20170315050001.xml', 43 | 44 | # 'calls-20210901043042.xml', 45 | 'calls-20210902043044.xml', 46 | # 'calls-20210903043044.xml', 47 | # 'calls-20210904060930.xml', 48 | # 'calls-20210905043030.xml', 49 | # 'calls-20210906043031.xml', 50 | 'calls-20210907043032.xml', 51 | 'calls-20210908043032.xml', 52 | 53 | 'sms-20211008043028.xml', 54 | # 'sms-20211009043028.xml' 55 | 'sms-20211010043029.xml', 56 | # 'sms-20211011043029.xml', 57 | 'sms-20211012065557.xml', 58 | # 'sms-20211013043058.xml', 59 | # 'sms-20211014043058.xml', 60 | # 'sms-20211015043059.xml', 61 | # 'sms-20211016043059.xml', 62 | # 'sms-20211017043000.xml', 63 | # 'sms-20211018045758.xml', 64 | # 'sms-20211019043059.xml', 65 | # 'sms-20211020043100.xml', 66 | # 'sms-20211021043000.xml', 67 | # 'sms-20211022044756.xml', 68 | # 'sms-20211023043057.xml', 69 | # 'sms-20211024043057.xml', 70 | # 'sms-20211025043057.xml', 71 | # 'sms-20211026051803.xml', 72 | # 'sms-20211027043004.xml', 73 | # 'sms-20211028043004.xml', 74 | 'sms-20211029043004.xml', 75 | # 'sms-20211030043005.xml', 76 | # 'sms-20211031043005.xml', 77 | # 'sms-20211101043006.xml', 78 | # 'sms-20211102043006.xml', 79 | # 'sms-20211103043007.xml', 80 | # 'sms-20211104043007.xml', 81 | # 'sms-20211105102901.xml', 82 | # 'sms-20211106043002.xml', 83 | # 'sms-20211107043002.xml', 84 | # 'sms-20211108043003.xml', 85 | # 'sms-20211109043004.xml', 86 | 'sms-20211110043004.xml', 87 | ] # fmt: skip 88 | -------------------------------------------------------------------------------- /src/bleanser/modules/spotify.py: -------------------------------------------------------------------------------- 1 | from bleanser.core.modules.json import Json, JsonNormaliser, delkeys 2 | 3 | 4 | class Normaliser(JsonNormaliser): 5 | MULTIWAY = True 6 | PRUNE_DOMINATED = 
True 7 | 8 | def cleanup(self, j: Json) -> Json: 9 | delkeys( 10 | j, 11 | keys={ 12 | 'popularity', # flaky -- relative to other artists, not interesting 13 | 'album_type', # sometimes flaky between 'album' and 'compilation' 14 | ## flaky metadata (maybe not even worth backing up..) 15 | 'available_markets', 16 | 'images', 17 | 'total_episodes', 18 | 'preview_url', 19 | 'release_date', 20 | 'external_ids', 21 | ## 22 | 'snapshot_id', # present on playlists, basically hash 23 | }, 24 | ) 25 | 26 | if isinstance(j, list): 27 | # old format, I think this was just 'Liked' playlist 28 | return j 29 | 30 | ## 'flatten' to make it possible to properly diff 31 | playlists = j['playlists'] 32 | upd_playlists = [] 33 | for p in playlists: 34 | _pname = p['name'] 35 | if p['owner']['id'] == 'spotify': 36 | # these are typically autogenerated playlists like 37 | # - "This Is " artist playlists 38 | # - mix between two users 39 | # they change very often and no point keeping track of them 40 | continue 41 | pid = p['id'] 42 | j[f'playlist_{pid}_tracks'] = p['tracks'] 43 | upd_playlists.append(p) 44 | del p['tracks'] 45 | j['playlists'] = upd_playlists 46 | ## 47 | 48 | # TODO ugh. tbh, not sure what to do with recently_played -- api only allows recent 50? 49 | # so they are bound to change super often if you listen to music daily (+ you might even miss some tracks anyway) 50 | 51 | return j 52 | 53 | 54 | if __name__ == '__main__': 55 | Normaliser.main() 56 | -------------------------------------------------------------------------------- /src/bleanser/modules/spotifyexport.py: -------------------------------------------------------------------------------- 1 | from bleanser.core.modules.json import Json, JsonNormaliser, delkeys 2 | 3 | 4 | class Normaliser(JsonNormaliser): 5 | def cleanup(self, j: Json) -> None: 6 | ## these change for no reason, and probably no one cares about them 7 | delkeys( 8 | j, 9 | keys={ 10 | 'images', 11 | 'available_markets', 12 | 'popularity', 13 | 'preview_url', 14 | 'external_urls', 15 | 'total_episodes', 16 | }, 17 | ) 18 | ## 19 | 20 | # TODO hmm. it changes often... but then it's kind of a useful info.. 21 | # del j['recently_played'] 22 | 23 | 24 | if __name__ == '__main__': 25 | Normaliser.main() 26 | -------------------------------------------------------------------------------- /src/bleanser/modules/stackexchange.py: -------------------------------------------------------------------------------- 1 | from bleanser.core.modules.json import Json, JsonNormaliser, delkeys 2 | 3 | 4 | class Normaliser(JsonNormaliser): 5 | MULTIWAY = True 6 | PRUNE_DOMINATED = True 7 | 8 | def cleanup(self, j: Json) -> Json: 9 | delkeys( 10 | j, 11 | keys=[ 12 | ## these are change all the time, and I guess if you were interested in any 'real time' dynamics 13 | ## you wouldn't use periodic backups anyway, just write a proper polling tool 14 | ## especially considering they are cumulative, fine to prune out 15 | 'reputation', 16 | 'view_count', 17 | 'favorite_count', 18 | 'up_vote_count', 19 | 'down_vote_count', 20 | 'answer_count', 21 | 'score', 22 | ## 23 | ## 24 | 'reputation_change_week', 25 | 'reputation_change_month', 26 | 'reputation_change_quarter', 27 | 'reputation_change_year', 28 | 'profile_image', 29 | 'last_access_date', # last time user loggen in? very flaky 30 | ], 31 | ) 32 | 33 | ## 34 | # the json maps from 'domain' (e.g. 
math/english/apple) to the payload with various interesting data 35 | # so we wanna flatten it first 36 | nj = {} 37 | for domain, d in j.items(): 38 | for k, v in d.items(): 39 | nj[f'{domain}_{k}'] = v 40 | j = nj 41 | ## 42 | 43 | ## 44 | for k in list(j.keys()): 45 | if k.endswith('/privileges'): # useless crap, achievements/badges 46 | del j[k] 47 | ## 48 | return j 49 | 50 | 51 | if __name__ == '__main__': 52 | Normaliser.main() 53 | -------------------------------------------------------------------------------- /src/bleanser/modules/talon_android.py: -------------------------------------------------------------------------------- 1 | from bleanser.core.modules.sqlite import SqliteNormaliser, Tool 2 | 3 | 4 | class Normaliser(SqliteNormaliser): 5 | MULTIWAY = True 6 | PRUNE_DOMINATED = True 7 | 8 | def check(self, c) -> None: 9 | _tables = Tool(c).get_tables() 10 | # TODO add something later 11 | 12 | def cleanup(self, c) -> None: 13 | self.check(c) 14 | 15 | t = Tool(c) 16 | # for some reason flaking between en/en_US 17 | t.drop('android_metadata') 18 | 19 | 20 | if __name__ == '__main__': 21 | Normaliser.main() 22 | -------------------------------------------------------------------------------- /src/bleanser/modules/tiktok_android.py: -------------------------------------------------------------------------------- 1 | from bleanser.core.modules.sqlite import SqliteNormaliser, Tool 2 | 3 | 4 | class Normaliser(SqliteNormaliser): 5 | MULTIWAY = True 6 | PRUNE_DOMINATED = True 7 | 8 | ALLOWED_BLOBS = { 9 | ('msg', 'content_pb'), 10 | ('im_search_index_official_segments', '*'), 11 | ('im_search_index_official_segdir', '*'), 12 | ('im_search_index_official_docsize', '*'), 13 | ('im_search_index_official_stat', '*'), 14 | } 15 | 16 | def check(self, c) -> None: 17 | tables = Tool(c).get_tables() 18 | 19 | messages = tables['msg'] 20 | assert 'msg_uuid' in messages 21 | assert 'content' in messages 22 | 23 | def cleanup(self, c) -> None: 24 | self.check(c) 25 | 26 | 27 | if __name__ == '__main__': 28 | Normaliser.main() 29 | -------------------------------------------------------------------------------- /src/bleanser/modules/tinder_android.py: -------------------------------------------------------------------------------- 1 | from sqlite3 import Connection 2 | 3 | from bleanser.core.modules.sqlite import SqliteNormaliser, Tool 4 | 5 | 6 | class Normaliser(SqliteNormaliser): 7 | MULTIWAY = True 8 | PRUNE_DOMINATED = True 9 | 10 | def check(self, c: Connection) -> None: 11 | tool = Tool(c) 12 | tables = tool.get_tables() 13 | matches = tables['match'] 14 | assert 'person_id' in matches, matches 15 | 16 | messages = tables['message'] 17 | assert 'text' in messages, messages 18 | assert 'match_id' in messages, messages 19 | 20 | def cleanup(self, c: Connection) -> None: 21 | self.check(c) 22 | 23 | t = Tool(c) 24 | 25 | t.drop( 26 | 'instagram_broken', 27 | 'explore_attribution', 28 | # 29 | ## messages from Tinder itself 30 | 'inbox_message', 31 | 'inbox_message_images', 32 | 'inbox_message_text_formatting', 33 | ## 34 | ) 35 | 36 | # eh, don't think it impacts anyway 37 | # t.drop('contextual_match') 38 | # it contains some photos? dunno 39 | 40 | # some odd id that increases with no impact for other data 41 | t.drop_cols(table='profile_media', cols=['client_sequential_id']) 42 | 43 | t.drop_cols(table='match_seen_state', cols=['match_id', 'last_message_seen_id']) 44 | 45 | t.drop('match_your_turn_state') 46 | 47 | # TODO profile_descriptor?? 
blob containing presumably profile info, and sometimes jumps quite a bit 48 | 49 | # this one contributes to _a lot_ of changes, like 40% 50 | # and I guess if we properly wanted to track when app was activated, we'd need a different mechanism anyway 51 | t.drop('last_activity_date') 52 | 53 | # hmm what is match_harassing_message?? 54 | 55 | # TODO not sure about this? 56 | # t.drop_cols('match', cols=[ 57 | # 'last_activity_date', 58 | # ]) 59 | 60 | # TODO profile_descriptor changes quite a lot? not sure 61 | 62 | # match->last_activity_date -- hmmm changing quite a bit? is it interesting? not sure 63 | # 64 | # message->is_liked -- not sure if worth keeping... only for finding out the first change? 65 | # 66 | # match_read_receipt -- what is it?? 67 | # match_id last_seen_message_id seen_timestamp 68 | # seems that last last_seen_message_id can be restored from messages table... but seen_timestamp is unique? 69 | 70 | # NOTE: for 'extract' mode 71 | # match->is_blocked 72 | 73 | 74 | if __name__ == '__main__': 75 | Normaliser.main() 76 | 77 | 78 | def test_tinder() -> None: 79 | from bleanser.tests.common import skip_if_no_data 80 | 81 | skip_if_no_data() 82 | 83 | from bleanser.tests.common import TESTDATA, actions2 84 | 85 | res = actions2(path=TESTDATA / 'tinder_android', rglob='**/*.db*', Normaliser=Normaliser) 86 | 87 | assert res.remaining == [ 88 | '20210523193545/tinder-3.db', # keep, first in group 89 | # '20210916214349/tinder-3.db', # MOVE 90 | # '20210916223254/tinder-3.db', # MOVE 91 | '20210916232749/tinder-3.db', # keep, some likes changes etc 92 | '20210917004827/tinder-3.db', 93 | '20210917014719/tinder-3.db', 94 | # '20210917015444/tinder-3.db', 95 | # '20210917031235/tinder-3.db', # MOVE 96 | '20210917060029/tinder-3.db', 97 | 98 | 99 | '20211007060802/tinder-3.db', # keep, first in group 100 | # '20211007090109/tinder-3.db', 101 | # '20211007094056/tinder-3.db', 102 | # '20211007115318/tinder-3.db', 103 | # '20211007133114/tinder-3.db', 104 | # '20211007143940/tinder-3.db', 105 | # '20211007155908/tinder-3.db', 106 | '20211007165243/tinder-3.db', 107 | '20211007180708/tinder-3.db', # keep, bio changed 108 | 109 | '20211225050314/tinder-3.db', # keep: first in group 110 | # '20211225193930/tinder-3.db', 111 | # '20211226052237/tinder-3.db', 112 | # '20211226091116/tinder-3.db', 113 | # '20211226135158/tinder-3.db', 114 | # '20211227002918/tinder-3.db', 115 | # '20211227044403/tinder-3.db', 116 | '20211227145813/tinder-3.db', # keep: last in group 117 | ] # fmt: skip 118 | -------------------------------------------------------------------------------- /src/bleanser/modules/twitter_android.py: -------------------------------------------------------------------------------- 1 | """ 2 | Normalises data for official twitter Android app 3 | """ 4 | 5 | from bleanser.core.modules.sqlite import SqliteNormaliser, Tool 6 | 7 | 8 | class Normaliser(SqliteNormaliser): 9 | MULTIWAY = True 10 | PRUNE_DOMINATED = True 11 | 12 | def check(self, c) -> None: 13 | tables = Tool(c).get_tables() 14 | 15 | statuses = tables['statuses'] 16 | assert 'status_id' in statuses 17 | assert 'content' in statuses 18 | 19 | [(total_statuses,)] = c.execute('SELECT COUNT(*) FROM statuses') 20 | assert total_statuses > 10 # sanity check 21 | 22 | [(statuses_without_content,)] = c.execute('SELECT COUNT(*) FROM statuses WHERE content IS NULL') 23 | # another sanity check -- to make sure the content is actually stored in this column and not lost during migrations 24 | assert statuses_without_content == 
0 25 | 26 | _timeline = tables['timeline'] 27 | 28 | def cleanup(self, c) -> None: 29 | self.check(c) 30 | 31 | t = Tool(c) 32 | 33 | # some sort of crappy analytics -- A LOT of it 34 | # I actually suspect it's the bulk of this database? removing it makes cleanup considerably faster 35 | t.drop('feedback_action') 36 | t.drop('timeline_feedback_actions') 37 | 38 | t.drop('promoted_retry') 39 | 40 | t.drop('card_state') # only has a couple of rows which are always changing.. some policy crap 41 | 42 | t.drop('status_groups') # doesn't looks like anything interesting, contains read state? 43 | 44 | t.drop('retweets') # seems like it contains last retweet for each tweet or something.. doesn't actually have tweet data 45 | 46 | t.drop('tokens') # some internal thing 47 | 48 | t.drop_cols( 49 | 'statuses', 50 | cols=[ 51 | '_id', # internal id 52 | ## volatile 53 | 'favorite_count', 54 | 'retweet_count', 55 | 'view_count_info', 56 | 'reply_count', 57 | 'bookmark_count', 58 | 'quote_count', 59 | 'tweet_source', # sometimes NULL at first? 60 | 'flags', 61 | 'self_thread_id', 62 | 'edit_control', # no idea what it is 63 | 'unmention_info', # no idea, some binary crap (not even text) 64 | 'quick_promote_eligibility', 65 | 'quoted_status_permalink', 66 | 'conversation_control', 67 | ## 68 | # 69 | 'r_ent_content', # contains same data as 'content' 70 | # 71 | # cards contain some extra data embedded from the website (e.g. preview) 72 | # might be actually useful to extract data from it 73 | 'card', 74 | 'unified_card', 75 | ], 76 | ) 77 | 78 | # NOTE: in principle tweet data is all in statues table 79 | # but we need timeline to reconstruct some feeds (e.g. users own tweets) 80 | t.drop_cols( 81 | 'timeline', 82 | cols=[ 83 | '_id', # internal id 84 | ## volatile 85 | 'is_read', 86 | 'sort_index', 87 | 'timeline_chunk_id', 88 | 'updated_at', 89 | 'scribe_content', # some "for you" crap 90 | 'created_at', # internal created at, not tweet's 91 | 'feedback_action_prompts', 92 | 'social_context', 93 | 'is_linger_impressed', 94 | 'dismissed', 95 | ## 96 | ], 97 | ) 98 | 99 | c.execute(''' 100 | DELETE FROM timeline 101 | WHERE entity_group_id LIKE "%cursor%" 102 | OR entity_group_id LIKE "%who-to-follow%" 103 | OR entity_group_id LIKE "%trends%" 104 | OR entity_group_id LIKE "%semantic%" 105 | OR entity_group_id LIKE "%promoted%" 106 | OR entity_group_id LIKE "%home-conversation%" 107 | OR entity_group_id LIKE "%notification%" 108 | OR entity_id LIKE "%trends%" 109 | OR entity_id LIKE "%superhero%" 110 | ''') 111 | 112 | # after all decided to drop 'timeline' completely.. all actual data is in statuses table anyway 113 | # - the vast majority of volatile entrites in it are type == 17 (not sure what it is) 114 | # - it also contains non-user timelines (e.g. when you open someone's profile in twitter app) 115 | t.drop('timeline') 116 | 117 | t.drop('users') # they change all the time and probs not worth keeping all changes 118 | 119 | ## they are empty most of the time? sometimes contains an odd item for some reason 120 | t.drop('user_groups') 121 | t.drop('user_metadata') 122 | ## 123 | 124 | def remove_volatile_content(s): 125 | if s is None: 126 | return None 127 | xxx = s.find(bytes.fromhex('00695858583869306938306938496a')) 128 | if xxx == -1: 129 | return s 130 | else: 131 | return s[:xxx] 132 | # if b'movie trailer' in s: 133 | print(s.hex(), type(s)) 134 | return s 135 | 136 | # ugh... a few tweets sometimes have some binary changes?? 137 | # also this doesn't seem to solve everything sadly.. 
so for now commenting 138 | # c.create_function('REMOVE_VOLATILE_CONTENT', 1, remove_volatile_content) 139 | # list(c.execute('UPDATE statuses SET content = REMOVE_VOLATILE_CONTENT(content)')) 140 | 141 | # so it's a bit shit, but content shouldn't really change, and seems too hard to filter out these changes in binary blobs here 142 | # except edited tweets? but I have a feeling editing is controlled by timeline.updated or something 143 | # either way it would be so rare it will likely be caught collaterally by other data changes 144 | c.execute("UPDATE statuses SET content = X'BABABA'") 145 | 146 | 147 | if __name__ == '__main__': 148 | Normaliser.main() 149 | -------------------------------------------------------------------------------- /src/bleanser/modules/vk_android.py: -------------------------------------------------------------------------------- 1 | from bleanser.core.modules.sqlite import SqliteNormaliser, Tool 2 | 3 | 4 | class Normaliser(SqliteNormaliser): 5 | MULTIWAY = True 6 | PRUNE_DOMINATED = True 7 | 8 | # even though we drop some of these, currently columns are dropped via erasing the content, not altering table 9 | # so need to keep here too 10 | ALLOWED_BLOBS = { 11 | ('channel_messages', 'attach'), 12 | 13 | ('messages', 'avatar'), 14 | ('messages', 'attach'), 15 | ('messages', 'carousel'), 16 | ('messages', 'nested'), 17 | ('messages', 'keyboard_buttons'), 18 | 19 | ('users', 'avatar'), 20 | ('users', 'image_status'), 21 | ('contacts', 'avatar'), 22 | ('groups', 'avatar'), 23 | 24 | ('dialogs', 'bar_buttons'), 25 | ('dialogs', 'chat_settings_members_active'), 26 | ('dialogs', 'chat_settings_admins'), 27 | ('dialogs', 'chat_settings_avatar'), 28 | ('dialogs', 'draft_msg'), 29 | ('dialogs', 'expire_msg_vk_ids'), 30 | ('dialogs', 'group_call_participants'), 31 | ('dialogs', 'keyboard_buttons'), 32 | ('dialogs', 'pinned_msg_attaches'), 33 | ('dialogs', 'pinned_msg_nested'), 34 | ('dialogs', 'pinned_carousel'), 35 | ('dialogs', 'unread_mention_msg_vk_ids'), 36 | 37 | ('mutual_friends', 'mutual_friends_ids'), 38 | } # fmt: skip 39 | 40 | def is_vkim(self, c) -> bool: 41 | tables = Tool(c).get_tables() 42 | if 'messages' in tables: 43 | return True 44 | else: 45 | # otherwise must be vk.db 46 | return False 47 | 48 | def check(self, c) -> None: 49 | tables = Tool(c).get_tables() 50 | if self.is_vkim(c): 51 | msgs = tables['messages'] 52 | assert 'vk_id' in msgs, msgs 53 | assert 'time' in msgs, msgs 54 | 55 | dialogs = tables['dialogs'] 56 | assert 'id' in dialogs, dialogs 57 | else: 58 | users = tables['users'] 59 | assert 'uid' in users, users 60 | assert 'firstname' in users, users 61 | 62 | def cleanup_vk_db(self, c) -> None: 63 | t = Tool(c) 64 | t.drop(table='friends_hints_order') 65 | t.drop_cols( 66 | table='users', 67 | cols=[ 68 | # TODO hmm lately (202309), is_friend seems to be flaky for no reason? even where there are no status changes 69 | 'last_updated', 70 | 'photo_small', 71 | 'lists', # very flaky for some reason, sometimes just flips to 0?? 
72 | 'name_r', # seems derived from first/last name, and is very flaky 73 | ], 74 | ) 75 | 76 | def cleanup(self, c) -> None: 77 | self.check(c) # todo could also call 'check' after just in case 78 | 79 | if not self.is_vkim(c): 80 | self.cleanup_vk_db(c) 81 | return 82 | 83 | t = Tool(c) 84 | 85 | for table in [ 86 | 'peers_search_content', 87 | 'peers_search_segments', 88 | 'peers_search_segdir', 89 | 'peers_search_docsize', 90 | 'peers_search_stat', 91 | 'messages_search_segments', 92 | 'messages_search_segdir', 93 | 'messages_search_docsize', 94 | 'messages_search_stat', 95 | 'messages_search_content', 96 | # 97 | 'key_value', # nothing interesting here 98 | 'integer_generator', # lol 99 | # 100 | ## no data, just some internal tracking 101 | 'dialogs_history_count', 102 | 'dialogs_history_meta', 103 | 'dialog_weight', 104 | ## 105 | ]: 106 | t.drop(table=table) 107 | 108 | t.drop_cols( 109 | table='users', 110 | cols=[ 111 | 'avatar', # flaky and no point tracking really 112 | 'image_status', 113 | ## flaky timestamps 114 | 'sync_time_overall', 115 | 'sync_time_online', 116 | 'online_last_seen', 117 | ## 118 | 'online_app_id', 119 | 'online_type', 120 | ], 121 | ) 122 | 123 | t.drop_cols( 124 | table='contacts', 125 | cols=[ 126 | 'avatar', 127 | 'sync_time', # flaky 128 | 'last_seen_status', # flaky 129 | ], 130 | ) 131 | 132 | t.drop_cols( 133 | table='dialogs', 134 | cols=[ 135 | 'sort_id_server', 136 | 'sort_id_local', 137 | 'weight', 138 | 'read_till_in_msg_vk_id', 139 | 'read_till_out_msg_vk_id', 140 | 'last_msg_vk_id', 141 | 'read_till_in_msg_vk_id_local', 142 | 'read_till_in_msg_cnv_id', 143 | 'read_till_out_msg_cnv_id', 144 | 'last_msg_cnv_id', 145 | 'count_unread', 146 | 'count_unread_local', 147 | 'keyboard_visible', 148 | 'draft_msg', 149 | 'bar_name', 150 | 'bar_exists', 151 | 'bar_buttons', 152 | 'bar_text', 153 | 'bar_icon', 154 | ], 155 | ) 156 | 157 | t.drop_cols( 158 | table='messages', 159 | cols=[ 160 | ## seems flaky -- not sure why, hard to tell since it's a binary blob 161 | 'attach', 162 | 'nested', 163 | ## 164 | 'phase_id', # not sure what is it, some internal stuff 165 | ], 166 | ) 167 | 168 | t.drop_cols( 169 | table='groups', 170 | cols=[ 171 | 'avatar', 172 | 'sync_time', 173 | 'members_count', 174 | ], 175 | ) 176 | 177 | 178 | if __name__ == '__main__': 179 | Normaliser.main() 180 | -------------------------------------------------------------------------------- /src/bleanser/modules/xml_clean.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | from bleanser.core.modules.xml import * # noqa: F401, F403 4 | 5 | warnings.warn("Module 'bleanser.modules.xml_clean' is deprecated. 
Use 'bleanser.core.modules.xml_clean' instead.", DeprecationWarning) 6 | 7 | 8 | if __name__ == '__main__': 9 | Normaliser.main() # noqa: F405 10 | -------------------------------------------------------------------------------- /src/bleanser/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/karlicoss/bleanser/418c361ee05621af2b1553d40a3618f2cf98b323/src/bleanser/py.typed -------------------------------------------------------------------------------- /src/bleanser/tests/common.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | from contextlib import contextmanager 5 | from dataclasses import dataclass 6 | from pathlib import Path 7 | 8 | import pytest 9 | 10 | TESTDATA = Path(__file__).absolute().parent / 'testdata' 11 | 12 | 13 | def skip_if_no_data() -> None: 14 | if 'CI' in os.environ and not TESTDATA.exists(): 15 | pytest.skip('test only works on @karlicoss private data for now') 16 | 17 | 18 | @dataclass 19 | class Res: 20 | pruned: list[Path] 21 | remaining: list[Path] 22 | 23 | 24 | def actions(*, paths: list[Path], Normaliser, threads: int | None = None) -> Res: 25 | from bleanser.core.common import Keep, Prune 26 | from bleanser.core.processor import compute_instructions 27 | 28 | instructions = list(compute_instructions(paths, Normaliser=Normaliser, threads=threads)) 29 | pruned = [] 30 | remaining = [] 31 | for i in instructions: 32 | if isinstance(i, Prune): 33 | pruned.append(i.path) 34 | elif isinstance(i, Keep): 35 | remaining.append(i.path) 36 | else: 37 | raise RuntimeError(type(i)) 38 | return Res(pruned=pruned, remaining=remaining) 39 | 40 | 41 | @dataclass 42 | class Res2: 43 | pruned: list[str] 44 | remaining: list[str] 45 | 46 | 47 | def actions2(*, path: Path, rglob: str, Normaliser, threads: int | None = None) -> Res2: 48 | from bleanser.core.main import _get_paths 49 | 50 | pp = str(path) + os.sep + rglob 51 | paths = _get_paths(path=pp, glob=True, from_=None, to=None) 52 | res = actions(paths=paths, Normaliser=Normaliser, threads=threads) 53 | pruned = res.pruned 54 | remaining = res.remaining 55 | return Res2( 56 | pruned =[str(c.relative_to(path)) for c in pruned ], 57 | remaining=[str(c.relative_to(path)) for c in remaining], 58 | ) # fmt: skip 59 | 60 | 61 | @contextmanager 62 | def hack_attribute(Normaliser, key, value): 63 | prev = getattr(Normaliser, key) 64 | try: 65 | # TODO meh.. maybe instead instantiate an instance instead of class? 66 | setattr(Normaliser, key, value) 67 | yield 68 | finally: 69 | setattr(Normaliser, key, prev) 70 | -------------------------------------------------------------------------------- /src/bleanser/tests/test_binary.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pathlib import Path 4 | 5 | import pytest 6 | 7 | from bleanser.modules.binary import Normaliser 8 | from bleanser.tests.common import TESTDATA, actions, hack_attribute, skip_if_no_data 9 | 10 | # TODO ugh. how to make relative imports work? pytest doesn't like them... 11 | 12 | 13 | def via_fdupes(path: Path) -> list[str]: 14 | from subprocess import check_output 15 | 16 | lines = check_output(['fdupes', '-1', path]).decode('utf8').splitlines() 17 | to_delete = [] 18 | for line in lines: 19 | items = line.split() 20 | # meh... don't get why it's not processing them in order... 
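        # fdupes output isn't necessarily in filename order, so sort first;
        # keeping the lexicographically first and last file of each duplicate group
        # (and marking everything in between for deletion) is what the assertions
        # below compare against bleanser's own pruning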
21 | items = sorted(items) 22 | to_delete.extend(items[1:-1]) 23 | return sorted(to_delete) 24 | 25 | 26 | # TODO maybe add some sanity checks? 27 | # e.g. try guessing dates from filenames and making sure they are consistent with mtimes? 28 | # todo need to resort removing to a single command 29 | # and check 'remove' mode separately 30 | @pytest.mark.parametrize( 31 | 'data', 32 | [ 33 | TESTDATA / 'instapaper', 34 | TESTDATA / 'hypothesis_xz', 35 | ], 36 | ) 37 | def test_all(data: Path) -> None: 38 | skip_if_no_data() 39 | 40 | paths = sorted(data.glob('*.json*')) 41 | assert len(paths) > 20, paths # precondition 42 | 43 | with hack_attribute(Normaliser, '_DIFF_FILTER', None): 44 | res = actions(paths=paths, Normaliser=Normaliser) 45 | 46 | expected_deleted = [Path(p) for p in via_fdupes(path=data)] 47 | assert res.pruned == expected_deleted 48 | 49 | 50 | # FIXME hmm need to make sure --dry is the default (maybe add a cmdline test?) 51 | -------------------------------------------------------------------------------- /src/bleanser/tests/test_hypothesis.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | from bleanser.core.modules.json import JsonNormaliser as Normaliser 6 | from bleanser.tests.common import TESTDATA, actions, hack_attribute, skip_if_no_data 7 | 8 | data = TESTDATA / 'hypothesis' 9 | 10 | 11 | # total time about 5s? 12 | @pytest.mark.parametrize('num', range(10)) 13 | def test_normalise_one(tmp_path: Path, num: int) -> None: # noqa: ARG001 14 | skip_if_no_data() 15 | 16 | path = data / 'hypothesis_20210625T220028Z.json' 17 | n = Normaliser(original=path, base_tmp_dir=tmp_path) 18 | with n.do_normalise(): 19 | pass 20 | 21 | 22 | # TODO less verbose mode for tests? 23 | def test_all() -> None: 24 | skip_if_no_data() 25 | 26 | # todo share with main 27 | paths = sorted(data.glob('*.json')) 28 | assert len(paths) > 80, paths # precondition 29 | 30 | # 4 workers: 64 seconds 31 | # 4 workers, pool for asdict: 42 seconds.. 32 | # 2 workers: 81 seconds. 
hmmm 33 | with hack_attribute(Normaliser, key='PRUNE_DOMINATED', value=True): 34 | res = actions(paths=paths, Normaliser=Normaliser, threads=4) 35 | remaining = {p.name for p in res.remaining} 36 | assert 0 < len(remaining) < len(paths), remaining # sanity check 37 | 38 | assert { 39 | 'hypothesis_2017-11-21.json', 40 | 'hypothesis_2019-06-11.json', 41 | 'hypothesis_2019-08-18.json', 42 | 'hypothesis_20190923T003014Z.json', 43 | 'hypothesis_20191216T123012Z.json', 44 | 'hypothesis_20200325T140016Z.json', 45 | 'hypothesis_20200720T140043Z.json', 46 | 'hypothesis_20200828T123032Z.json', 47 | 'hypothesis_20201012T140035Z.json', 48 | 'hypothesis_20210223T213023Z.json', 49 | 'hypothesis_20210625T220028Z.json', 50 | }.issubset(remaining), remaining 51 | # issubset because concurrency might end up in leaving more files than the absolute minimum 52 | 53 | assert len(remaining) < 30, remaining 54 | 55 | 56 | # FIXME check move mode 57 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | minversion = 3.21 3 | # relies on the correct version of Python installed 4 | envlist = ruff,tests,mypy,mypy-hpi 5 | # https://github.com/tox-dev/tox/issues/20#issuecomment-247788333 6 | # hack to prevent .tox from crapping to the project directory 7 | toxworkdir = {env:TOXWORKDIR_BASE:}{toxinidir}/.tox 8 | 9 | [testenv] 10 | # TODO how to get package name from setuptools? 11 | package_name = "bleanser" 12 | pass_env = 13 | # useful for tests to know they are running under ci 14 | CI 15 | CI_* 16 | # respect user's cache dirs to prevent tox from crapping into project dir 17 | PYTHONPYCACHEPREFIX 18 | MYPY_CACHE_DIR 19 | RUFF_CACHE_DIR 20 | 21 | set_env = 22 | # do not add current working directory to pythonpath 23 | # generally this is more robust and safer, prevents weird issues later on 24 | PYTHONSAFEPATH=1 25 | 26 | # default is 'editable', in which tox builds wheel first for some reason? not sure if makes much sense 27 | package = uv-editable 28 | 29 | 30 | [testenv:ruff] 31 | skip_install = true 32 | dependency_groups = testing 33 | commands = 34 | {envpython} -m ruff check \ 35 | {posargs} 36 | 37 | 38 | [testenv:tests] 39 | dependency_groups = testing 40 | extras = 41 | extra 42 | json 43 | xml 44 | commands = 45 | # posargs allow test filtering, e.g. tox ... -- -k test_name 46 | {envpython} -m pytest \ 47 | --pyargs {[testenv]package_name} --ignore-glob 'src/bleanser/modules/hpi/*' \ 48 | {posargs} 49 | 50 | 51 | [testenv:mypy] 52 | dependency_groups = testing 53 | extras = 54 | extra 55 | json 56 | xml 57 | commands = 58 | {envpython} -m mypy --no-install-types \ 59 | # note: hpi modules are tested below 60 | -p {[testenv]package_name} --exclude 'hpi/*' \ 61 | # txt report is a bit more convenient to view on CI 62 | --txt-report .coverage.mypy \ 63 | --html-report .coverage.mypy \ 64 | {posargs} 65 | 66 | 67 | [testenv:mypy-hpi] 68 | dependency_groups = testing 69 | extras = 70 | extra 71 | json 72 | xml 73 | HPI 74 | commands = 75 | {envpython} -m mypy --no-install-types \ 76 | -p {[testenv]package_name}.modules.hpi \ 77 | # txt report is a bit more convenient to view on CI 78 | --txt-report .coverage.mypy-hpi \ 79 | --html-report .coverage.mypy-hpi \ 80 | {posargs} 81 | --------------------------------------------------------------------------------
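For reference, a minimal sketch (not part of the repository) of driving one of the normaliser modules programmatically, following the same pattern as the helpers in src/bleanser/tests/common.py; the backup directory below is hypothetical:

from pathlib import Path

from bleanser.core.common import Keep, Prune
from bleanser.core.processor import compute_instructions
from bleanser.modules.smscalls import Normaliser

# hypothetical folder with periodic sms/calls XML backups
paths = sorted(Path('/backups/smscalls').glob('*.xml*'))

# compute_instructions yields Keep/Prune decisions for the input snapshots
for instruction in compute_instructions(paths, Normaliser=Normaliser, threads=2):
    if isinstance(instruction, Prune):
        print('would prune:', instruction.path)
    elif isinstance(instruction, Keep):
        print('would keep: ', instruction.path)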