├── .ci └── run ├── .github └── workflows │ └── main.yml ├── .gitignore ├── .gitmodules ├── LICENSE ├── README.org ├── conftest.py ├── dal.py ├── export.py ├── mypy.ini ├── pyproject.toml ├── pytest.ini ├── ruff.toml ├── src └── hypexport │ ├── dal.py │ ├── export.py │ └── py.typed └── tox.ini /.ci/run: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eu 3 | 4 | cd "$(dirname "$0")" 5 | cd .. # git root 6 | 7 | if ! command -v sudo; then 8 | # CI or Docker sometimes doesn't have it, so useful to have a dummy 9 | function sudo { 10 | "$@" 11 | } 12 | fi 13 | 14 | # --parallel-live to show outputs while it's running 15 | tox_cmd='run-parallel --parallel-live' 16 | if [ -n "${CI-}" ]; then 17 | # install OS specific stuff here 18 | case "$OSTYPE" in 19 | darwin*) 20 | # macos 21 | : 22 | ;; 23 | cygwin* | msys* | win*) 24 | # windows 25 | : 26 | # ugh. parallel stuff seems super flaky under windows, some random failures, "file used by other process" and crap like that 27 | tox_cmd='run' 28 | ;; 29 | *) 30 | # must be linux? 31 | : 32 | ;; 33 | esac 34 | fi 35 | 36 | # NOTE: expects uv installed 37 | uv tool run --with tox-uv tox $tox_cmd "$@" 38 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | # see https://github.com/karlicoss/pymplate for up-to-date reference 2 | 3 | name: CI 4 | on: 5 | push: 6 | branches: '*' 7 | tags: 'v[0-9]+.*' # only trigger on 'release' tags for PyPi 8 | # Ideally I would put this in the pypi job... but github syntax doesn't allow for regexes there :shrug: 9 | # P.S. fuck made up yaml DSLs. 10 | pull_request: # needed to trigger on others' PRs 11 | # Note that people who fork it need to go to "Actions" tab on their fork and click "I understand my workflows, go ahead and enable them". 
12 | workflow_dispatch: # needed to trigger workflows manually 13 | # todo cron? 14 | inputs: 15 | debug_enabled: 16 | type: boolean 17 | description: 'Run the build with tmate debugging enabled (https://github.com/marketplace/actions/debugging-with-tmate)' 18 | required: false 19 | default: false 20 | 21 | 22 | jobs: 23 | build: 24 | strategy: 25 | fail-fast: false 26 | matrix: 27 | platform: [ubuntu-latest, macos-latest, windows-latest] 28 | python-version: ['3.9', '3.10', '3.11', '3.12', '3.13'] 29 | exclude: [ 30 | # windows runners are pretty scarce, so let's only run lowest and highest python version 31 | {platform: windows-latest, python-version: '3.10'}, 32 | {platform: windows-latest, python-version: '3.11'}, 33 | {platform: windows-latest, python-version: '3.12'}, 34 | 35 | # same, macos is a bit too slow and ubuntu covers python quirks well 36 | {platform: macos-latest , python-version: '3.10'}, 37 | {platform: macos-latest , python-version: '3.11'}, 38 | {platform: macos-latest , python-version: '3.12'}, 39 | ] 40 | 41 | runs-on: ${{ matrix.platform }} 42 | 43 | # useful for 'optional' pipelines 44 | # continue-on-error: ${{ matrix.platform == 'windows-latest' }} 45 | 46 | steps: 47 | # ugh https://github.com/actions/toolkit/blob/main/docs/commands.md#path-manipulation 48 | - run: echo "$HOME/.local/bin" >> $GITHUB_PATH 49 | 50 | - uses: actions/setup-python@v5 51 | with: 52 | python-version: ${{ matrix.python-version }} 53 | 54 | - uses: astral-sh/setup-uv@v5 55 | with: 56 | enable-cache: false # we don't have lock files, so can't use them as cache key 57 | 58 | - uses: actions/checkout@v4 59 | with: 60 | submodules: recursive 61 | fetch-depth: 0 # nicer to have all git history when debugging/for tests 62 | 63 | - uses: mxschmitt/action-tmate@v3 64 | if: ${{ github.event_name == 'workflow_dispatch' && inputs.debug_enabled }} 65 | 66 | # explicit bash command is necessary for Windows CI runner, otherwise it thinks it's cmd... 
67 | - run: bash .ci/run 68 | 69 | - if: matrix.platform == 'ubuntu-latest' # no need to compute coverage for other platforms 70 | uses: actions/upload-artifact@v4 71 | with: 72 | include-hidden-files: true 73 | name: .coverage.mypy_${{ matrix.platform }}_${{ matrix.python-version }} 74 | path: .coverage.mypy/ 75 | 76 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.gitignore.io/api/python,emacs 3 | # Edit at https://www.gitignore.io/?templates=python,emacs 4 | 5 | ### Emacs ### 6 | # -*- mode: gitignore; -*- 7 | *~ 8 | \#*\# 9 | /.emacs.desktop 10 | /.emacs.desktop.lock 11 | *.elc 12 | auto-save-list 13 | tramp 14 | .\#* 15 | 16 | # Org-mode 17 | .org-id-locations 18 | *_archive 19 | 20 | # flymake-mode 21 | *_flymake.* 22 | 23 | # eshell files 24 | /eshell/history 25 | /eshell/lastdir 26 | 27 | # elpa packages 28 | /elpa/ 29 | 30 | # reftex files 31 | *.rel 32 | 33 | # AUCTeX auto folder 34 | /auto/ 35 | 36 | # cask packages 37 | .cask/ 38 | dist/ 39 | 40 | # Flycheck 41 | flycheck_*.el 42 | 43 | # server auth directory 44 | /server/ 45 | 46 | # projectiles files 47 | .projectile 48 | 49 | # directory configuration 50 | .dir-locals.el 51 | 52 | # network security 53 | /network-security.data 54 | 55 | 56 | ### Python ### 57 | # Byte-compiled / optimized / DLL files 58 | __pycache__/ 59 | *.py[cod] 60 | *$py.class 61 | 62 | # C extensions 63 | *.so 64 | 65 | # Distribution / packaging 66 | .Python 67 | build/ 68 | develop-eggs/ 69 | downloads/ 70 | eggs/ 71 | .eggs/ 72 | lib/ 73 | lib64/ 74 | parts/ 75 | sdist/ 76 | var/ 77 | wheels/ 78 | pip-wheel-metadata/ 79 | share/python-wheels/ 80 | *.egg-info/ 81 | .installed.cfg 82 | *.egg 83 | MANIFEST 84 | 85 | # PyInstaller 86 | # Usually these files are written by a python script from a template 87 | # before PyInstaller builds the exe, so as to inject 
date/other infos into it. 88 | *.manifest 89 | *.spec 90 | 91 | # Installer logs 92 | pip-log.txt 93 | pip-delete-this-directory.txt 94 | 95 | # Unit test / coverage reports 96 | htmlcov/ 97 | .tox/ 98 | .nox/ 99 | .coverage 100 | .coverage.* 101 | .cache 102 | nosetests.xml 103 | coverage.xml 104 | *.cover 105 | .hypothesis/ 106 | .pytest_cache/ 107 | 108 | # Translations 109 | *.mo 110 | *.pot 111 | 112 | # Scrapy stuff: 113 | .scrapy 114 | 115 | # Sphinx documentation 116 | docs/_build/ 117 | 118 | # PyBuilder 119 | target/ 120 | 121 | # pyenv 122 | .python-version 123 | 124 | # pipenv 125 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 126 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 127 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 128 | # install all needed dependencies. 129 | #Pipfile.lock 130 | 131 | # celery beat schedule file 132 | celerybeat-schedule 133 | 134 | # SageMath parsed files 135 | *.sage.py 136 | 137 | # Spyder project settings 138 | .spyderproject 139 | .spyproject 140 | 141 | # Rope project settings 142 | .ropeproject 143 | 144 | # Mr Developer 145 | .mr.developer.cfg 146 | .project 147 | .pydevproject 148 | 149 | # mkdocs documentation 150 | /site 151 | 152 | # mypy 153 | .mypy_cache/ 154 | .dmypy.json 155 | dmypy.json 156 | 157 | # Pyre type checker 158 | .pyre/ 159 | 160 | # End of https://www.gitignore.io/api/python,emacs 161 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "src/hypexport/Hypothesis"] 2 | path = src/hypexport/Hypothesis 3 | url = https://github.com/karlicoss/Hypothesis.git # ugh, github actions can't handle ssh? 
4 | [submodule "src/hypexport/exporthelpers"] 5 | path = src/hypexport/exporthelpers 6 | url = https://github.com/karlicoss/exporthelpers.git 7 | [submodule "testdata/netrights-dashboard-mockup"] 8 | path = testdata/netrights-dashboard-mockup 9 | url = https://github.com/taniki/netrights-dashboard-mockup.git 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Dmitrii Gerasimov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.org: -------------------------------------------------------------------------------- 1 | #+begin_src python :dir src :results drawer :exports results 2 | import hypexport.export as E; return E.make_parser().prog 3 | #+end_src 4 | 5 | #+RESULTS: 6 | :results: 7 | Export/takeout for your personal [[https://hypothes.is][Hypothes.is]] data: annotations and profile information. 8 | :end: 9 | 10 | * Setting up 11 | 1. install with PIP 12 | - =pip3 install --user git+https://github.com/karlicoss/hypexport= 13 | 14 | - for export functionality: append =[export]= 15 | - for optional extras for logging and faster json processing: append =[optional]= 16 | - or any combination of the above, e.g. =[export,optional]= 17 | 18 | - alternatively, use =git clone --recursive=, or =git pull && git submodule update --init=. After that, you can use =pip3 install --editable=. 19 | 20 | 21 | 2. Follow [[https://hypothes.is/account/developer][these]] instructions to set up the token 22 | 23 | * Exporting 24 | 25 | #+begin_src python :dir src :results drawer :exports results 26 | import hypexport.export as E; return E.make_parser().epilog 27 | #+end_src 28 | 29 | #+RESULTS: 30 | :results: 31 | 32 | Usage: 33 | 34 | *Recommended*: create =secrets.py= keeping your api parameters, e.g.: 35 | 36 | 37 | : username = "USERNAME" 38 | : token = "TOKEN" 39 | 40 | 41 | After that, use: 42 | 43 | : python3 -m hypexport.export --secrets /path/to/secrets.py 44 | 45 | That way you type less and have control over where you keep your plaintext secrets. 46 | 47 | *Alternatively*, you can pass parameters directly, e.g. 48 | 49 | : python3 -m hypexport.export --username USERNAME --token TOKEN 50 | 51 | However, this is verbose and prone to leaking your keys/tokens/passwords in shell history. 52 | 53 | 54 | You can also import ~hypexport.export~ as a module and call ~get_json~ function directly to get raw JSON. 
def resolve_package_path(path: pathlib.Path) -> Optional[pathlib.Path]:
    """
    Replacement for _pytest.pathlib.resolve_package_path.

    Takes the path to a test file and returns the closest ancestor directory
    that is listed in namespace_pkg_dirs.
    Raises RuntimeError if no ancestor matches.
    """
    # walk from the test file upwards and stop at the first known namespace package dir
    matching = next((anc for anc in path.parents if str(anc) in namespace_pkg_dirs), None)
    if matching is not None:
        return matching

    # ??? for some reason on windows it is trying to call this against conftest? but not on linux/osx
    if os.name == 'nt' and path.name == 'conftest.py':
        return resolve_pkg_path_orig(path)

    raise RuntimeError("Couldn't determine path for ", path)
@contextmanager
def handle_submodule_error():
    """
    Context manager that logs a hint about git submodules when the wrapped
    code fails with ImportError, and then re-raises the same error.

    Any other exception passes through untouched.
    """
    # todo this might also be useful in the actual dal/export files.. not sure
    try:
        yield
    except ImportError as err:
        import logging

        hint = f"[{__file__}]: Error while importing {mod}. Make sure you've used 'git clone --recursive' or 'git pull && git submodule update --init'."
        logging.critical(hint)
        raise err
@contextmanager
def handle_submodule_error():
    """
    Context manager that logs a hint about git submodules when the wrapped
    code fails with ImportError, and then re-raises the same error.

    Any other exception passes through untouched.
    """
    # todo this might also be useful in the actual dal/export files.. not sure
    try:
        yield
    except ImportError as err:
        import logging

        hint = f"[{__file__}]: Error while importing {mod}. Make sure you've used 'git clone --recursive' or 'git pull && git submodule update --init'."
        logging.critical(hint)
        raise err
https://github.com/karlicoss/pymplate for up-to-date reference 2 | [project] 3 | dynamic = ["version"] # version is managed by setuptools_scm 4 | name = "hypexport" 5 | # common/DAL dependencies 6 | dependencies = [] 7 | requires-python = ">= 3.9" 8 | 9 | ## these need to be set if you're planning to upload to pypi 10 | description = "Export and access your Hypothes.is data" 11 | license = {file = "LICENSE"} 12 | authors = [ 13 | {name = "Dima Gerasimov (@karlicoss)", email = "karlicoss@gmail.com"}, 14 | ] 15 | maintainers = [ 16 | {name = "Dima Gerasimov (@karlicoss)", email = "karlicoss@gmail.com"}, 17 | ] 18 | 19 | 20 | [project.urls] 21 | Homepage = "https://github.com/karlicoss/hypexport" 22 | ## 23 | 24 | [project.optional-dependencies] 25 | export = [ 26 | # dependency of Hypothesis API 27 | # todo sadly, the API itself is not a python package, so checked it out as a submodule.. would be nice to convert? 28 | "requests", 29 | ] 30 | optional = [ 31 | "orjson", 32 | "colorlog", 33 | "ijson", # faster iterative json processing 34 | ] 35 | 36 | [dependency-groups] 37 | testing = [ 38 | "pytest", 39 | "ruff", 40 | "mypy", 41 | "lxml", # for mypy html coverage 42 | ] 43 | 44 | 45 | [build-system] 46 | requires = ["setuptools", "setuptools-scm"] 47 | build-backend = "setuptools.build_meta" 48 | 49 | [tool.setuptools_scm] 50 | version_scheme = "python-simplified-semver" 51 | local_scheme = "dirty-tag" 52 | 53 | # workaround for error during uv publishing 54 | # see https://github.com/astral-sh/uv/issues/9513#issuecomment-2519527822 55 | [tool.setuptools] 56 | license-files = [] 57 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | # discover files that don't follow test_ naming. 
Useful to keep tests along with the source code 3 | python_files = *.py 4 | addopts = 5 | # -rap to print tests summary even when they are successful 6 | -rap 7 | --verbose 8 | 9 | # otherwise it won't discover doctests 10 | --doctest-modules 11 | 12 | # show all test durations (unless they are too short) 13 | --durations=0 14 | 15 | # ignore tests for the API (they need the token) 16 | --ignore src/hypexport/Hypothesis 17 | -------------------------------------------------------------------------------- /ruff.toml: -------------------------------------------------------------------------------- 1 | lint.extend-select = [ 2 | "F", # flakes rules -- default, but extend just in case 3 | "E", # pycodestyle -- default, but extend just in case 4 | "W", # various warnings 5 | 6 | "B", # 'bugbear' set -- various possible bugs 7 | "C4", # flake8-comprehensions -- unnecessary list/map/dict calls 8 | "COM", # trailing commas 9 | "EXE", # various checks wrt executable files 10 | "I", # sort imports 11 | "ICN", # various import conventions 12 | "FBT", # detect use of boolean arguments 13 | "FURB", # various rules 14 | "PERF", # various potential performance speedups 15 | "PD", # pandas rules 16 | "PIE", # 'misc' lints 17 | "PLC", # pylint convention rules 18 | "PLR", # pylint refactor rules 19 | "PLW", # pylint warnings 20 | "PT", # pytest stuff 21 | "PYI", # various type hinting rules 22 | "RET", # early returns 23 | "RUF", # various ruff-specific rules 24 | "TID", # various imports suggestions 25 | "TRY", # various exception handling rules 26 | "UP", # detect deprecated python stdlib stuff 27 | "FA", # suggest using from __future__ import annotations 28 | "PTH", # pathlib migration 29 | "ARG", # unused argument checks 30 | "A", # builtin shadowing 31 | "G", # logging stuff 32 | # "EM", # TODO hmm could be helpful to prevent duplicate err msg in traceback.. but kinda annoying 33 | 34 | # "ALL", # uncomment this to check for new rules! 
35 | ] 36 | 37 | lint.ignore = [ 38 | "D", # annoying nags about docstrings 39 | "N", # pep naming 40 | "TCH", # type checking rules, mostly just suggests moving imports under TYPE_CHECKING 41 | "S", # bandit (security checks) -- tends to be not very useful, lots of nitpicks 42 | "DTZ", # datetimes checks -- complaining about missing tz and mostly false positives 43 | "FIX", # complains about fixmes/todos -- annoying 44 | "TD", # complains about todo formatting -- too annoying 45 | "ANN", # missing type annotations? seems way to strict though 46 | 47 | ### too opinionated style checks 48 | "E501", # too long lines 49 | "E702", # Multiple statements on one line (semicolon) 50 | "E731", # assigning lambda instead of using def 51 | "E741", # Ambiguous variable name: `l` 52 | "E742", # Ambiguous class name: `O 53 | "E401", # Multiple imports on one line 54 | "F403", # import *` used; unable to detect undefined names 55 | ### 56 | 57 | ### 58 | "E722", # Do not use bare `except` ## Sometimes it's useful for defensive imports and that sort of thing.. 59 | "F811", # Redefinition of unused # this gets in the way of pytest fixtures (e.g. in cachew) 60 | 61 | ## might be nice .. but later and I don't wanna make it strict 62 | "E402", # Module level import not at top of file 63 | 64 | ### maybe consider these soon 65 | # sometimes it's useful to give a variable a name even if we don't use it as a documentation 66 | # on the other hand, often is a sign of error 67 | "F841", # Local variable `count` is assigned to but never used 68 | ### 69 | 70 | "RUF100", # unused noqa -- handle later 71 | "RUF012", # mutable class attrs should be annotated with ClassVar... 
ugh pretty annoying for user configs 72 | 73 | ### these are just nitpicky, we usually know better 74 | "PLR0911", # too many return statements 75 | "PLR0912", # too many branches 76 | "PLR0913", # too many function arguments 77 | "PLR0915", # too many statements 78 | "PLR1714", # consider merging multiple comparisons 79 | "PLR2044", # line with empty comment 80 | "PLR5501", # use elif instead of else if 81 | "PLR2004", # magic value in comparison -- super annoying in tests 82 | ### 83 | "PLR0402", # import X.Y as Y -- TODO maybe consider enabling it, but double check 84 | 85 | "B009", # calling gettattr with constant attribute -- this is useful to convince mypy 86 | "B010", # same as above, but setattr 87 | "B011", # complains about assert False 88 | "B017", # pytest.raises(Exception) 89 | "B023", # seems to result in false positives? 90 | "B028", # suggest using explicit stacklevel? TODO double check later, but not sure it's useful 91 | 92 | # complains about useless pass, but has sort of a false positive if the function has a docstring? 93 | # this is common for click entrypoints (e.g. in __main__), so disable 94 | "PIE790", 95 | 96 | # a bit too annoying, offers to convert for loops to list comprehension 97 | # , which may heart readability 98 | "PERF401", 99 | 100 | # suggests no using exception in for loops 101 | # we do use this technique a lot, plus in 3.11 happy path exception handling is "zero-cost" 102 | "PERF203", 103 | 104 | "RET504", # unnecessary assignment before returning -- that can be useful for readability 105 | "RET505", # unnecessary else after return -- can hurt readability 106 | 107 | "PLW0603", # global variable update.. 
we usually know why we are doing this 108 | "PLW2901", # for loop variable overwritten, usually this is intentional 109 | 110 | "PT011", # pytest raises should is too broad 111 | "PT012", # pytest raises should contain a single statement 112 | 113 | "COM812", # trailing comma missing -- mostly just being annoying with long multiline strings 114 | 115 | "PD901", # generic variable name df 116 | 117 | "TRY003", # suggests defining exception messages in exception class -- kinda annoying 118 | "TRY004", # prefer TypeError -- don't see the point 119 | "TRY201", # raise without specifying exception name -- sometimes hurts readability 120 | "TRY400", # TODO double check this, might be useful 121 | "TRY401", # redundant exception in logging.exception call? TODO double check, might result in excessive logging 122 | 123 | "PGH", # TODO force error code in mypy instead 124 | 125 | "TID252", # Prefer absolute imports over relative imports from parent modules 126 | 127 | "UP038", # suggests using | (union) in isisntance checks.. but it results in slower code 128 | 129 | ## too annoying 130 | "T20", # just complains about prints and pprints 131 | "Q", # flake quotes, too annoying 132 | "C90", # some complexity checking 133 | "G004", # logging statement uses f string 134 | "ERA001", # commented out code 135 | "SLF001", # private member accessed 136 | "BLE001", # do not catch 'blind' Exception 137 | "INP001", # complains about implicit namespace packages 138 | "SIM", # some if statements crap 139 | "RSE102", # complains about missing parens in exceptions 140 | ## 141 | ] 142 | 143 | 144 | exclude = [ 145 | # "src/hypexport/exporthelpers", # TODO hmm not sure if should check here? 
class DAL:
    """
    Data access layer over one or more exported JSON files.

    Two file layouts are handled: the old one, where the file starts with '['
    (a bare top-level list), and the newer one, where the annotations are read
    from under the 'annotations' key.
    """

    def __init__(self, sources: Sequence[PathIsh]) -> None:
        self.sources = list(map(pathify, sources))

    def _iter_raw(self) -> Iterator[Json]:
        """
        Yield raw annotation objects from every source, one file after another.

        Within a single file the items are ordered by their 'created' field
        (items lacking it sort first). No deduplication happens here, so an
        annotation present in several files is yielded several times.
        """
        paths = self.sources
        total = len(paths)
        width = len(str(total))
        for idx, path in enumerate(paths):
            logger.info(f'processing [{idx:>{width}}/{total:>{width}}] {path}')
            # peek at the first byte to tell the two layouts apart;
            # the handle is closed again before the file is actually parsed
            with path.open(mode='rb') as fo:
                first = fo.read(1)
            old_format = first == b'['
            key = None if old_format else 'annotations'
            # annotations are in reverse chronological order, so it makes sense to reorder them
            yield from sorted(json_items(path, key), key=lambda j: j.get('created', ''))

    def highlights(self) -> Iterator[Res[Highlight]]:
        """
        Yield one parsed Highlight per unique annotation id.

        An item that can't be processed (including one without an 'id') is
        yielded as a RuntimeError carrying the raw item, with the original
        exception as its __cause__, rather than aborting the iteration.
        Later duplicates of an id are skipped even if the first occurrence
        failed to parse.
        """
        emitted = set()
        for raw in self._iter_raw():
            try:
                # the id lookup lives inside the try so that a malformed item
                # is reported like any other parse failure
                hid = raw['id']
                if hid in emitted:
                    continue
                emitted.add(hid)
                parsed = self._parse_highlight(raw)
            except Exception as e:
                err = RuntimeError(raw)
                err.__cause__ = e
                yield err
            else:
                yield parsed

    def pages(self) -> Iterator[Res[Page]]:
        """
        Group the successfully parsed highlights by URL into pages.

        Pages come first, ordered by their creation time, with the highlights
        of each page sorted by creation time as well. All errors produced by
        highlights() are yielded afterwards.
        """
        values: list[Highlight] = []
        errors: list[Exception] = []
        for r in self.highlights():
            if isinstance(r, Exception):
                errors.append(r)
            else:
                values.append(r)

        by_url = lambda h: h.url
        by_created = lambda h: h.created

        def it() -> Iterator[Page]:
            # groupby only merges adjacent items, hence sorting by the same key first
            for _link, git in groupby(sorted(values, key=by_url), key=by_url):
                group = sorted(git, key=by_created)
                yield Page(group)

        yield from sorted(it(), key=by_created)
        yield from errors

    def _parse_highlight(self, i: Json) -> Highlight:
        """
        Convert a single raw annotation object into a Highlight.

        Raises if the object doesn't have the expected shape, e.g. a missing
        key, not exactly one target, or more than one selector with 'exact'.
        """
        [tg] = i['target']  # hopefully it's always single element?
        selectors = tg.get('selector', None)
        if selectors is None:
            # TODO warn?...
            selectors = []

        highlights = [s['exact'] for s in selectors if 'exact' in s]

        # TODO warn? never happened though
        assert len(highlights) <= 1

        # highlight stays None when no selector carries the 'exact' text
        highlight = highlights[0] if highlights else None

        page_link = i['uri']
        title = i['document'].get('title')
        if title is None:
            # sometimes happens, e.g. if it's a plaintext file
            page_title = page_link
        else:
            # NOTE(review): presumably title is a list of strings here -- confirm against the API
            page_title = ' '.join(title)
        hid = i['id']
        dts = i['created']
        # drops the third character from the end before parsing -- presumably
        # the colon of a '+HH:MM' offset, so that %z sees '+HHMM'
        created = datetime.strptime(dts[:-3] + dts[-2:], '%Y-%m-%dT%H:%M:%S.%f%z')
        txt = i['text']
        # blank/whitespace-only comment text is treated as no annotation
        annotation = None if len(txt.strip()) == 0 else txt
        context = i['links']['incontext']
        return Highlight(
            created=created,
            url=page_link,
            title=page_title,
            hid=hid,
            hyp_link=context,
            highlight=highlight,
            annotation=annotation,
            tags=tuple(i['tags']),
        )
180 | vit, eit = tee(dal.pages()) 181 | # fmt: off 182 | values = (r for r in vit if not isinstance(r, Exception)) 183 | errors = (r for r in eit if isinstance(r, Exception)) 184 | # fmt: on 185 | for e in errors: 186 | print("ERROR! ", e) 187 | 188 | pages = list(values) 189 | print(f"Parsed {len(pages)} pages") 190 | 191 | from collections import Counter 192 | 193 | common = Counter({(x.url, x.title): len(x.highlights) for x in pages}).most_common(10) 194 | print("10 most highlighed pages:") 195 | for (url, title), count in common: 196 | print(f'{count:4d} {url} "{title}"') 197 | 198 | 199 | if __name__ == '__main__': 200 | dal_helper.main(DAL=DAL, demo=demo) 201 | -------------------------------------------------------------------------------- /src/hypexport/export.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from .exporthelpers.export_helper import Parser, setup_parser 4 | from .Hypothesis import hypothesis 5 | 6 | 7 | class Exporter: 8 | def __init__(self, *args, **kwargs) -> None: 9 | kwargs['max_search_results'] = 10000 10 | self.api = hypothesis.Hypothesis(*args, **kwargs) 11 | # TODO not sure why max_search_results is set to 2000 in Hypothesis package; documentation says 9800 is the max for offset? 
Ask judell 12 | self.user = kwargs['username'] 13 | 14 | def export_json(self): 15 | profile = self.api.authenticated_api_query(self.api.api_url + '/profile') 16 | annotations = list(self.api.search_all({'user': self.user})) 17 | return { 18 | 'profile': profile, 19 | 'annotations': annotations, 20 | } 21 | 22 | 23 | def get_json(**params): 24 | return Exporter(**params).export_json() 25 | 26 | 27 | def main() -> None: 28 | parser = make_parser() 29 | args = parser.parse_args() 30 | 31 | params = args.params 32 | dumper = args.dumper 33 | 34 | j = get_json(**params) 35 | js = json.dumps(j, ensure_ascii=False, indent=2, sort_keys=True) 36 | dumper(js) 37 | 38 | 39 | def make_parser(): 40 | parser = Parser('Export/takeout for your personal [[https://hypothes.is][Hypothes.is]] data: annotations and profile information.') 41 | setup_parser( 42 | parser=parser, 43 | params=['username', 'token'], 44 | extra_usage=''' 45 | You can also import ~hypexport.export~ as a module and call ~get_json~ function directly to get raw JSON. 46 | ''', 47 | ) 48 | return parser 49 | 50 | 51 | if __name__ == '__main__': 52 | main() 53 | -------------------------------------------------------------------------------- /src/hypexport/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/karlicoss/hypexport/218093d714ed82d5663f312219c3813a1ad2cb0d/src/hypexport/py.typed -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | minversion = 3.21 3 | # relies on the correct version of Python installed 4 | envlist = ruff,tests,mypy 5 | # https://github.com/tox-dev/tox/issues/20#issuecomment-247788333 6 | # hack to prevent .tox from crapping to the project directory 7 | toxworkdir = {env:TOXWORKDIR_BASE:}{toxinidir}/.tox 8 | 9 | [testenv] 10 | # TODO how to get package name from setuptools? 
11 | package_name = "hypexport" 12 | passenv = 13 | # useful for tests to know they are running under ci 14 | CI 15 | CI_* 16 | # respect user's cache dirs to prevent tox from crapping into project dir 17 | PYTHONPYCACHEPREFIX 18 | MYPY_CACHE_DIR 19 | RUFF_CACHE_DIR 20 | usedevelop = true # for some reason tox seems to ignore "-e ." in deps section?? 21 | uv_seed = true # seems necessary so uv creates separate venvs per tox env? 22 | 23 | 24 | [testenv:ruff] 25 | dependency_groups = testing 26 | commands = 27 | {envpython} -m ruff check src/ 28 | 29 | 30 | [testenv:tests] 31 | dependency_groups = testing 32 | deps = 33 | -e .[export] 34 | commands = 35 | {envpython} -m pytest \ 36 | --pyargs {[testenv]package_name} \ 37 | {posargs} 38 | 39 | 40 | [testenv:mypy] 41 | dependency_groups = testing 42 | deps = 43 | -e .[optional] 44 | commands = 45 | {envpython} -m mypy --no-install-types \ 46 | -p {[testenv]package_name} \ 47 | # txt report is a bit more convenient to view on CI 48 | --txt-report .coverage.mypy \ 49 | --html-report .coverage.mypy \ 50 | {posargs} 51 | --------------------------------------------------------------------------------