├── .ci
    └── run
├── .github
    └── workflows
    │   └── main.yml
├── .gitignore
├── .gitmodules
├── LICENSE
├── README.org
├── conftest.py
├── dal.py
├── export.py
├── mypy.ini
├── pyproject.toml
├── pytest.ini
├── ruff.toml
├── src
    └── hypexport
    │   ├── dal.py
    │   ├── export.py
    │   └── py.typed
└── tox.ini


/.ci/run:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -eu
 3 | 
 4 | cd "$(dirname "$0")"
 5 | cd .. # git root
 6 | 
 7 | if ! command -v sudo; then
 8 |     # CI or Docker sometimes doesn't have it, so useful to have a dummy
 9 |     function sudo {
10 |         "$@"
11 |     }
12 | fi
13 | 
14 | # --parallel-live to show outputs while it's running
15 | tox_cmd='run-parallel --parallel-live'
16 | if [ -n "${CI-}" ]; then
17 |     # install OS specific stuff here
18 |     case "$OSTYPE" in
19 |     darwin*) 
20 |         # macos
21 |         :
22 |         ;;
23 |     cygwin* | msys* | win*)
24 |         # windows
25 |         :
26 |         # ugh. parallel stuff seems super flaky under windows, some random failures, "file used by other process" and crap like that
27 |         tox_cmd='run'
28 |         ;;
29 |     *)
30 |         # must be linux?
31 |         :
32 |         ;;
33 |     esac
34 | fi
35 | 
36 | # NOTE: expects uv installed
37 | uv tool run --with tox-uv tox $tox_cmd "$@"
38 | 


--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
 1 | # see https://github.com/karlicoss/pymplate for up-to-date reference
 2 | 
 3 | name: CI
 4 | on:
 5 |   push:
 6 |     branches: '*'
 7 |     tags: 'v[0-9]+.*' # only trigger on 'release' tags for PyPi
 8 |     # Ideally I would put this in the pypi job... but github syntax doesn't allow for regexes there :shrug:
 9 |     # P.S. fuck made up yaml DSLs.
10 |   pull_request: # needed to trigger on others' PRs
11 |   # Note that people who fork it need to go to "Actions" tab on their fork and click "I understand my workflows, go ahead and enable them".
12 |   workflow_dispatch: # needed to trigger workflows manually
13 |     # todo cron?
14 |     inputs:
15 |       debug_enabled:
16 |         type: boolean
17 |         description: 'Run the build with tmate debugging enabled (https://github.com/marketplace/actions/debugging-with-tmate)'
18 |         required: false
19 |         default: false
20 | 
21 | 
22 | jobs:
23 |   build:
24 |     strategy:
25 |       fail-fast: false
26 |       matrix:
27 |         platform: [ubuntu-latest, macos-latest, windows-latest]
28 |         python-version: ['3.9', '3.10', '3.11', '3.12', '3.13']
29 |         exclude: [
30 |             # windows runners are pretty scarce, so let's only run lowest and highest python version
31 |             {platform: windows-latest, python-version: '3.10'},
32 |             {platform: windows-latest, python-version: '3.11'},
33 |             {platform: windows-latest, python-version: '3.12'},
34 | 
35 |             # same, macos is a bit too slow and ubuntu covers python quirks well
36 |             {platform: macos-latest  , python-version: '3.10'},
37 |             {platform: macos-latest  , python-version: '3.11'},
38 |             {platform: macos-latest  , python-version: '3.12'},
39 |         ]
40 | 
41 |     runs-on: ${{ matrix.platform }}
42 | 
43 |     # useful for 'optional' pipelines
44 |     # continue-on-error: ${{ matrix.platform == 'windows-latest' }}
45 | 
46 |     steps:
47 |     # ugh https://github.com/actions/toolkit/blob/main/docs/commands.md#path-manipulation
48 |     - run: echo "$HOME/.local/bin" >> $GITHUB_PATH
49 | 
50 |     - uses: actions/setup-python@v5
51 |       with:
52 |         python-version: ${{ matrix.python-version }}
53 |       
54 |     - uses: astral-sh/setup-uv@v5
55 |       with:
56 |         enable-cache: false  # we don't have lock files, so can't use them as cache key
57 | 
58 |     - uses: actions/checkout@v4
59 |       with:
60 |         submodules: recursive
61 |         fetch-depth: 0  # nicer to have all git history when debugging/for tests
62 | 
63 |     - uses: mxschmitt/action-tmate@v3
64 |       if: ${{ github.event_name == 'workflow_dispatch' && inputs.debug_enabled }}
65 | 
66 |     # explicit bash command is necessary for Windows CI runner, otherwise it thinks it's cmd...
67 |     - run: bash .ci/run
68 | 
69 |     - if: matrix.platform == 'ubuntu-latest'  # no need to compute coverage for other platforms
70 |       uses: actions/upload-artifact@v4
71 |       with:
72 |         include-hidden-files: true
73 |         name: .coverage.mypy_${{ matrix.platform }}_${{ matrix.python-version }}
74 |         path: .coverage.mypy/
75 | 
76 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | 
  2 | # Created by https://www.gitignore.io/api/python,emacs
  3 | # Edit at https://www.gitignore.io/?templates=python,emacs
  4 | 
  5 | ### Emacs ###
  6 | # -*- mode: gitignore; -*-
  7 | *~
  8 | \#*\#
  9 | /.emacs.desktop
 10 | /.emacs.desktop.lock
 11 | *.elc
 12 | auto-save-list
 13 | tramp
 14 | .\#*
 15 | 
 16 | # Org-mode
 17 | .org-id-locations
 18 | *_archive
 19 | 
 20 | # flymake-mode
 21 | *_flymake.*
 22 | 
 23 | # eshell files
 24 | /eshell/history
 25 | /eshell/lastdir
 26 | 
 27 | # elpa packages
 28 | /elpa/
 29 | 
 30 | # reftex files
 31 | *.rel
 32 | 
 33 | # AUCTeX auto folder
 34 | /auto/
 35 | 
 36 | # cask packages
 37 | .cask/
 38 | dist/
 39 | 
 40 | # Flycheck
 41 | flycheck_*.el
 42 | 
 43 | # server auth directory
 44 | /server/
 45 | 
 46 | # projectiles files
 47 | .projectile
 48 | 
 49 | # directory configuration
 50 | .dir-locals.el
 51 | 
 52 | # network security
 53 | /network-security.data
 54 | 
 55 | 
 56 | ### Python ###
 57 | # Byte-compiled / optimized / DLL files
 58 | __pycache__/
 59 | *.py[cod]
 60 | *$py.class
 61 | 
 62 | # C extensions
 63 | *.so
 64 | 
 65 | # Distribution / packaging
 66 | .Python
 67 | build/
 68 | develop-eggs/
 69 | downloads/
 70 | eggs/
 71 | .eggs/
 72 | lib/
 73 | lib64/
 74 | parts/
 75 | sdist/
 76 | var/
 77 | wheels/
 78 | pip-wheel-metadata/
 79 | share/python-wheels/
 80 | *.egg-info/
 81 | .installed.cfg
 82 | *.egg
 83 | MANIFEST
 84 | 
 85 | # PyInstaller
 86 | #  Usually these files are written by a python script from a template
 87 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 88 | *.manifest
 89 | *.spec
 90 | 
 91 | # Installer logs
 92 | pip-log.txt
 93 | pip-delete-this-directory.txt
 94 | 
 95 | # Unit test / coverage reports
 96 | htmlcov/
 97 | .tox/
 98 | .nox/
 99 | .coverage
100 | .coverage.*
101 | .cache
102 | nosetests.xml
103 | coverage.xml
104 | *.cover
105 | .hypothesis/
106 | .pytest_cache/
107 | 
108 | # Translations
109 | *.mo
110 | *.pot
111 | 
112 | # Scrapy stuff:
113 | .scrapy
114 | 
115 | # Sphinx documentation
116 | docs/_build/
117 | 
118 | # PyBuilder
119 | target/
120 | 
121 | # pyenv
122 | .python-version
123 | 
124 | # pipenv
125 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
126 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
127 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
128 | #   install all needed dependencies.
129 | #Pipfile.lock
130 | 
131 | # celery beat schedule file
132 | celerybeat-schedule
133 | 
134 | # SageMath parsed files
135 | *.sage.py
136 | 
137 | # Spyder project settings
138 | .spyderproject
139 | .spyproject
140 | 
141 | # Rope project settings
142 | .ropeproject
143 | 
144 | # Mr Developer
145 | .mr.developer.cfg
146 | .project
147 | .pydevproject
148 | 
149 | # mkdocs documentation
150 | /site
151 | 
152 | # mypy
153 | .mypy_cache/
154 | .dmypy.json
155 | dmypy.json
156 | 
157 | # Pyre type checker
158 | .pyre/
159 | 
160 | # End of https://www.gitignore.io/api/python,emacs
161 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
 1 | [submodule "src/hypexport/Hypothesis"]
 2 | 	path = src/hypexport/Hypothesis
 3 | 	url = https://github.com/karlicoss/Hypothesis.git # ugh, github actions can't handle ssh?
 4 | [submodule "src/hypexport/exporthelpers"]
 5 | 	path = src/hypexport/exporthelpers
 6 | 	url = https://github.com/karlicoss/exporthelpers.git
 7 | [submodule "testdata/netrights-dashboard-mockup"]
 8 | 	path = testdata/netrights-dashboard-mockup
 9 | 	url = https://github.com/taniki/netrights-dashboard-mockup.git
10 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Dmitrii Gerasimov
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.org:
--------------------------------------------------------------------------------
 1 | #+begin_src python :dir src :results drawer :exports results
 2 | import hypexport.export as E; return E.make_parser().prog
 3 | #+end_src
 4 | 
 5 | #+RESULTS:
 6 | :results:
 7 | Export/takeout for your personal [[https://hypothes.is][Hypothes.is]] data: annotations and profile information.
 8 | :end:
 9 | 
10 | * Setting up
11 | 1. install with PIP
12 |    - =pip3 install --user git+https://github.com/karlicoss/hypexport=
13 | 
14 |    - for export functionality: append =[export]=
15 |    - for optional extras for logging and faster json processing: append =[optional]=
16 |    - or any combination of the above, e.g. =[export,optional]=
17 | 
18 |    - alternatively, use =git clone --recursive=, or =git pull && git submodule update --init=. After that, you can use =pip3 install --editable .=
19 | 
20 | 
21 | 2. Follow [[https://hypothes.is/account/developer][these]] instructions to set up the token
22 | 
23 | * Exporting
24 | 
25 | #+begin_src python :dir src :results drawer :exports results
26 | import hypexport.export as E; return E.make_parser().epilog
27 | #+end_src
28 | 
29 | #+RESULTS:
30 | :results:
31 | 
32 | Usage:
33 | 
34 | *Recommended*: create =secrets.py= keeping your api parameters, e.g.:
35 | 
36 | 
37 | : username = "USERNAME"
38 | : token = "TOKEN"
39 | 
40 | 
41 | After that, use:
42 | 
43 | : python3 -m hypexport.export --secrets /path/to/secrets.py
44 | 
45 | That way you type less and have control over where you keep your plaintext secrets.
46 | 
47 | *Alternatively*, you can pass parameters directly, e.g.
48 | 
49 | : python3 -m hypexport.export --username <username> --token <token>
50 | 
51 | However, this is verbose and prone to leaking your keys/tokens/passwords in shell history.
52 | 
53 | 
54 | You can also import ~hypexport.export~ as a module and call ~get_json~ function directly to get raw JSON.
55 | 
56 | 
57 | I *highly* recommend checking exported files at least once just to make sure they contain everything you expect from your export. If not, please feel free to ask or raise an issue!
58 | 
59 | :end:
60 | 
61 | # TODO FIXME api limitations 10000 annotations?
62 | 


--------------------------------------------------------------------------------
/conftest.py:
--------------------------------------------------------------------------------
 1 | # this is a hack to monkey patch pytest so it handles tests inside namespace packages without __init__.py properly
 2 | # without it, pytest can't discover the package root for some reason
 3 | # also see https://github.com/karlicoss/pytest_namespace_pkgs for more
 4 | 
 5 | import os
 6 | import pathlib
 7 | from typing import Optional
 8 | 
 9 | import _pytest.main
10 | import _pytest.pathlib
11 | 
12 | # we consider all dirs in repo/ to be namespace packages
13 | root_dir = pathlib.Path(__file__).absolute().parent.resolve() / 'src'
14 | assert root_dir.exists(), root_dir
15 | 
16 | # TODO assert it contains package name?? maybe get it via setuptools..
17 | 
18 | namespace_pkg_dirs = [str(d) for d in root_dir.iterdir() if d.is_dir()]
19 | 
20 | # resolve_package_path is called from _pytest.pathlib.import_path
21 | # takes a full abs path to the test file and needs to return the path to the 'root' package on the filesystem
22 | resolve_pkg_path_orig = _pytest.pathlib.resolve_package_path
23 | def resolve_package_path(path: pathlib.Path) -> Optional[pathlib.Path]:
24 |     result = path  # search from the test file upwards
25 |     for parent in result.parents:
26 |         if str(parent) in namespace_pkg_dirs:
27 |             return parent
28 |     if os.name == 'nt':
29 |         # ??? for some reason on windows it is trying to call this against conftest? but not on linux/osx
30 |         if path.name == 'conftest.py':
31 |             return resolve_pkg_path_orig(path)
32 |     raise RuntimeError("Couldn't determine path for ", path)
33 | _pytest.pathlib.resolve_package_path = resolve_package_path
34 | 
35 | 
36 | # without patching, the orig function returns just a package name for some reason
37 | # (I think it's used as a sort of fallback)
38 | # so we need to point it at the absolute path properly
39 | # not sure what are the consequences.. maybe it wouldn't be able to run against installed packages? not sure..
40 | search_pypath_orig = _pytest.main.search_pypath
41 | def search_pypath(module_name: str) -> str:
42 |     mpath = root_dir / module_name.replace('.', os.sep)
43 |     if not mpath.is_dir():
44 |         mpath = mpath.with_suffix('.py')
45 |         assert mpath.exists(), mpath  # just in case
46 |     return str(mpath)
47 | _pytest.main.search_pypath = search_pypath
48 | 


--------------------------------------------------------------------------------
/dal.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | '''
 3 | A helper script to provide backwards compatibility for pre-PIP package structure. Eventually will be removed altogether.
 4 | '''
 5 | 
 6 | from pathlib import Path
 7 | src = Path(__file__).absolute().parent / 'src'
 8 | NAME = min(src.glob('*/*.py')).parent.name # package name (e.g. rexport/hypexport etc)
 9 | 
10 | 
11 | import warnings
12 | warnings.warn(f'This script is DEPRECATED. Please install the package directly (see https://github.com/karlicoss/{NAME}#setting-up)')
13 | 
14 | 
15 | import sys
16 | sys.path.insert(0, str(src))
17 | 
18 | 
19 | module_name = Path(__file__).stem # export/dal
20 | mod = f'{NAME}.{module_name}'
21 | 
22 | # unload previously loaded DAL module (i.e. this file)
23 | if NAME in sys.modules: del sys.modules[NAME]
24 | if mod  in sys.modules: del sys.modules[mod ]
25 | 
26 | from contextlib import contextmanager
27 | @contextmanager
28 | def handle_submodule_error():
29 |     # todo this might also be useful in the actual dal/export files.. not sure
30 |     try:
31 |         yield
32 |     except ImportError as e:
33 |         import logging
34 |         logging.critical(f"[{__file__}]: Error while importing {mod}. Make sure you've used 'git clone --recursive' or 'git pull && git submodule update --init'.")
35 |         raise e
36 | 
37 | # see https://stackoverflow.com/questions/43059267/how-to-do-from-module-import-using-importlib
38 | from importlib import import_module
39 | with handle_submodule_error():
40 |     dal = import_module(mod)
41 | names = [x for x in dal.__dict__ if not x.startswith("_")]
42 | globals().update({k: getattr(dal, k) for k in names})
43 | 
44 | 
45 | if __name__ == '__main__':
46 |     with handle_submodule_error():
47 |         main() # type: ignore
48 | 


--------------------------------------------------------------------------------
/export.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | '''
 3 | A helper script to provide backwards compatibility for pre-PIP package structure. Eventually will be removed altogether.
 4 | '''
 5 | 
 6 | from pathlib import Path
 7 | src = Path(__file__).absolute().parent / 'src'
 8 | NAME = min(src.glob('*/*.py')).parent.name # package name (e.g. rexport/hypexport etc)
 9 | 
10 | 
11 | import warnings
12 | warnings.warn(f'This script is DEPRECATED. Please install the package directly (see https://github.com/karlicoss/{NAME}#setting-up)')
13 | 
14 | 
15 | import sys
16 | sys.path.insert(0, str(src))
17 | 
18 | 
19 | module_name = Path(__file__).stem # export/dal
20 | mod = f'{NAME}.{module_name}'
21 | 
22 | # unload previously loaded DAL module (i.e. this file)
23 | if NAME in sys.modules: del sys.modules[NAME]
24 | if mod  in sys.modules: del sys.modules[mod ]
25 | 
26 | from contextlib import contextmanager
27 | @contextmanager
28 | def handle_submodule_error():
29 |     # todo this might also be useful in the actual dal/export files.. not sure
30 |     try:
31 |         yield
32 |     except ImportError as e:
33 |         import logging
34 |         logging.critical(f"[{__file__}]: Error while importing {mod}. Make sure you've used 'git clone --recursive' or 'git pull && git submodule update --init'.")
35 |         raise e
36 | 
37 | # see https://stackoverflow.com/questions/43059267/how-to-do-from-module-import-using-importlib
38 | from importlib import import_module
39 | with handle_submodule_error():
40 |     dal = import_module(mod)
41 | names = [x for x in dal.__dict__ if not x.startswith("_")]
42 | globals().update({k: getattr(dal, k) for k in names})
43 | 
44 | 
45 | if __name__ == '__main__':
46 |     with handle_submodule_error():
47 |         main() # type: ignore
48 | 


--------------------------------------------------------------------------------
/mypy.ini:
--------------------------------------------------------------------------------
 1 | [mypy]
 2 | pretty = True
 3 | show_error_context = True
 4 | show_column_numbers = True
 5 | show_error_end = True
 6 | 
 7 | check_untyped_defs = True
 8 | 
 9 | # see https://mypy.readthedocs.io/en/stable/error_code_list2.html
10 | warn_redundant_casts = True
11 | strict_equality = True
12 | warn_unused_ignores = True
13 | enable_error_code = deprecated,redundant-expr,possibly-undefined,truthy-bool,truthy-iterable,ignore-without-code,unused-awaitable
14 | 
15 | 
16 | exclude = src/hypexport/Hypothesis
17 | 
18 | # an example of suppressing
19 | # [mypy-my.config.repos.pdfannots.pdfannots]
20 | # ignore_errors = True
21 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | # see https://github.com/karlicoss/pymplate for up-to-date reference
 2 | [project]
 3 | dynamic = ["version"]  # version is managed by setuptools_scm
 4 | name = "hypexport"
 5 | # common/DAL dependencies
 6 | dependencies = []
 7 | requires-python = ">= 3.9"
 8 | 
 9 | ## these need to be set if you're planning to upload to pypi
10 | description = "Export and access your Hypothes.is data"
11 | license = {file = "LICENSE"}
12 | authors = [
13 |     {name = "Dima Gerasimov (@karlicoss)", email = "karlicoss@gmail.com"},
14 | ]
15 | maintainers = [
16 |     {name = "Dima Gerasimov (@karlicoss)", email = "karlicoss@gmail.com"},
17 | ]
18 | 
19 | 
20 | [project.urls]
21 | Homepage = "https://github.com/karlicoss/hypexport"
22 | ##
23 | 
24 | [project.optional-dependencies]
25 | export = [
26 |     # dependency of Hypothesis API
27 |     # todo sadly, the API itself is not a python package, so checked it out as a submodule.. would be nice to convert?
28 |     "requests",
29 | ]
30 | optional = [
31 |     "orjson",
32 |     "colorlog",
33 |     "ijson",  # faster iterative json processing
34 | ]
35 | 
36 | [dependency-groups]
37 | testing = [
38 |     "pytest",
39 |     "ruff",
40 |     "mypy",
41 |     "lxml",  # for mypy html coverage
42 | ]
43 | 
44 | 
45 | [build-system]
46 | requires = ["setuptools", "setuptools-scm"]
47 | build-backend = "setuptools.build_meta"
48 | 
49 | [tool.setuptools_scm]
50 | version_scheme = "python-simplified-semver"
51 | local_scheme = "dirty-tag"
52 | 
53 | # workaround for error during uv publishing
54 | # see https://github.com/astral-sh/uv/issues/9513#issuecomment-2519527822
55 | [tool.setuptools]
56 | license-files = []
57 | 


--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
 1 | [pytest]
 2 | # discover files that don't follow test_ naming. Useful to keep tests along with the source code
 3 | python_files = *.py
 4 | addopts =
 5 |   # -rap to print tests summary even when they are successful
 6 |   -rap
 7 |   --verbose
 8 | 
 9 |   # otherwise it won't discover doctests
10 |   --doctest-modules
11 | 
12 |   # show all test durations (unless they are too short)
13 |   --durations=0
14 | 
15 |   # ignore tests for the API (they need the token)
16 |   --ignore src/hypexport/Hypothesis
17 | 


--------------------------------------------------------------------------------
/ruff.toml:
--------------------------------------------------------------------------------
  1 | lint.extend-select = [
  2 |     "F",    # flakes rules -- default, but extend just in case
  3 |     "E",    # pycodestyle  -- default, but extend just in case
  4 |     "W",    # various warnings
  5 | 
  6 |     "B",    # 'bugbear' set -- various possible bugs
  7 |     "C4",   # flake8-comprehensions -- unnecessary list/map/dict calls
  8 |     "COM",  # trailing commas
  9 |     "EXE",  # various checks wrt executable files
 10 |     "I",    # sort imports
 11 |     "ICN",  # various import conventions
 12 |     "FBT",  # detect use of boolean arguments
 13 |     "FURB", # various rules
 14 |     "PERF", # various potential performance speedups
 15 |     "PD",   # pandas rules
 16 |     "PIE",  # 'misc' lints
 17 |     "PLC",  # pylint convention rules
 18 |     "PLR",  # pylint refactor rules
 19 |     "PLW",  # pylint warnings
 20 |     "PT",   # pytest stuff
 21 |     "PYI",  # various type hinting rules
 22 |     "RET",  # early returns
 23 |     "RUF",  # various ruff-specific rules
 24 |     "TID",  # various imports suggestions
 25 |     "TRY",  # various exception handling rules
 26 |     "UP",   # detect deprecated python stdlib stuff
 27 |     "FA",   # suggest using from __future__ import annotations
 28 |     "PTH",  # pathlib migration
 29 |     "ARG",  # unused argument checks
 30 |     "A",    # builtin shadowing
 31 |     "G",    # logging stuff
 32 |     # "EM",  # TODO hmm could be helpful to prevent duplicate err msg in traceback.. but kinda annoying
 33 | 
 34 |     # "ALL", # uncomment this to check for new rules!
 35 | ]
 36 | 
 37 | lint.ignore = [
 38 |     "D",     # annoying nags about docstrings
 39 |     "N",     # pep naming
 40 |     "TCH",   # type checking rules, mostly just suggests moving imports under TYPE_CHECKING
 41 |     "S",     # bandit (security checks) -- tends to be not very useful, lots of nitpicks
 42 |     "DTZ",   # datetimes checks -- complaining about missing tz and mostly false positives
 43 |     "FIX",   # complains about fixmes/todos -- annoying
 44 |     "TD",    # complains about todo formatting -- too annoying
 45 |     "ANN",   # missing type annotations? seems way too strict though
 46 | 
 47 | ### too opinionated style checks
 48 |     "E501",  # too long lines
 49 |     "E702",  # Multiple statements on one line (semicolon)
 50 |     "E731",  # assigning lambda instead of using def
 51 |     "E741",  # Ambiguous variable name: `l`
 52 |     "E742",  # Ambiguous class name: `O`
 53 |     "E401",  # Multiple imports on one line
 54 |     "F403",  # `import *` used; unable to detect undefined names
 55 | ###
 56 | 
 57 | ###
 58 |     "E722",  # Do not use bare `except` ## Sometimes it's useful for defensive imports and that sort of thing..
 59 |     "F811",  # Redefinition of unused  # this gets in the way of pytest fixtures (e.g. in cachew)
 60 | 
 61 | ## might be nice .. but later and I don't wanna make it strict
 62 |     "E402",  # Module level import not at top of file
 63 | 
 64 | ### maybe consider these soon
 65 |     # sometimes it's useful to give a variable a name even if we don't use it as a documentation
 66 |     # on the other hand, often is a sign of error
 67 |     "F841",  # Local variable `count` is assigned to but never used
 68 | ###
 69 | 
 70 |     "RUF100",  # unused noqa -- handle later
 71 |     "RUF012",  # mutable class attrs should be annotated with ClassVar... ugh pretty annoying for user configs
 72 | 
 73 | ### these are just nitpicky, we usually know better
 74 |     "PLR0911",  # too many return statements
 75 |     "PLR0912",  # too many branches
 76 |     "PLR0913",  # too many function arguments
 77 |     "PLR0915",  # too many statements
 78 |     "PLR1714",  # consider merging multiple comparisons
 79 |     "PLR2044",  # line with empty comment
 80 |     "PLR5501",  # use elif instead of else if
 81 |     "PLR2004",  # magic value in comparison -- super annoying in tests
 82 | ###
 83 |     "PLR0402",  # import X.Y as Y -- TODO maybe consider enabling it, but double check
 84 | 
 85 |     "B009",  # calling getattr with constant attribute -- this is useful to convince mypy
 86 |     "B010",  # same as above, but setattr
 87 |     "B011",  # complains about assert False
 88 |     "B017",  # pytest.raises(Exception)
 89 |     "B023",  # seems to result in false positives?
 90 |     "B028",  # suggest using explicit stacklevel? TODO double check later, but not sure it's useful
 91 | 
 92 |     # complains about useless pass, but has sort of a false positive if the function has a docstring?
 93 |     # this is common for click entrypoints (e.g. in __main__), so disable
 94 |     "PIE790",
 95 | 
 96 |     # a bit too annoying, offers to convert for loops to list comprehension
 97 |     # , which may hurt readability
 98 |     "PERF401",
 99 | 
100 |     # suggests not using exception handling in for loops
101 |     # we do use this technique a lot, plus in 3.11 happy path exception handling is "zero-cost"
102 |     "PERF203",
103 | 
104 |     "RET504", # unnecessary assignment before returning -- that can be useful for readability
105 |     "RET505", # unnecessary else after return -- can hurt readability
106 | 
107 |     "PLW0603",  # global variable update.. we usually know why we are doing this
108 |     "PLW2901",  # for loop variable overwritten, usually this is intentional
109 | 
110 |     "PT011",  # pytest.raises is too broad
111 |     "PT012",  # pytest raises should contain a single statement
112 | 
113 |     "COM812",  # trailing comma missing -- mostly just being annoying with long multiline strings
114 | 
115 |     "PD901",   # generic variable name df
116 | 
117 |     "TRY003",  # suggests defining exception messages in exception class -- kinda annoying
118 |     "TRY004",  # prefer TypeError -- don't see the point
119 |     "TRY201",  # raise without specifying exception name -- sometimes hurts readability
120 |     "TRY400",  # TODO double check this, might be useful
121 |     "TRY401",  # redundant exception in logging.exception call? TODO double check, might result in excessive logging
122 | 
123 |     "PGH",  # TODO force error code in mypy instead
124 | 
125 |     "TID252",  # Prefer absolute imports over relative imports from parent modules
126 | 
127 |     "UP038",  # suggests using | (union) in isinstance checks.. but it results in slower code
128 | 
129 |     ## too annoying
130 |     "T20",     # just complains about prints and pprints
131 |     "Q",       # flake quotes, too annoying
132 |     "C90",     # some complexity checking
133 |     "G004",    # logging statement uses f string
134 |     "ERA001",  # commented out code
135 |     "SLF001",  # private member accessed
136 |     "BLE001",  # do not catch 'blind' Exception
137 |     "INP001",  # complains about implicit namespace packages
138 |     "SIM",     # some if statements crap
139 |     "RSE102",  # complains about missing parens in exceptions
140 |     ##
141 | ]
142 | 
143 | 
144 | exclude = [
145 |     # "src/hypexport/exporthelpers",  # TODO hmm not sure if should check here?
146 |     "src/hypexport/Hypothesis",
147 | ]
148 | 


--------------------------------------------------------------------------------
/src/hypexport/dal.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | 
  3 | from collections.abc import Iterator, Sequence
  4 | from datetime import datetime
  5 | from itertools import groupby, tee
  6 | from pathlib import Path
  7 | from typing import NamedTuple
  8 | 
  9 | from .exporthelpers import dal_helper, logging_helper
 10 | from .exporthelpers.dal_helper import (
 11 |     Json,
 12 |     PathIsh,
 13 |     Res,
 14 |     datetime_aware,
 15 |     json_items,
 16 |     pathify,
 17 |     the,
 18 | )
 19 | 
 20 | logger = logging_helper.logger(__name__)
 21 | 
 22 | 
 23 | Url = str
 24 | 
 25 | 
 26 | # TODO instead, use raw json + add @property?
 27 | class Highlight(NamedTuple):
 28 |     created: datetime_aware
 29 |     title: str
 30 |     url: Url
 31 |     hid: str
 32 |     hyp_link: Url
 33 |     # highlight might be None if for instance we just marked page with tags without annotating
 34 |     # not sure if we want to handle it somehow separately
 35 |     highlight: str | None
 36 |     annotation: str | None  # user's comment
 37 |     tags: Sequence[str]
 38 | 
 39 | 
 40 | class Page(NamedTuple):
 41 |     """
 42 |     Represents annotated page along with the highlights
 43 |     """
 44 | 
 45 |     highlights: Sequence[Highlight]
 46 | 
 47 |     @property
 48 |     def url(self) -> str:
 49 |         return the(h.url for h in self.highlights)
 50 | 
 51 |     @property
 52 |     def title(self) -> str:
 53 |         return the(h.title for h in self.highlights)
 54 | 
 55 |     @property
 56 |     def created(self) -> datetime:
 57 |         return min(h.created for h in self.highlights)
 58 | 
 59 | 
 60 | class DAL:
 61 |     def __init__(self, sources: Sequence[PathIsh]) -> None:
 62 |         self.sources = list(map(pathify, sources))
 63 | 
 64 |     def _iter_raw(self):
 65 |         paths = self.sources
 66 |         total = len(paths)
 67 |         width = len(str(total))
 68 |         for idx, path in enumerate(paths):
 69 |             logger.info(f'processing [{idx:>{width}}/{total:>{width}}] {path}')
 70 |             with path.open(mode='rb') as fo:
 71 |                 first = fo.read(1)
 72 |                 old_format = first == b'['
 73 |             key = None if old_format else 'annotations'
 74 |             # annotations are in reverse chronological order, so make sense to reverse
 75 |             annotations = sorted(iter(json_items(path, key)), key=lambda j: j.get('created', ''))
 76 |             yield from annotations
 77 | 
 78 |     def highlights(self) -> Iterator[Res[Highlight]]:
 79 |         emitted = set()
 80 |         for i in self._iter_raw():
 81 |             hid = i['id']
 82 |             if hid in emitted:
 83 |                 continue
 84 |             emitted.add(hid)
 85 |             try:
 86 |                 yield self._parse_highlight(i)
 87 |             except Exception as e:
 88 |                 err = RuntimeError(i)
 89 |                 err.__cause__ = e
 90 |                 yield err
 91 | 
 92 |     def pages(self) -> Iterator[Res[Page]]:
 93 |         vit, eit = tee(self.highlights())
 94 |         # fmt: off
 95 |         values = (r for r in vit if not isinstance(r, Exception))
 96 |         errors = (r for r in eit if     isinstance(r, Exception))
 97 |         # fmt: on
 98 | 
 99 |         by_url = lambda h: h.url
100 |         by_created = lambda h: h.created
101 |         def it() -> Iterator[Page]:
102 |             for _link, git in groupby(sorted(values, key=by_url), key=by_url):
103 |                 group = sorted(git, key=by_created)
104 |                 yield Page(group)
105 | 
106 |         yield from sorted(it(), key=by_created)
107 |         yield from errors
108 | 
109 |     def _parse_highlight(self, i: Json) -> Highlight:
110 |         [tg] = i['target']  # hopefully it's always single element?
111 |         selectors = tg.get('selector', None)
112 |         if selectors is None:
113 |             # TODO warn?...
114 |             selectors = []
115 | 
116 |         highlights = [s['exact'] for s in selectors if 'exact' in s]
117 | 
118 |         # TODO warn? never happend though
119 |         assert len(highlights) <= 1
120 | 
121 |         if len(highlights) == 0:
122 |             highlight = None
123 |         else:
124 |             [highlight] = highlights
125 | 
126 |         content: str | None = None
127 |         for s in selectors:
128 |             if 'exact' in s:
129 |                 content = s['exact']
130 |                 break
131 | 
132 |         page_link = i['uri']
133 |         title = i['document'].get('title')
134 |         if title is None:
135 |             # sometimes happens, e.t. if it's plaintext file
136 |             page_title = page_link
137 |         else:
138 |             page_title = ' '.join(title)
139 |         hid = i['id']
140 |         dts = i['created']
141 |         created = datetime.strptime(dts[:-3] + dts[-2:], '%Y-%m-%dT%H:%M:%S.%f%z')
142 |         txt = i['text']
143 |         annotation = None if len(txt.strip()) == 0 else txt
144 |         context = i['links']['incontext']
145 |         return Highlight(
146 |             created=created,
147 |             url=page_link,
148 |             title=page_title,
149 |             hid=hid,
150 |             hyp_link=context,
151 |             highlight=highlight,
152 |             annotation=annotation,
153 |             tags=tuple(i['tags']),
154 |         )
155 | 
156 | 
# todo would be nice to use some fake data instead? this only gonna work under an editable install
def _testfile() -> Path:
    """
    Locate the sample export used by the smoke test.

    Looks under the 'testdata' directory three levels above this module for a
    'data/annotations.json' file; exactly one match is expected, otherwise the
    unpacking below raises ValueError.
    """
    module_path = Path(__file__).absolute()
    testdata_dir = module_path.parent.parent.parent / 'testdata'
    matches = testdata_dir.rglob('data/annotations.json')
    (only_match,) = matches
    return only_match
162 | 
163 | 
def test() -> None:
    """Smoke test: every page from the sample export parses and its properties can be read."""
    sample = _testfile()
    for page in DAL([sample]).pages():
        assert not isinstance(page, Exception), page
        # at least check it doesn't crash: touch each derived property in turn
        for attr in ('title', 'url', 'created'):
            getattr(page, attr)
        len(list(page.highlights))
173 | 
174 | 
def demo(dal: DAL) -> None:
    """
    Print a short summary of the data behind `dal`.

    Errors are printed as they are encountered, followed by the number of
    parsed pages and the ten pages with the most highlights.
    """
    from collections import Counter

    # TODO split errors properly? move it to dal_helper?
    # TODO logger?
    pages = []
    for res in dal.pages():
        if isinstance(res, Exception):
            print("ERROR! ", res)
        else:
            pages.append(res)
    print(f"Parsed {len(pages)} pages")

    # highlight count per (url, title); a repeated key keeps the latest count
    per_page = {}
    for pg in pages:
        key = (pg.url, pg.title)
        per_page[key] = len(pg.highlights)
    top = Counter(per_page).most_common(10)

    print("10 most highlighed pages:")
    for (url, title), count in top:
        print(f'{count:4d} {url} "{title}"')
197 | 
198 | 
# When run as a script, delegate to dal_helper.main, handing it the DAL class and the demo function.
if __name__ == '__main__':
    dal_helper.main(DAL=DAL, demo=demo)
201 | 


--------------------------------------------------------------------------------
/src/hypexport/export.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | 
 3 | from .exporthelpers.export_helper import Parser, setup_parser
 4 | from .Hypothesis import hypothesis
 5 | 
 6 | 
 7 | class Exporter:
 8 |     def __init__(self, *args, **kwargs) -> None:
 9 |         kwargs['max_search_results'] = 10000
10 |         self.api = hypothesis.Hypothesis(*args, **kwargs)
11 |         # TODO not sure why max_search_results is set to 2000 in Hypothesis package; documentation says 9800 is the max for offset? Ask judell
12 |         self.user = kwargs['username']
13 | 
14 |     def export_json(self):
15 |         profile = self.api.authenticated_api_query(self.api.api_url + '/profile')
16 |         annotations = list(self.api.search_all({'user': self.user}))
17 |         return {
18 |             'profile': profile,
19 |             'annotations': annotations,
20 |         }
21 | 
22 | 
def get_json(**params):
    """Build an Exporter from the given keyword parameters and return its JSON export."""
    exporter = Exporter(**params)
    return exporter.export_json()
25 | 
26 | 
def main() -> None:
    """Command-line entry point: parse arguments, fetch the export and pass it to the dumper as JSON text."""
    args = make_parser().parse_args()
    # read both attributes up front, before any request is made
    params, dumper = args.params, args.dumper

    exported = get_json(**params)
    dumper(json.dumps(exported, ensure_ascii=False, indent=2, sort_keys=True))
37 | 
38 | 
def make_parser():
    """Create the command-line parser for the exporter, configured for the 'username' and 'token' parameters."""
    description = 'Export/takeout for your personal [[https://hypothes.is][Hypothes.is]] data: annotations and profile information.'
    usage_note = '''
You can also import ~hypexport.export~ as a module and call ~get_json~ function directly to get raw JSON.
'''
    parser = Parser(description)
    setup_parser(parser=parser, params=['username', 'token'], extra_usage=usage_note)
    return parser
49 | 
50 | 
# Run the exporter CLI when this module is executed as a script.
if __name__ == '__main__':
    main()
53 | 


--------------------------------------------------------------------------------
/src/hypexport/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/karlicoss/hypexport/218093d714ed82d5663f312219c3813a1ad2cb0d/src/hypexport/py.typed


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | [tox]
 2 | minversion = 3.21
 3 | # relies on the correct version of Python installed
 4 | envlist = ruff,tests,mypy
 5 | # https://github.com/tox-dev/tox/issues/20#issuecomment-247788333
 6 | # hack to prevent .tox from crapping to the project directory
 7 | toxworkdir = {env:TOXWORKDIR_BASE:}{toxinidir}/.tox
 8 | 
 9 | [testenv]
10 | # TODO how to get package name from setuptools?
11 | package_name = "hypexport"
12 | passenv =
13 | # useful for tests to know they are running under ci
14 |     CI
15 |     CI_*
16 | # respect user's cache dirs to prevent tox from crapping into project dir
17 |     PYTHONPYCACHEPREFIX
18 |     MYPY_CACHE_DIR
19 |     RUFF_CACHE_DIR
20 | usedevelop = true  # for some reason tox seems to ignore "-e ." in deps section??
21 | uv_seed = true  # seems necessary so uv creates separate venvs per tox env?
22 | 
23 | 
24 | [testenv:ruff]
25 | dependency_groups = testing
26 | commands =
27 |     {envpython} -m ruff check src/
28 | 
29 | 
30 | [testenv:tests]
31 | dependency_groups = testing
32 | deps =
33 |     -e .[export]
34 | commands =
35 |     {envpython} -m pytest \
36 |         --pyargs {[testenv]package_name} \
37 |         {posargs}
38 | 
39 | 
40 | [testenv:mypy]
41 | dependency_groups = testing
42 | deps =
43 |     -e .[optional]
44 | commands =
45 |     {envpython} -m mypy --no-install-types \
46 |         -p {[testenv]package_name}       \
47 |         # txt report is a bit more convenient to view on CI
48 |         --txt-report  .coverage.mypy     \
49 |         --html-report .coverage.mypy     \
50 |         {posargs}
51 | 


--------------------------------------------------------------------------------