├── src
    └── goodrexport
    │   ├── py.typed
    │   ├── dal.py
    │   └── export.py
├── .gitmodules
├── mypy.ini
├── pytest.ini
├── .ci
    └── run
├── LICENSE
├── pyproject.toml
├── tox.ini
├── conftest.py
├── README.org
├── .gitignore
├── .github
    └── workflows
    │   └── main.yml
└── ruff.toml


/src/goodrexport/py.typed:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "src/goodrexport/exporthelpers"]
2 | 	path = src/goodrexport/exporthelpers
3 | 	url = https://github.com/karlicoss/exporthelpers.git
4 | 


--------------------------------------------------------------------------------
/mypy.ini:
--------------------------------------------------------------------------------
 1 | [mypy]
 2 | pretty = True
 3 | show_error_context = True
 4 | show_column_numbers = True
 5 | show_error_end = True
 6 | 
 7 | check_untyped_defs = True
 8 | 
 9 | # see https://mypy.readthedocs.io/en/stable/error_code_list2.html
10 | warn_redundant_casts = True
11 | strict_equality = True
12 | warn_unused_ignores = True
13 | enable_error_code = deprecated,redundant-expr,possibly-undefined,truthy-bool,truthy-iterable,ignore-without-code,unused-awaitable
14 | 
15 | 
16 | # an example of suppressing
17 | # [mypy-my.config.repos.pdfannots.pdfannots]
18 | # ignore_errors = True
19 | 


--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
 1 | [pytest]
 2 | # discover files that don't follow test_ naming. Useful to keep tests along with the source code
 3 | python_files = *.py
 4 | 
 5 | # this setting only impacts package/module naming under pytest, not the discovery
 6 | consider_namespace_packages = true
 7 | 
 8 | addopts =
 9 |   # prevent pytest cache from being created... it craps into project dir and I never use it anyway
10 |   -p no:cacheprovider
11 | 
12 |   # -rap to print tests summary even when they are successful
13 |   -rap
14 |   --verbose
15 | 
16 |   # otherwise it won't discover doctests
17 |   --doctest-modules
18 | 
19 |   # show all test durations (unless they are too short)
20 |   --durations=0
21 | 


--------------------------------------------------------------------------------
/.ci/run:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -eu
 3 | 
 4 | cd "$(dirname "$0")"
 5 | cd .. # git root
 6 | 
 7 | if ! command -v sudo; then
 8 |     # some CI/Docker images ship without sudo; this stand-in simply runs the command as is
 9 |     sudo() {
10 |         "$@"
11 |     }
12 | fi
13 | 
14 | # tox subcommand with its flags; --parallel-live shows outputs while it's running
15 | tox_cmd=(run-parallel --parallel-live)
16 | if [[ -n "${CI-}" ]]; then
17 |     # install OS specific stuff here
18 |     case "$OSTYPE" in
19 |     darwin*)
20 |         # macos
21 |         :
22 |         ;;
23 |     cygwin* | msys* | win*)
24 |         # windows
25 |         # ugh. parallel stuff seems super flaky under windows, some random failures, "file used by other process" and crap like that
26 |         tox_cmd=(run)
27 |         ;;
28 |     *)
29 |         # must be linux?
30 |         :
31 |         ;;
32 |     esac
33 | fi
34 | 
35 | # NOTE: expects uv installed; extra script arguments are forwarded to tox
36 | uv tool run --with tox-uv tox "${tox_cmd[@]}" "$@"
37 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Dmitrii Gerasimov
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | # see https://github.com/karlicoss/pymplate for up-to-date reference
 2 | [project]
 3 | dynamic = ["version"]  # version is managed by build backend
 4 | name = "goodrexport"
 5 | dependencies = [
 6 |     "goodrexport[dal]",  # TODO backwards compatibility -- remove later?
 7 | ]
 8 | requires-python = ">=3.9"
 9 | 
10 | ## these need to be set if you're planning to upload to pypi
11 | # description = "TODO"
12 | # license = {file = "LICENSE"}
13 | # authors = [
14 | #     {name = "Dima Gerasimov (@karlicoss)", email = "karlicoss@gmail.com"},
15 | # ]
16 | # maintainers = [
17 | #     {name = "Dima Gerasimov (@karlicoss)", email = "karlicoss@gmail.com"},
18 | # ]
19 | #
20 | # [project.urls]
21 | # Homepage = "https://github.com/karlicoss/pymplate"
22 | ##
23 | 
24 | 
25 | [project.optional-dependencies]
26 | dal = ["lxml"]
27 | export = []
28 | optional = [
29 |     "orjson",  # faster json processing
30 |     "colorlog",
31 |     "ijson",  # faster iterative json processing
32 | ]
33 | [dependency-groups]
34 | # TODO: not sure, on the one hand could just use 'standard' dev dependency group
35 | # On the other hand, it's a bit annoying that it's always included by default? 
36 | # To make sure it's not included, need to use `uv run --exact --no-default-groups ...`
37 | testing = [
38 |     "pytest",
39 |     "ruff",
40 |     "mypy",
41 |     "lxml", # for mypy html coverage
42 |     "ty>=0.0.1a15",
43 | 
44 |     "lxml-stubs",
45 | ]
46 | 
47 | 
48 | [build-system]
49 | requires = ["hatchling", "hatch-vcs"]
50 | build-backend = "hatchling.build"
51 | 
52 | # unfortunately have to duplicate project name here atm, see https://github.com/pypa/hatch/issues/1894
53 | [tool.hatch.build.targets.wheel]
54 | packages = ["src/goodrexport"]
55 | 
56 | [tool.hatch.version]
57 | source = "vcs"
58 | 
59 | [tool.hatch.version.raw-options]
60 | version_scheme = "python-simplified-semver"
61 | local_scheme = "dirty-tag"
62 | 
63 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | [tox]
 2 | minversion = 3.21
 3 | # relies on the correct version of Python installed
 4 | envlist = ruff,tests,mypy,ty
 5 | # https://github.com/tox-dev/tox/issues/20#issuecomment-247788333
 6 | # hack to prevent .tox from crapping to the project directory
 7 | toxworkdir = {env:TOXWORKDIR_BASE:}{toxinidir}/.tox
 8 | 
 9 | [testenv]
10 | # TODO how to get package name from setuptools?
11 | package_name = "goodrexport"
12 | pass_env =
13 | # useful for tests to know they are running under ci
14 |     CI
15 |     CI_*
16 | # respect user's cache dirs to prevent tox from crapping into project dir
17 |     PYTHONPYCACHEPREFIX
18 |     MYPY_CACHE_DIR
19 |     RUFF_CACHE_DIR
20 | 
21 | set_env =
22 | # do not add current working directory to pythonpath
23 | # generally this is more robust and safer, prevents weird issues later on
24 |     PYTHONSAFEPATH=1
25 | 
26 | # default is 'editable', in which tox builds wheel first for some reason? not sure if makes much sense
27 | package = uv-editable
28 | 
29 | 
30 | [testenv:ruff]
31 | skip_install = true
32 | dependency_groups = testing
33 | commands =
34 |     {envpython} -m ruff check \
35 |         {posargs}
36 | 
37 | 
38 | [testenv:tests]
39 | dependency_groups = testing
40 | commands =
41 |     # posargs allow test filtering, e.g. tox ... -- -k test_name
42 |     {envpython} -m pytest \
43 |         --pyargs {[testenv]package_name} \
44 |         {posargs}
45 | 
46 | 
47 | [testenv:mypy]
48 | dependency_groups = testing
49 | extras = optional
50 | commands =
51 |     {envpython} -m mypy --no-install-types \
52 |         -p {[testenv]package_name}       \
53 |         --txt-report           .coverage.mypy \
54 |         --html-report          .coverage.mypy \
55 |         # this is for github actions to upload to codecov.io
56 |         # sadly xml coverage crashes on windows... so we need to disable it
57 |         {env:CI_MYPY_COVERAGE} \
58 |         {posargs}
59 | 
60 | 
61 | [testenv:ty]
62 | dependency_groups = testing
63 | extras = optional
64 | deps =  # any other dependencies (if needed)
65 | commands =
66 |     {envpython} -m ty \
67 |         check \
68 |         {posargs}
69 | 


--------------------------------------------------------------------------------
/conftest.py:
--------------------------------------------------------------------------------
 1 | # this is a hack to monkey patch pytest so it handles tests inside namespace packages without __init__.py properly
 2 | # without it, pytest can't discover the package root for some reason
 3 | # also see https://github.com/karlicoss/pytest_namespace_pkgs for more
 4 | 
 5 | import os
 6 | import pathlib
 7 | from typing import Optional
 8 | 
 9 | import _pytest.main
10 | import _pytest.pathlib
11 | 
12 | # we consider all dirs directly under repo/src to be namespace packages
13 | root_dir = pathlib.Path(__file__).absolute().parent.resolve() / 'src'
14 | assert root_dir.exists(), root_dir
15 | 
16 | # TODO assert it contains package name?? maybe get it via setuptools..
17 | 
18 | namespace_pkg_dirs = [str(d) for d in root_dir.iterdir() if d.is_dir()]
19 | 
20 | # resolve_package_path is called from _pytest.pathlib.import_path
21 | # takes a full abs path to the test file and needs to return the path to the 'root' package on the filesystem
22 | resolve_pkg_path_orig = _pytest.pathlib.resolve_package_path
23 | 
24 | 
25 | def resolve_package_path(path: pathlib.Path) -> Optional[pathlib.Path]:
26 |     result = path  # search from the test file upwards
27 |     for parent in result.parents:
28 |         if str(parent) in namespace_pkg_dirs:
29 |             return parent
30 |     if os.name == 'nt':
31 |         # ??? for some reason on windows it is trying to call this against conftest? but not on linux/osx
32 |         if path.name == 'conftest.py':
33 |             return resolve_pkg_path_orig(path)
34 |     raise RuntimeError("Couldn't determine path for ", path)
35 | 
36 | 
37 | # NOTE: seems like it's not necessary anymore?
38 | # keeping it for now just in case
39 | # after https://github.com/pytest-dev/pytest/pull/13426 we should be able to remove the whole conftest
40 | # _pytest.pathlib.resolve_package_path = resolve_package_path
41 | 
42 | 
43 | # without patching, the orig function returns just a package name for some reason
44 | # (I think it's used as a sort of fallback)
45 | # so we need to point it at the absolute path properly
46 | # not sure what are the consequences.. maybe it wouldn't be able to run against installed packages? not sure..
47 | search_pypath_orig = _pytest.main.search_pypath
48 | 
49 | 
50 | def search_pypath(module_name: str) -> str:
51 |     mpath = root_dir / module_name.replace('.', os.sep)
52 |     if not mpath.is_dir():
53 |         mpath = mpath.with_suffix('.py')
54 |         assert mpath.exists(), mpath  # just in case
55 |     return str(mpath)
56 | 
57 | 
58 | _pytest.main.search_pypath = search_pypath  # ty: ignore[invalid-assignment]
59 | 


--------------------------------------------------------------------------------
/README.org:
--------------------------------------------------------------------------------
 1 | #+begin_src python :dir src :results drawer :exports results
 2 | import goodrexport.export as E; return E.make_parser().prog
 3 | #+end_src
 4 | 
 5 | #+RESULTS:
 6 | :results:
 7 | Export/takeout for your personal Goodreads data
 8 | :end:
 9 | 
10 | * Setting up
11 | 1. The easiest way is =pip3 install --user git+https://github.com/karlicoss/goodrexport=.
12 | 
13 |    Alternatively, use =git clone --recursive=, or =git pull && git submodule update --init=. After that, you can use =pip3 install --editable .= from the repository root.
14 | 2. [[https://www.goodreads.com/api/keys][get your API keys]]
15 | 
16 | * Exporting
17 | 
18 | #+begin_src python :dir src :results drawer :exports results
19 | import goodrexport.export as E; return E.make_parser().epilog
20 | #+end_src
21 | 
22 | #+RESULTS:
23 | :results:
24 | 
25 | Usage:
26 | 
27 | *Recommended*: create =secrets.py= keeping your api parameters, e.g.:
28 | 
29 | 
30 | : user_id = "USER_ID"
31 | : key = "KEY"
32 | 
33 | 
34 | After that, use:
35 | 
36 | : python3 -m goodrexport.export --secrets /path/to/secrets.py
37 | 
38 | That way you type less and have control over where you keep your plaintext secrets.
39 | 
40 | *Alternatively*, you can pass parameters directly, e.g.
41 | 
42 | : python3 -m goodrexport.export --user_id <user_id> --key <key>
43 | 
44 | However, this is verbose and prone to leaking your keys/tokens/passwords in shell history.
45 | 
46 | 
47 | You can also import ~goodrexport.export~ as a module and call ~get_xml~ function directly to get raw XML.
48 | 
49 | 
50 | I *highly* recommend checking exported files at least once just to make sure they contain everything you expect from your export. If not, please feel free to ask or raise an issue!
51 | 
52 | :end:
53 | 
54 | * Using data
55 |   
56 | #+begin_src python :dir src  :results drawer :exports results
57 | import goodrexport.exporthelpers.dal_helper as D; return D.make_parser().epilog
58 | #+end_src
59 | 
60 | #+RESULTS:
61 | :results:
62 | 
63 | You can use =goodrexport.dal= (stands for "Data Access/Abstraction Layer") to access your exported data, even offline.
64 | I elaborate on motivation behind it [[https://beepb00p.xyz/exports.html#dal][here]].
65 | 
66 | - main usecase is to be imported as python module to allow for *programmatic access* to your data.
67 | 
68 |   You can find some inspiration in [[https://beepb00p.xyz/mypkg.html][=my.=]] package that I'm using as an API to all my personal data.
69 | 
70 | - to test it against your export, simply run: ~python3 -m goodrexport.dal --source /path/to/export~
71 | 
72 | - you can also try it interactively: ~python3 -m goodrexport.dal --source /path/to/export --interactive~
73 | 
74 | :end:
75 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | 
  2 | # Created by https://www.gitignore.io/api/python,emacs
  3 | 
  4 | ### Emacs ###
  5 | # -*- mode: gitignore; -*-
  6 | *~
  7 | \#*\#
  8 | /.emacs.desktop
  9 | /.emacs.desktop.lock
 10 | *.elc
 11 | auto-save-list
 12 | tramp
 13 | .\#*
 14 | 
 15 | # Org-mode
 16 | .org-id-locations
 17 | *_archive
 18 | 
 19 | # flymake-mode
 20 | *_flymake.*
 21 | 
 22 | # eshell files
 23 | /eshell/history
 24 | /eshell/lastdir
 25 | 
 26 | # elpa packages
 27 | /elpa/
 28 | 
 29 | # reftex files
 30 | *.rel
 31 | 
 32 | # AUCTeX auto folder
 33 | /auto/
 34 | 
 35 | # cask packages
 36 | .cask/
 37 | dist/
 38 | 
 39 | # Flycheck
 40 | flycheck_*.el
 41 | 
 42 | # server auth directory
 43 | /server/
 44 | 
 45 | # projectiles files
 46 | .projectile
 47 | 
 48 | # directory configuration
 49 | .dir-locals.el
 50 | 
 51 | ### Python ###
 52 | # Byte-compiled / optimized / DLL files
 53 | __pycache__/
 54 | *.py[cod]
 55 | *$py.class
 56 | 
 57 | # C extensions
 58 | *.so
 59 | 
 60 | # Distribution / packaging
 61 | .Python
 62 | build/
 63 | develop-eggs/
 64 | downloads/
 65 | eggs/
 66 | .eggs/
 67 | lib/
 68 | lib64/
 69 | parts/
 70 | sdist/
 71 | var/
 72 | wheels/
 73 | *.egg-info/
 74 | .installed.cfg
 75 | *.egg
 76 | MANIFEST
 77 | 
 78 | # PyInstaller
 79 | #  Usually these files are written by a python script from a template
 80 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 81 | *.manifest
 82 | *.spec
 83 | 
 84 | # Installer logs
 85 | pip-log.txt
 86 | pip-delete-this-directory.txt
 87 | 
 88 | # Unit test / coverage reports
 89 | htmlcov/
 90 | .tox/
 91 | .coverage
 92 | .coverage.*
 93 | .cache
 94 | nosetests.xml
 95 | coverage.xml
 96 | *.cover
 97 | .hypothesis/
 98 | .pytest_cache/
 99 | 
100 | # Translations
101 | *.mo
102 | *.pot
103 | 
104 | # Django stuff:
105 | *.log
106 | local_settings.py
107 | db.sqlite3
108 | 
109 | # Flask stuff:
110 | instance/
111 | .webassets-cache
112 | 
113 | # Scrapy stuff:
114 | .scrapy
115 | 
116 | # Sphinx documentation
117 | docs/_build/
118 | 
119 | # PyBuilder
120 | target/
121 | 
122 | # Jupyter Notebook
123 | .ipynb_checkpoints
124 | 
125 | # IPython
126 | profile_default/
127 | ipython_config.py
128 | 
129 | # pyenv
130 | .python-version
131 | 
132 | # celery beat schedule file
133 | celerybeat-schedule
134 | 
135 | # SageMath parsed files
136 | *.sage.py
137 | 
138 | # Environments
139 | .env
140 | .venv
141 | env/
142 | venv/
143 | ENV/
144 | env.bak/
145 | venv.bak/
146 | 
147 | # Spyder project settings
148 | .spyderproject
149 | .spyproject
150 | 
151 | # Rope project settings
152 | .ropeproject
153 | 
154 | # mkdocs documentation
155 | /site
156 | 
157 | # mypy
158 | .mypy_cache/
159 | .dmypy.json
160 | dmypy.json
161 | 
162 | ### Python Patch ###
163 | .venv/
164 | 
165 | ### Python.VirtualEnv Stack ###
166 | # Virtualenv
167 | # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
168 | [Bb]in
169 | [Ii]nclude
170 | [Ll]ib
171 | [Ll]ib64
172 | [Ll]ocal
173 | [Ss]cripts
174 | pyvenv.cfg
175 | pip-selfcheck.json
176 | 
177 | 
178 | # End of https://www.gitignore.io/api/python,emacs
179 | 


--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
 1 | # see https://github.com/karlicoss/pymplate for up-to-date reference
 2 | 
 3 | name: CI
 4 | on:
 5 |   push:
 6 |     branches: '*'
 7 |     tags: 'v[0-9]+.*' # only trigger on 'release' tags for PyPi
 8 |     # Ideally I would put this in the pypi job... but github syntax doesn't allow for regexes there :shrug:
 9 | 
10 |   # Needed to trigger on others' PRs.
11 |   # Note that people who fork it need to go to "Actions" tab on their fork and click "I understand my workflows, go ahead and enable them".
12 |   pull_request:
13 | 
14 |   # Needed to trigger workflows manually.
15 |   workflow_dispatch:
16 |     inputs:
17 |       debug_enabled:
18 |         type: boolean
19 |         description: 'Run the build with tmate debugging enabled (https://github.com/marketplace/actions/debugging-with-tmate)'
20 |         required: false
21 |         default: false
22 | 
23 |   schedule:
24 |     - cron: '31 18 * * 5'  # run every Friday
25 | 
26 | 
27 | jobs:
28 |   build:
29 |     strategy:
30 |       fail-fast: false
31 |       matrix:
32 |         platform: [ubuntu-latest, macos-latest, windows-latest]
33 |         python-version: ['3.9', '3.10', '3.11', '3.12', '3.13']
34 |         exclude: [
35 |             # windows runners are pretty scarce, so let's only run lowest and highest python version
36 |             {platform: windows-latest, python-version: '3.10'},
37 |             {platform: windows-latest, python-version: '3.11'},
38 |             {platform: windows-latest, python-version: '3.12'},
39 | 
40 |             # same, macos is a bit too slow and ubuntu covers python quirks well
41 |             {platform: macos-latest  , python-version: '3.10'},
42 |             {platform: macos-latest  , python-version: '3.11'},
43 |             {platform: macos-latest  , python-version: '3.12'},
44 |         ]
45 | 
46 |     runs-on: ${{ matrix.platform }}
47 | 
48 |     # useful for 'optional' pipelines
49 |     # continue-on-error: ${{ matrix.platform == 'windows-latest' }}
50 | 
51 |     steps:
52 |     # ugh https://github.com/actions/toolkit/blob/main/docs/commands.md#path-manipulation
53 |     - run: echo "$HOME/.local/bin" >> $GITHUB_PATH
54 | 
55 |     - uses: actions/checkout@v4
56 |       with:
57 |         submodules: recursive
58 |         fetch-depth: 0  # nicer to have all git history when debugging/for tests
59 | 
60 |     - uses: actions/setup-python@v5
61 |       with:
62 |         python-version: ${{ matrix.python-version }}
63 |       
64 |     - uses: astral-sh/setup-uv@v5
65 |       with:
66 |         enable-cache: false  # we don't have lock files, so can't use them as cache key
67 | 
68 |     - uses: mxschmitt/action-tmate@v3
69 |       if: ${{ github.event_name == 'workflow_dispatch' && inputs.debug_enabled }}
70 | 
71 |     # explicit bash command is necessary for Windows CI runner, otherwise it thinks it's cmd...
72 |     - run: bash .ci/run
73 |       env:
74 |         # only compute mypy's xml (cobertura) coverage report on ubuntu; it crashes on windows
75 |         CI_MYPY_COVERAGE: ${{ matrix.platform == 'ubuntu-latest' && '--cobertura-xml-report .coverage.mypy' || '' }}
76 | 
77 |     - if: matrix.platform == 'ubuntu-latest'  # no need to compute coverage for other platforms
78 |       uses: codecov/codecov-action@v5
79 |       with:
80 |         fail_ci_if_error: true  # default false
81 |         token: ${{ secrets.CODECOV_TOKEN }}
82 |         flags: mypy-${{ matrix.python-version }}
83 |         files: .coverage.mypy/cobertura.xml
84 | 
85 | 


--------------------------------------------------------------------------------
/src/goodrexport/dal.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | 
  3 | from collections.abc import Iterator, Sequence
  4 | from datetime import datetime, timezone
  5 | from typing import NamedTuple, Optional
  6 | 
  7 | from lxml import etree as ET
  8 | 
  9 | from .exporthelpers import dal_helper
 10 | from .exporthelpers.dal_helper import PathIsh, datetime_aware, pathify, the
 11 | 
 12 | 
 13 | class Book(NamedTuple):  # immutable record describing the book a review refers to
 14 |     id: str  # identifier, filled in by _parse_review
 15 |     title: str  # text of the book's <title>
 16 |     authors: Sequence[str]  # author names (authors/author/name under <book>)
 17 |     shelves: Sequence[str]  # 'name' attributes of the review's shelves/shelf nodes
 18 |     date_added: datetime_aware  # parsed from the review's <date_added>; always set
 19 |     date_started: Optional[datetime_aware]  # None when <started_at> has no text
 20 |     date_read: Optional[datetime_aware]  # None when <read_at> has no text
 21 | 
 22 | 
 23 | class Review(NamedTuple):  # one <review> entry of the export
 24 |     id: str  # identifier of the review, taken from its <id> child
 25 |     book: Book  # built from the review's <book> child plus the review's dates and shelves
 26 | 
 27 | 
 28 | def _parse_date(s: Optional[str]) -> Optional[datetime_aware]:
 29 |     if s is None:
 30 |         return None
 31 |     res = datetime.strptime(s, "%a %b %d %H:%M:%S %z %Y")
 32 |     assert res.tzinfo is not None
 33 |     return res
 34 | 
 35 | 
 36 | def _parse_review(r) -> Review:
 37 |     rid = the(r.xpath('id'))
 38 |     be = the(r.xpath('book'))
 39 |     title = the(be.xpath('title/text()'))
 40 |     authors = be.xpath('authors/author/name/text()')
 41 | 
 42 |     bid = the(r.xpath('id/text()'))
 43 |     # isbn_element   = the(book_element.getElementsByTagName('isbn'))
 44 |     # isbn13_element = the(book_element.getElementsByTagName('isbn13'))
 45 |     date_added = the(r.xpath('date_added/text()'))
 46 |     sss = r.xpath('started_at/text()')
 47 |     rrr = r.xpath('read_at/text()')
 48 |     started_at = None if len(sss) == 0 else the(sss)
 49 |     read_at = None if len(rrr) == 0 else the(rrr)
 50 | 
 51 |     shelves = [s.attrib['name'] for s in r.xpath('shelves/shelf')]
 52 | 
 53 |     # if isbn_element.getAttribute('nil') != 'true':
 54 |     #     book['isbn'] = isbn_element.firstChild.data
 55 |     # else:
 56 |     #     book['isbn'] = ''
 57 | 
 58 |     # if isbn13_element.getAttribute('nil') != 'true':
 59 |     #     book['isbn13'] = isbn13_element.firstChild.data
 60 |     # else:
 61 |     #     book['isbn13'] = ''
 62 | 
 63 |     da = _parse_date(date_added)
 64 |     assert da is not None
 65 |     book = Book(
 66 |         id=bid,
 67 |         title=title,
 68 |         authors=authors,
 69 |         shelves=shelves,
 70 |         date_added=da,
 71 |         date_started=_parse_date(started_at),
 72 |         date_read=_parse_date(read_at),
 73 |     )
 74 |     return Review(
 75 |         id=rid,
 76 |         book=book,
 77 |     )
 78 | 
 79 | 
 80 | class DAL:
 81 |     def __init__(self, sources: Sequence[PathIsh]) -> None:
 82 |         self.sources = list(map(pathify, sources))
 83 |         # TODO take all sources into the account?
 84 |         self._source = max(self.sources)
 85 | 
 86 |     def reviews(self) -> Iterator[Review]:
 87 |         tree = ET.fromstring(self._source.read_text())
 88 |         rxml = tree.xpath('//review')
 89 |         for r in rxml:  # type: ignore[union-attr]
 90 |             yield _parse_review(r)
 91 | 
 92 | 
 93 | def demo(dal: DAL) -> None:
 94 |     print("Your books:")
 95 | 
 96 |     mindt = datetime.min.replace(tzinfo=timezone.utc)
 97 |     reviews = sorted(dal.reviews(), key=lambda r: r.book.date_read or mindt)
 98 |     for r in reviews:
 99 |         print(r.book.date_read, r.book.title)
100 | 
101 | 
102 | def main() -> None:  # CLI entry point: hands the DAL class and the demo callback over to dal_helper.main
103 |     dal_helper.main(DAL=DAL, demo=demo)
104 | 
105 | 
106 | if __name__ == '__main__':
107 |     main()
108 | 


--------------------------------------------------------------------------------
/src/goodrexport/export.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | 
  3 | import argparse
  4 | from textwrap import dedent
  5 | from urllib.parse import urlencode
  6 | from urllib.request import urlopen
  7 | from xml.dom.minidom import parse
  8 | 
  9 | from .exporthelpers.export_helper import Parser, setup_parser
 10 | 
 11 | # https://www.goodreads.com/api
 12 | # TODO maybe add these too?
 13 | # auth.user   —   Get id of user who authorized OAuth.
 14 | # group.list   —   List groups for a given user.
 15 | # owned_books.list   —   List books owned by a user.
 16 | # reviews.list   —   Get the books on a members shelf.
 17 | # review.show   —   Get a review.
 18 | # review.show_by_user_and_book   —   Get a user's review for a given book.
 19 | # shelves.list   —   Get a user's shelves.
 20 | # user.show   —   Get info about a member by id or username.
 21 | # user.followers   —   Get a user's followers.
 22 | # user.following   —   Get people a user is following.
 23 | # user.friends   —   Get a user's friends.
 24 | 
 25 | 
 26 | class Exporter:
 27 |     def __init__(self, *args, **kwargs) -> None:  # noqa: ARG002
 28 |         self.base_url = 'https://www.goodreads.com/'
 29 |         self.user_id = kwargs['user_id']
 30 |         self.key = kwargs['key']
 31 |         self.per_page = 200
 32 | 
 33 |     # apparently no json... https://www.goodreads.com/topic/show/1663342-json-endpoints
 34 |     def _get(self, endpoint: str, **kwargs):
 35 |         current_page = 1
 36 |         total = None
 37 | 
 38 |         results = []  # type: ignore[var-annotated]
 39 |         while total is None or len(results) < total:
 40 |             query = urlencode(
 41 |                 [
 42 |                     ('v', '2'),
 43 |                     ('key', self.key),
 44 |                     ('per_page', self.per_page),
 45 |                     ('page', current_page),
 46 |                     *kwargs.items(),
 47 |                 ]
 48 |             )
 49 |             url = self.base_url + endpoint + '.xml?' + query
 50 |             chunk = parse(urlopen(url))
 51 | 
 52 |             [curr] = chunk.getElementsByTagName('reviews')
 53 |             total = int(curr.getAttribute('total'))
 54 |             results.extend(curr.getElementsByTagName('review'))
 55 |             current_page += 1
 56 |         return results
 57 | 
 58 |     def export_xml(self) -> str:
 59 |         nodes = []
 60 |         for node_name, endpoint in [
 61 |             ## TODO looks like friends require oauth..
 62 |             # 'friend/user/' + self.user,
 63 |             # https://gist.github.com/gpiancastelli/537923
 64 |             ##
 65 |             ## TODO shelves are a mess too...
 66 |             # 'shelf/list',
 67 |             # <shelves end="3" start="1" total="3"> <user_shelf>
 68 |             ('reviews', 'review/list'),
 69 |         ]:
 70 |             results = self._get(endpoint, id=self.user_id)
 71 |             body = ''.join(x.toprettyxml() for x in results)
 72 |             # eh, not sure why toprettyxml adds so many newlines.. whatever
 73 |             nodes.append(
 74 |                 dedent(f'''
 75 |             <{node_name}>
 76 |             {body}
 77 |             </{node_name}>
 78 |             ''')
 79 |             )
 80 |         nodess = ''.join(nodes)
 81 |         return dedent(f'''
 82 |                <export>
 83 |                {nodess}
 84 |                </export>''')
 85 | 
 86 | 
 87 | def get_xml(**params):  # convenience wrapper: builds an Exporter from params and returns its XML export
 88 |     return Exporter(**params).export_xml()
 89 | 
 90 | 
 91 | def make_parser() -> argparse.ArgumentParser:
 92 |     parser = Parser('Export/takeout for your personal Goodreads data')
 93 |     setup_parser(
 94 |         parser,
 95 |         params=['user_id', 'key'],
 96 |         # TODO not sure if worth automating?
 97 |         extra_usage='''
 98 | You can also import ~goodrexport.export~ as a module and call ~get_xml~ function directly to get raw XML.
 99 |         ''',
100 |     )
101 |     return parser
102 | 
103 | 
104 | def main() -> None:
105 |     parser = make_parser()
106 |     args = parser.parse_args()
107 | 
108 |     params = args.params
109 |     dumper = args.dumper
110 | 
111 |     x = get_xml(**params)
112 |     dumper(x)
113 | 
114 | 
115 | if __name__ == '__main__':
116 |     main()
117 | 


--------------------------------------------------------------------------------
/ruff.toml:
--------------------------------------------------------------------------------
  1 | lint.extend-select = [
  2 |     "F",    # flakes rules -- default, but extend just in case
  3 |     "E",    # pycodestyle  -- default, but extend just in case
  4 |     "W",    # various warnings
  5 | 
  6 |     "B",    # 'bugbear' set -- various possible bugs
  7 |     "C4",   # flake8-comprehensions -- unnecessary list/map/dict calls
  8 |     "COM",  # trailing commas
  9 |     "EXE",  # various checks wrt executable files
 10 |     "I",    # sort imports
 11 |     "ICN",  # various import conventions
 12 |     "FBT",  # detect use of boolean arguments
 13 |     "FURB", # various rules
 14 |     "PERF", # various potential performance speedups
 15 |     "PD",   # pandas rules
 16 |     "PIE",  # 'misc' lints
 17 |     "PLC",  # pylint convention rules
 18 |     "PLR",  # pylint refactor rules
 19 |     "PLW",  # pylint warnings
 20 |     "PT",   # pytest stuff
 21 |     "PYI",  # various type hinting rules
 22 |     "RET",  # early returns
 23 |     "RUF",  # various ruff-specific rules
 24 |     "TID",  # various imports suggestions
 25 |     "TRY",  # various exception handling rules
 26 |     "UP",   # detect deprecated python stdlib stuff
 27 |     "FA",   # suggest using from __future__ import annotations
 28 |     "PTH",  # pathlib migration
 29 |     "ARG",  # unused argument checks
 30 |     "A",    # builtin shadowing
 31 |     "G",    # logging stuff
 32 | 
 33 |     # "ALL", # uncomment this to check for new rules!
 34 | ]
 35 | 
 36 | # Preserve types, even if a file imports `from __future__ import annotations`
 37 | # we need this for cachew to work with HPI types on 3.9
 38 | # can probably remove after 3.10?
 39 | lint.pyupgrade.keep-runtime-typing = true
 40 | 
 41 | lint.ignore = [
 42 |     "D",     # annoying nags about docstrings
 43 |     "N",     # pep naming
 44 |     "TCH",   # type checking rules, mostly just suggests moving imports under TYPE_CHECKING
 45 |     "S",     # bandit (security checks) -- tends to be not very useful, lots of nitpicks
 46 |     "DTZ",   # datetimes checks -- complaining about missing tz and mostly false positives
 47 |     "FIX",   # complains about fixmes/todos -- annoying
 48 |     "TD",    # complains about todo formatting -- too annoying
 49 |     "ANN",   # missing type annotations? seems way too strict though
 50 |     "EM" ,   # suggests assigning all exception messages into a variable first... pretty annoying
 51 | 
 52 | ### too opinionated style checks
 53 |     "E501",  # too long lines
 54 |     "E702",  # Multiple statements on one line (semicolon)
 55 |     "E731",  # assigning lambda instead of using def
 56 |     "E741",  # Ambiguous variable name: `l`
 57 |     "E742",  # Ambiguous class name: `O`
 58 |     "E401",  # Multiple imports on one line
 59 |     "F403",  # `import *` used; unable to detect undefined names
 60 | ###
 61 | 
 62 | ###
 63 |     "E722",  # Do not use bare `except` ## Sometimes it's useful for defensive imports and that sort of thing..
 64 |     "F811",  # Redefinition of unused  # this gets in the way of pytest fixtures (e.g. in cachew)
 65 | 
 66 | ## might be nice .. but later and I don't wanna make it strict
 67 |     "E402",  # Module level import not at top of file
 68 | 
 69 |     "RUF100",  # unused noqa -- handle later
 70 |     "RUF012",  # mutable class attrs should be annotated with ClassVar... ugh pretty annoying for user configs
 71 | 
 72 | ### these are just nitpicky, we usually know better
 73 |     "PLR0911",  # too many return statements
 74 |     "PLR0912",  # too many branches
 75 |     "PLR0913",  # too many function arguments
 76 |     "PLR0915",  # too many statements
 77 |     "PLR1714",  # consider merging multiple comparisons
 78 |     "PLR2044",  # line with empty comment
 79 |     "PLR5501",  # use elif instead of else if
 80 |     "PLR2004",  # magic value in comparison -- super annoying in tests
 81 | ###
 82 |     "PLR0402",  # import X.Y as Y -- TODO maybe consider enabling it, but double check
 83 | 
 84 |     "B009",  # calling getattr with constant attribute -- this is useful to convince mypy
 85 |     "B010",  # same as above, but setattr
 86 |     "B011",  # complains about assert False
 87 |     "B017",  # pytest.raises(Exception)
 88 |     "B023",  # seems to result in false positives?
 89 |     "B028",  # suggest using explicit stacklevel? TODO double check later, but not sure it's useful
 90 | 
 91 |     # complains about useless pass, but has sort of a false positive if the function has a docstring?
 92 |     # this is common for click entrypoints (e.g. in __main__), so disable
 93 |     "PIE790",
 94 | 
 95 |     # a bit too annoying, offers to convert for loops to list comprehensions,
 96 |     # which may hurt readability
 97 |     "PERF401",
 98 | 
 99 |     # suggests not using try/except in for loops
100 |     # we do use this technique a lot, plus in 3.11 happy path exception handling is "zero-cost"
101 |     "PERF203",
102 | 
103 |     "RET504", # unnecessary assignment before returning -- that can be useful for readability
104 |     "RET505", # unnecessary else after return -- can hurt readability
105 | 
106 |     "PLW0603",  # global variable update.. we usually know why we are doing this
107 |     "PLW2901",  # for loop variable overwritten, usually this is intentional
108 | 
109 |     "PT011",  # pytest.raises() is too broad
110 |     "PT012",  # pytest raises should contain a single statement
111 | 
112 |     "COM812",  # trailing comma missing -- mostly just being annoying with long multiline strings
113 | 
114 |     "PD901",   # generic variable name df
115 | 
116 |     "TRY003",  # suggests defining exception messages in exception class -- kinda annoying
117 |     "TRY004",  # prefer TypeError -- don't see the point
118 |     "TRY201",  # raise without specifying exception name -- sometimes hurts readability
119 |     "TRY400",  # TODO double check this, might be useful
120 |     "TRY401",  # redundant exception in logging.exception call? TODO double check, might result in excessive logging
121 | 
122 |     "PGH",  # TODO force error code in mypy instead? although it also has blanket noqa rule
123 | 
124 |     "TID252",  # Prefer absolute imports over relative imports from parent modules
125 | 
126 |     "UP038",  # suggests using | (union) in isinstance checks.. but it results in slower code
127 | 
128 |     ## too annoying
129 |     "T20",     # just complains about prints and pprints
130 |     "Q",       # flake quotes, too annoying
131 |     "C90",     # some complexity checking
132 |     "G004",    # logging statement uses f string
133 |     "ERA001",  # commented out code
134 |     "SLF001",  # private member accessed
135 |     "BLE001",  # do not catch 'blind' Exception
136 |     "INP001",  # complains about implicit namespace packages
137 |     "SIM",     # some if statements crap
138 |     "RSE102",  # complains about missing parens in exceptions
139 |     ##
140 | 
141 |     "PLC0415", # "imports should be at the top level" -- not realistic
142 | ]
143 | 
144 | 
145 | extend-exclude = [
146 | ]
147 | 


--------------------------------------------------------------------------------