├── src └── goodrexport │ ├── py.typed │ ├── dal.py │ └── export.py ├── .gitmodules ├── mypy.ini ├── pytest.ini ├── .ci └── run ├── LICENSE ├── pyproject.toml ├── tox.ini ├── conftest.py ├── README.org ├── .gitignore ├── .github └── workflows │ └── main.yml └── ruff.toml /src/goodrexport/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "src/goodrexport/exporthelpers"] 2 | path = src/goodrexport/exporthelpers 3 | url = https://github.com/karlicoss/exporthelpers.git 4 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | pretty = True 3 | show_error_context = True 4 | show_column_numbers = True 5 | show_error_end = True 6 | 7 | check_untyped_defs = True 8 | 9 | # see https://mypy.readthedocs.io/en/stable/error_code_list2.html 10 | warn_redundant_casts = True 11 | strict_equality = True 12 | warn_unused_ignores = True 13 | enable_error_code = deprecated,redundant-expr,possibly-undefined,truthy-bool,truthy-iterable,ignore-without-code,unused-awaitable 14 | 15 | 16 | # an example of suppressing 17 | # [mypy-my.config.repos.pdfannots.pdfannots] 18 | # ignore_errors = True 19 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | # discover files that don't follow test_ naming. Useful to keep tests along with the source code 3 | python_files = *.py 4 | 5 | # this setting only impacts package/module naming under pytest, not the discovery 6 | consider_namespace_packages = true 7 | 8 | addopts = 9 | # prevent pytest cache from being created... 
it craps into project dir and I never use it anyway 10 | -p no:cacheprovider 11 | 12 | # -rap to print tests summary even when they are successful 13 | -rap 14 | --verbose 15 | 16 | # otherwise it won't discover doctests 17 | --doctest-modules 18 | 19 | # show all test durations (unless they are too short) 20 | --durations=0 21 | -------------------------------------------------------------------------------- /.ci/run: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eu 3 | 4 | cd "$(dirname "$0")" 5 | cd .. # git root 6 | 7 | if ! command -v sudo; then 8 | # CI or Docker sometimes doesn't have it, so useful to have a dummy 9 | function sudo { 10 | "$@" 11 | } 12 | fi 13 | 14 | # --parallel-live to show outputs while it's running 15 | tox_cmd='run-parallel --parallel-live' 16 | if [ -n "${CI-}" ]; then 17 | # install OS specific stuff here 18 | case "$OSTYPE" in 19 | darwin*) 20 | # macos 21 | : 22 | ;; 23 | cygwin* | msys* | win*) 24 | # windows 25 | # ugh. parallel stuff seems super flaky under windows, some random failures, "file used by other process" and crap like that 26 | tox_cmd='run' 27 | ;; 28 | *) 29 | # must be linux? 
30 | : 31 | ;; 32 | esac 33 | fi 34 | 35 | # NOTE: expects uv installed 36 | uv tool run --with tox-uv tox $tox_cmd "$@" 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Dmitrii Gerasimov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # see https://github.com/karlicoss/pymplate for up-to-date reference 2 | [project] 3 | dynamic = ["version"] # version is managed by build backend 4 | name = "goodrexport" 5 | dependencies = [ 6 | "goodrexport[dal]", # TODO backwards compatibility -- remove later? 
7 | ] 8 | requires-python = ">=3.9" 9 | 10 | ## these need to be set if you're planning to upload to pypi 11 | # description = "TODO" 12 | # license = {file = "LICENSE"} 13 | # authors = [ 14 | # {name = "Dima Gerasimov (@karlicoss)", email = "karlicoss@gmail.com"}, 15 | # ] 16 | # maintainers = [ 17 | # {name = "Dima Gerasimov (@karlicoss)", email = "karlicoss@gmail.com"}, 18 | # ] 19 | # 20 | # [project.urls] 21 | # Homepage = "https://github.com/karlicoss/pymplate" 22 | ## 23 | 24 | 25 | [project.optional-dependencies] 26 | dal = ["lxml"] 27 | export = [] 28 | optional = [ 29 | "orjson", # faster json processing 30 | "colorlog", 31 | "ijson", # faster iterative json processing 32 | ] 33 | [dependency-groups] 34 | # TODO: not sure, on the one hand could just use 'standard' dev dependency group 35 | # On the other hand, it's a bit annoying that it's always included by default? 36 | # To make sure it's not included, need to use `uv run --exact --no-default-groups ...` 37 | testing = [ 38 | "pytest", 39 | "ruff", 40 | "mypy", 41 | "lxml", # for mypy html coverage 42 | "ty>=0.0.1a15", 43 | 44 | "lxml-stubs", 45 | ] 46 | 47 | 48 | [build-system] 49 | requires = ["hatchling", "hatch-vcs"] 50 | build-backend = "hatchling.build" 51 | 52 | # unfortunately have to duplicate project name here atm, see https://github.com/pypa/hatch/issues/1894 53 | [tool.hatch.build.targets.wheel] 54 | packages = ["src/goodrexport"] 55 | 56 | [tool.hatch.version] 57 | source = "vcs" 58 | 59 | [tool.hatch.version.raw-options] 60 | version_scheme = "python-simplified-semver" 61 | local_scheme = "dirty-tag" 62 | 63 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | minversion = 3.21 3 | # relies on the correct version of Python installed 4 | envlist = ruff,tests,mypy,ty 5 | # https://github.com/tox-dev/tox/issues/20#issuecomment-247788333 6 | # hack to prevent 
.tox from crapping to the project directory 7 | toxworkdir = {env:TOXWORKDIR_BASE:}{toxinidir}/.tox 8 | 9 | [testenv] 10 | # TODO how to get package name from setuptools? 11 | package_name = "goodrexport" 12 | pass_env = 13 | # useful for tests to know they are running under ci 14 | CI 15 | CI_* 16 | # respect user's cache dirs to prevent tox from crapping into project dir 17 | PYTHONPYCACHEPREFIX 18 | MYPY_CACHE_DIR 19 | RUFF_CACHE_DIR 20 | 21 | set_env = 22 | # do not add current working directory to pythonpath 23 | # generally this is more robust and safer, prevents weird issues later on 24 | PYTHONSAFEPATH=1 25 | 26 | # default is 'editable', in which tox builds wheel first for some reason? not sure if makes much sense 27 | package = uv-editable 28 | 29 | 30 | [testenv:ruff] 31 | skip_install = true 32 | dependency_groups = testing 33 | commands = 34 | {envpython} -m ruff check \ 35 | {posargs} 36 | 37 | 38 | [testenv:tests] 39 | dependency_groups = testing 40 | commands = 41 | # posargs allow test filtering, e.g. tox ... -- -k test_name 42 | {envpython} -m pytest \ 43 | --pyargs {[testenv]package_name} \ 44 | {posargs} 45 | 46 | 47 | [testenv:mypy] 48 | dependency_groups = testing 49 | extras = optional 50 | commands = 51 | {envpython} -m mypy --no-install-types \ 52 | -p {[testenv]package_name} \ 53 | --txt-report .coverage.mypy \ 54 | --html-report .coverage.mypy \ 55 | # this is for github actions to upload to codecov.io 56 | # sadly xml coverage crashes on windows... 
# Every directory that sits directly under <repo>/src is treated as a namespace package root.
root_dir = pathlib.Path(__file__).absolute().parent.resolve() / 'src'
assert root_dir.exists(), root_dir

# TODO assert it contains package name?? maybe get it via setuptools..

namespace_pkg_dirs = [str(entry) for entry in root_dir.iterdir() if entry.is_dir()]

# resolve_package_path is called from _pytest.pathlib.import_path
# it receives the absolute path of a test file and has to return the filesystem path of its 'root' package
# keep a handle on pytest's own implementation so the replacement below can defer to it
resolve_pkg_path_orig = _pytest.pathlib.resolve_package_path


def resolve_package_path(path: pathlib.Path) -> Optional[pathlib.Path]:
    """
    Return the closest ancestor of ``path`` that is one of ``namespace_pkg_dirs``.

    On Windows, a ``conftest.py`` that has no such ancestor is handed over to pytest's original resolver.
    In every other case without a match, ``RuntimeError`` is raised.
    """
    # walk from the test file upwards, nearest ancestor first
    package_root = next(
        (ancestor for ancestor in path.parents if str(ancestor) in namespace_pkg_dirs),
        None,
    )
    if package_root is not None:
        return package_root

    # ??? for some reason on windows it is trying to call this against conftest? but not on linux/osx
    if os.name == 'nt' and path.name == 'conftest.py':
        return resolve_pkg_path_orig(path)

    raise RuntimeError("Couldn't determine path for ", path)


# NOTE: seems like it's not necessary anymore?
# keeping it for now just in case
# after https://github.com/pytest-dev/pytest/pull/13426 we should be able to remove the whole conftest
# _pytest.pathlib.resolve_package_path = resolve_package_path


# without patching, the orig function returns just a package name for some reason
# (I think it's used as a sort of fallback)
# so we need to point it at the absolute path properly
# not sure what are the consequences.. maybe it wouldn't be able to run against installed packages? not sure..
search_pypath_orig = _pytest.main.search_pypath


def search_pypath(module_name: str) -> str:
    """
    Translate a dotted module name into an absolute path under ``root_dir``.

    A package resolves to its directory; anything else resolves to the matching ``.py`` file,
    which has to exist.
    """
    relative = module_name.replace('.', os.sep)
    as_package = root_dir / relative
    if as_package.is_dir():
        return str(as_package)

    as_module = as_package.with_suffix('.py')
    assert as_module.exists(), as_module  # just in case
    return str(as_module)


# install the replacement so that --pyargs lookups go through it
_pytest.main.search_pypath = search_pypath  # ty: ignore[invalid-assignment]
[[https://www.goodreads.com/api/keys][get your API keys]] 15 | 16 | * Exporting 17 | 18 | #+begin_src python :dir src :results drawer :exports results 19 | import goodrexport.export as E; return E.make_parser().epilog 20 | #+end_src 21 | 22 | #+RESULTS: 23 | :results: 24 | 25 | Usage: 26 | 27 | *Recommended*: create =secrets.py= keeping your api parameters, e.g.: 28 | 29 | 30 | : user_id = "USER_ID" 31 | : key = "KEY" 32 | 33 | 34 | After that, use: 35 | 36 | : python3 -m goodrexport.export --secrets /path/to/secrets.py 37 | 38 | That way you type less and have control over where you keep your plaintext secrets. 39 | 40 | *Alternatively*, you can pass parameters directly, e.g. 41 | 42 | : python3 -m goodrexport.export --user_id USER_ID --key KEY 43 | 44 | However, this is verbose and prone to leaking your keys/tokens/passwords in shell history. 45 | 46 | 47 | You can also import ~goodrexport.export~ as a module and call ~get_xml~ function directly to get raw XML. 48 | 49 | 50 | I *highly* recommend checking exported files at least once just to make sure they contain everything you expect from your export. If not, please feel free to ask or raise an issue! 51 | 52 | :end: 53 | 54 | * Using data 55 | 56 | #+begin_src python :dir src :results drawer :exports results 57 | import goodrexport.exporthelpers.dal_helper as D; return D.make_parser().epilog 58 | #+end_src 59 | 60 | #+RESULTS: 61 | :results: 62 | 63 | You can use =goodrexport.dal= (stands for "Data Access/Abstraction Layer") to access your exported data, even offline. 64 | I elaborate on motivation behind it [[https://beepb00p.xyz/exports.html#dal][here]]. 65 | 66 | - main usecase is to be imported as python module to allow for *programmatic access* to your data. 67 | 68 | You can find some inspiration in [[https://beepb00p.xyz/mypkg.html][=my.=]] package that I'm using as an API to all my personal data. 
69 | 70 | - to test it against your export, simply run: ~python3 -m goodrexport.dal --source /path/to/export~ 71 | 72 | - you can also try it interactively: ~python3 -m goodrexport.dal --source /path/to/export --interactive~ 73 | 74 | :end: 75 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.gitignore.io/api/python,emacs 3 | 4 | ### Emacs ### 5 | # -*- mode: gitignore; -*- 6 | *~ 7 | \#*\# 8 | /.emacs.desktop 9 | /.emacs.desktop.lock 10 | *.elc 11 | auto-save-list 12 | tramp 13 | .\#* 14 | 15 | # Org-mode 16 | .org-id-locations 17 | *_archive 18 | 19 | # flymake-mode 20 | *_flymake.* 21 | 22 | # eshell files 23 | /eshell/history 24 | /eshell/lastdir 25 | 26 | # elpa packages 27 | /elpa/ 28 | 29 | # reftex files 30 | *.rel 31 | 32 | # AUCTeX auto folder 33 | /auto/ 34 | 35 | # cask packages 36 | .cask/ 37 | dist/ 38 | 39 | # Flycheck 40 | flycheck_*.el 41 | 42 | # server auth directory 43 | /server/ 44 | 45 | # projectiles files 46 | .projectile 47 | 48 | # directory configuration 49 | .dir-locals.el 50 | 51 | ### Python ### 52 | # Byte-compiled / optimized / DLL files 53 | __pycache__/ 54 | *.py[cod] 55 | *$py.class 56 | 57 | # C extensions 58 | *.so 59 | 60 | # Distribution / packaging 61 | .Python 62 | build/ 63 | develop-eggs/ 64 | downloads/ 65 | eggs/ 66 | .eggs/ 67 | lib/ 68 | lib64/ 69 | parts/ 70 | sdist/ 71 | var/ 72 | wheels/ 73 | *.egg-info/ 74 | .installed.cfg 75 | *.egg 76 | MANIFEST 77 | 78 | # PyInstaller 79 | # Usually these files are written by a python script from a template 80 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
81 | *.manifest 82 | *.spec 83 | 84 | # Installer logs 85 | pip-log.txt 86 | pip-delete-this-directory.txt 87 | 88 | # Unit test / coverage reports 89 | htmlcov/ 90 | .tox/ 91 | .coverage 92 | .coverage.* 93 | .cache 94 | nosetests.xml 95 | coverage.xml 96 | *.cover 97 | .hypothesis/ 98 | .pytest_cache/ 99 | 100 | # Translations 101 | *.mo 102 | *.pot 103 | 104 | # Django stuff: 105 | *.log 106 | local_settings.py 107 | db.sqlite3 108 | 109 | # Flask stuff: 110 | instance/ 111 | .webassets-cache 112 | 113 | # Scrapy stuff: 114 | .scrapy 115 | 116 | # Sphinx documentation 117 | docs/_build/ 118 | 119 | # PyBuilder 120 | target/ 121 | 122 | # Jupyter Notebook 123 | .ipynb_checkpoints 124 | 125 | # IPython 126 | profile_default/ 127 | ipython_config.py 128 | 129 | # pyenv 130 | .python-version 131 | 132 | # celery beat schedule file 133 | celerybeat-schedule 134 | 135 | # SageMath parsed files 136 | *.sage.py 137 | 138 | # Environments 139 | .env 140 | .venv 141 | env/ 142 | venv/ 143 | ENV/ 144 | env.bak/ 145 | venv.bak/ 146 | 147 | # Spyder project settings 148 | .spyderproject 149 | .spyproject 150 | 151 | # Rope project settings 152 | .ropeproject 153 | 154 | # mkdocs documentation 155 | /site 156 | 157 | # mypy 158 | .mypy_cache/ 159 | .dmypy.json 160 | dmypy.json 161 | 162 | ### Python Patch ### 163 | .venv/ 164 | 165 | ### Python.VirtualEnv Stack ### 166 | # Virtualenv 167 | # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ 168 | [Bb]in 169 | [Ii]nclude 170 | [Ll]ib 171 | [Ll]ib64 172 | [Ll]ocal 173 | [Ss]cripts 174 | pyvenv.cfg 175 | pip-selfcheck.json 176 | 177 | 178 | # End of https://www.gitignore.io/api/python,emacs 179 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | # see https://github.com/karlicoss/pymplate for up-to-date reference 2 | 3 | name: CI 4 | on: 5 | push: 6 | branches: '*' 7 | tags: 
'v[0-9]+.*' # only trigger on 'release' tags for PyPi 8 | # Ideally I would put this in the pypi job... but github syntax doesn't allow for regexes there :shrug: 9 | 10 | # Needed to trigger on others' PRs. 11 | # Note that people who fork it need to go to "Actions" tab on their fork and click "I understand my workflows, go ahead and enable them". 12 | pull_request: 13 | 14 | # Needed to trigger workflows manually. 15 | workflow_dispatch: 16 | inputs: 17 | debug_enabled: 18 | type: boolean 19 | description: 'Run the build with tmate debugging enabled (https://github.com/marketplace/actions/debugging-with-tmate)' 20 | required: false 21 | default: false 22 | 23 | schedule: 24 | - cron: '31 18 * * 5' # run every Friday 25 | 26 | 27 | jobs: 28 | build: 29 | strategy: 30 | fail-fast: false 31 | matrix: 32 | platform: [ubuntu-latest, macos-latest, windows-latest] 33 | python-version: ['3.9', '3.10', '3.11', '3.12', '3.13'] 34 | exclude: [ 35 | # windows runners are pretty scarce, so let's only run lowest and highest python version 36 | {platform: windows-latest, python-version: '3.10'}, 37 | {platform: windows-latest, python-version: '3.11'}, 38 | {platform: windows-latest, python-version: '3.12'}, 39 | 40 | # same, macos is a bit too slow and ubuntu covers python quirks well 41 | {platform: macos-latest , python-version: '3.10'}, 42 | {platform: macos-latest , python-version: '3.11'}, 43 | {platform: macos-latest , python-version: '3.12'}, 44 | ] 45 | 46 | runs-on: ${{ matrix.platform }} 47 | 48 | # useful for 'optional' pipelines 49 | # continue-on-error: ${{ matrix.platform == 'windows-latest' }} 50 | 51 | steps: 52 | # ugh https://github.com/actions/toolkit/blob/main/docs/commands.md#path-manipulation 53 | - run: echo "$HOME/.local/bin" >> $GITHUB_PATH 54 | 55 | - uses: actions/checkout@v4 56 | with: 57 | submodules: recursive 58 | fetch-depth: 0 # nicer to have all git history when debugging/for tests 59 | 60 | - uses: actions/setup-python@v5 61 | with: 62 | 
class Book(NamedTuple):
    """Book-related data extracted from one ``<review>`` entry of the export."""

    # text of the <id> element nested under <book>
    id: str
    title: str
    # author names, taken from authors/author/name under <book>
    authors: Sequence[str]
    # 'name' attribute of every shelves/shelf element of the review
    shelves: Sequence[str]
    date_added: datetime_aware
    # None when the review has no started_at / read_at value
    date_started: Optional[datetime_aware]
    date_read: Optional[datetime_aware]


class Review(NamedTuple):
    """A single review entry together with the book it refers to."""

    # text of the review's own <id> element
    id: str
    book: Book


def _parse_date(s: Optional[str]) -> Optional[datetime_aware]:
    """
    Parse a timestamp formatted like ``Mon Jan 06 10:00:00 -0800 2020``.

    Returns None when ``s`` is None, otherwise a timezone-aware datetime.
    Raises ValueError (from ``strptime``) if the string does not match the format.
    """
    if s is None:
        return None
    res = datetime.strptime(s, "%a %b %d %H:%M:%S %z %Y")
    assert res.tzinfo is not None
    return res


def _parse_review(r) -> Review:
    """
    Convert one ``<review>`` lxml element into a :class:`Review`.

    ``the`` (from exporthelpers) is used wherever exactly one match is expected.
    """
    be = the(r.xpath('book'))

    # take the *text* of the review's <id>, so that Review.id is a str as declared (not an lxml element)
    rid = the(r.xpath('id/text()'))
    # the book id lives under <book>; reading it from the review node would just duplicate the review id
    bid = the(be.xpath('id/text()'))

    title = the(be.xpath('title/text()'))
    authors = be.xpath('authors/author/name/text()')
    shelves = [s.attrib['name'] for s in r.xpath('shelves/shelf')]

    # TODO isbn/isbn13 are present on the book element (possibly with nil="true") but are not extracted yet

    date_added = _parse_date(the(r.xpath('date_added/text()')))
    assert date_added is not None

    # these two are optional: an absent text node means the date is unknown
    started = r.xpath('started_at/text()')
    finished = r.xpath('read_at/text()')
    started_at = the(started) if len(started) > 0 else None
    read_at = the(finished) if len(finished) > 0 else None

    book = Book(
        id=bid,
        title=title,
        authors=authors,
        shelves=shelves,
        date_added=date_added,
        date_started=_parse_date(started_at),
        date_read=_parse_date(read_at),
    )
    return Review(
        id=rid,
        book=book,
    )


class DAL:
    """Read access to exported Goodreads XML files."""

    def __init__(self, sources: Sequence[PathIsh]) -> None:
        """Remember all ``sources``; only the greatest path (via ``max``) is actually read."""
        self.sources = list(map(pathify, sources))
        # TODO take all sources into the account?
        self._source = max(self.sources)

    def reviews(self) -> Iterator[Review]:
        """Yield a :class:`Review` for every ``<review>`` element found anywhere in the selected export."""
        tree = ET.fromstring(self._source.read_text())
        rxml = tree.xpath('//review')
        for r in rxml:  # type: ignore[union-attr]
            yield _parse_review(r)


def demo(dal: DAL) -> None:
    """Print all books ordered by the date they were read; books without that date come first."""
    print("Your books:")

    # aware stand-in for a missing date, so that None never gets compared with a datetime
    mindt = datetime.min.replace(tzinfo=timezone.utc)
    reviews = sorted(dal.reviews(), key=lambda r: r.book.date_read or mindt)
    for r in reviews:
        print(r.book.date_read, r.book.title)


def main() -> None:
    """Command line entry point, delegating to the shared ``dal_helper`` runner."""
    dal_helper.main(DAL=DAL, demo=demo)


if __name__ == '__main__':
    main()
class Exporter:
    """Downloads the reviews of one Goodreads user and renders them as a single XML document."""

    def __init__(self, *args, **kwargs) -> None:  # noqa: ARG002
        """
        Requires the keyword arguments ``user_id`` and ``key`` (KeyError if either is missing).
        Positional arguments are accepted but ignored.
        """
        self.base_url = 'https://www.goodreads.com/'
        self.user_id = kwargs['user_id']
        self.key = kwargs['key']
        self.per_page = 200

    # apparently no json... https://www.goodreads.com/topic/show/1663342-json-endpoints
    def _get(self, endpoint: str, **kwargs):
        """
        Fetch every page of ``endpoint`` and return the collected ``<review>`` minidom elements.

        Extra keyword arguments are sent as additional query parameters.
        Paging continues until the number of collected reviews reaches the ``total`` attribute
        reported on the ``<reviews>`` element, or until a page comes back without any review.
        """
        current_page = 1
        total = None

        results = []  # type: ignore[var-annotated]
        while total is None or len(results) < total:
            query = urlencode(
                [
                    ('v', '2'),
                    ('key', self.key),
                    ('per_page', self.per_page),
                    ('page', current_page),
                    *kwargs.items(),
                ]
            )
            url = self.base_url + endpoint + '.xml?' + query
            # make sure the HTTP response is closed as soon as the page has been parsed
            with urlopen(url) as response:
                chunk = parse(response)

            [curr] = chunk.getElementsByTagName('reviews')
            total = int(curr.getAttribute('total'))
            page_reviews = curr.getElementsByTagName('review')
            if not page_reviews:
                # nothing new on this page even though 'total' claims there is more:
                # stop here, otherwise we'd keep requesting pages forever
                break
            results.extend(page_reviews)
            current_page += 1
        return results

    def export_xml(self) -> str:
        """
        Return the whole export as one XML string.

        The reviews are wrapped in a ``<reviews>`` element, which in turn is wrapped in a single
        ``<export>`` root element, so the result is a well-formed document.
        """
        nodes = []
        for node_name, endpoint in [
            ## TODO looks like friends require oauth..
            # 'friend/user/' + self.user,
            # https://gist.github.com/gpiancastelli/537923
            ##
            ## TODO shelves are a mess too...
            # 'shelf/list',
            #
            ('reviews', 'review/list'),
        ]:
            results = self._get(endpoint, id=self.user_id)
            body = ''.join(x.toprettyxml() for x in results)
            # eh, not sure why toprettyxml adds so many newlines.. whatever
            # NOTE: the opening tag has to be closed again, otherwise the output can't be parsed back
            nodes.append(
                dedent(f'''
                <{node_name}>
                {body}
                </{node_name}>
                ''')
            )
        nodess = ''.join(nodes)
        # a single root element keeps the document well-formed regardless of how many nodes there are
        return dedent(f'''
        <export>
        {nodess}
        </export>
        ''')


def get_xml(**params):
    """Build an :class:`Exporter` from ``params`` and return its XML export as a string."""
    return Exporter(**params).export_xml()


def make_parser() -> argparse.ArgumentParser:
    """Create the command line parser, with the ``user_id`` and ``key`` parameters registered via ``setup_parser``."""
    parser = Parser('Export/takeout for your personal Goodreads data')
    setup_parser(
        parser,
        params=['user_id', 'key'],
        # TODO not sure if worth automating?
        extra_usage='''
You can also import ~goodrexport.export~ as a module and call ~get_xml~ function directly to get raw XML.
        ''',
    )
    return parser


def main() -> None:
    """Parse the command line, run the export and pass the resulting XML to the configured dumper."""
    parser = make_parser()
    args = parser.parse_args()

    params = args.params
    dumper = args.dumper

    x = get_xml(**params)
    dumper(x)


if __name__ == '__main__':
    main()
34 | ] 35 | 36 | # Preserve types, even if a file imports `from __future__ import annotations` 37 | # we need this for cachew to work with HPI types on 3.9 38 | # can probably remove after 3.10? 39 | lint.pyupgrade.keep-runtime-typing = true 40 | 41 | lint.ignore = [ 42 | "D", # annoying nags about docstrings 43 | "N", # pep naming 44 | "TCH", # type checking rules, mostly just suggests moving imports under TYPE_CHECKING 45 | "S", # bandit (security checks) -- tends to be not very useful, lots of nitpicks 46 | "DTZ", # datetimes checks -- complaining about missing tz and mostly false positives 47 | "FIX", # complains about fixmes/todos -- annoying 48 | "TD", # complains about todo formatting -- too annoying 49 | "ANN", # missing type annotations? seems way to strict though 50 | "EM" , # suggests assigning all exception messages into a variable first... pretty annoying 51 | 52 | ### too opinionated style checks 53 | "E501", # too long lines 54 | "E702", # Multiple statements on one line (semicolon) 55 | "E731", # assigning lambda instead of using def 56 | "E741", # Ambiguous variable name: `l` 57 | "E742", # Ambiguous class name: `O 58 | "E401", # Multiple imports on one line 59 | "F403", # import *` used; unable to detect undefined names 60 | ### 61 | 62 | ### 63 | "E722", # Do not use bare `except` ## Sometimes it's useful for defensive imports and that sort of thing.. 64 | "F811", # Redefinition of unused # this gets in the way of pytest fixtures (e.g. in cachew) 65 | 66 | ## might be nice .. but later and I don't wanna make it strict 67 | "E402", # Module level import not at top of file 68 | 69 | "RUF100", # unused noqa -- handle later 70 | "RUF012", # mutable class attrs should be annotated with ClassVar... 
ugh pretty annoying for user configs 71 | 72 | ### these are just nitpicky, we usually know better 73 | "PLR0911", # too many return statements 74 | "PLR0912", # too many branches 75 | "PLR0913", # too many function arguments 76 | "PLR0915", # too many statements 77 | "PLR1714", # consider merging multiple comparisons 78 | "PLR2044", # line with empty comment 79 | "PLR5501", # use elif instead of else if 80 | "PLR2004", # magic value in comparison -- super annoying in tests 81 | ### 82 | "PLR0402", # import X.Y as Y -- TODO maybe consider enabling it, but double check 83 | 84 | "B009", # calling getattr with constant attribute -- this is useful to convince mypy 85 | "B010", # same as above, but setattr 86 | "B011", # complains about assert False 87 | "B017", # pytest.raises(Exception) 88 | "B023", # seems to result in false positives? 89 | "B028", # suggest using explicit stacklevel? TODO double check later, but not sure it's useful 90 | 91 | # complains about useless pass, but has sort of a false positive if the function has a docstring? 92 | # this is common for click entrypoints (e.g. in __main__), so disable 93 | "PIE790", 94 | 95 | # a bit too annoying, offers to convert for loops to list comprehension 96 | # , which may hurt readability 97 | "PERF401", 98 | 99 | # suggests not using try/except in for loops 100 | # we do use this technique a lot, plus in 3.11 happy path exception handling is "zero-cost" 101 | "PERF203", 102 | 103 | "RET504", # unnecessary assignment before returning -- that can be useful for readability 104 | "RET505", # unnecessary else after return -- can hurt readability 105 | 106 | "PLW0603", # global variable update.. 
we usually know why we are doing this 107 | "PLW2901", # for loop variable overwritten, usually this is intentional 108 | 109 | "PT011", # pytest.raises is too broad 110 | "PT012", # pytest raises should contain a single statement 111 | 112 | "COM812", # trailing comma missing -- mostly just being annoying with long multiline strings 113 | 114 | "PD901", # generic variable name df 115 | 116 | "TRY003", # suggests defining exception messages in exception class -- kinda annoying 117 | "TRY004", # prefer TypeError -- don't see the point 118 | "TRY201", # raise without specifying exception name -- sometimes hurts readability 119 | "TRY400", # TODO double check this, might be useful 120 | "TRY401", # redundant exception in logging.exception call? TODO double check, might result in excessive logging 121 | 122 | "PGH", # TODO force error code in mypy instead? although it also has blanket noqa rule 123 | 124 | "TID252", # Prefer absolute imports over relative imports from parent modules 125 | 126 | "UP038", # suggests using | (union) in isinstance checks.. but it results in slower code 127 | 128 | ## too annoying 129 | "T20", # just complains about prints and pprints 130 | "Q", # flake quotes, too annoying 131 | "C90", # some complexity checking 132 | "G004", # logging statement uses f string 133 | "ERA001", # commented out code 134 | "SLF001", # private member accessed 135 | "BLE001", # do not catch 'blind' Exception 136 | "INP001", # complains about implicit namespace packages 137 | "SIM", # some if statements crap 138 | "RSE102", # complains about missing parens in exceptions 139 | ## 140 | 141 | "PLC0415", # "imports should be at the top level" -- not realistic 142 | ] 143 | 144 | 145 | extend-exclude = [ 146 | ] 147 | --------------------------------------------------------------------------------