├── .gitignore
├── .python-version
├── CHANGELOG.md
├── LICENSE
├── README.md
├── justfile
├── pyproject.toml
├── reddit_user_to_sqlite
    ├── __init__.py
    ├── cli.py
    ├── csv_helpers.py
    ├── helpers.py
    ├── reddit_api.py
    └── sqlite_helpers.py
├── tests
    ├── __init__.py
    ├── conftest.py
    ├── test_cli.py
    ├── test_csv_helpers.py
    ├── test_helpers.py
    ├── test_reddit_api.py
    └── test_sqlite_helpers.py
└── tox.ini


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Created by https://www.toptal.com/developers/gitignore/api/python
  2 | # Edit at https://www.toptal.com/developers/gitignore?templates=python
  3 | 
  4 | ### Python ###
  5 | # Byte-compiled / optimized / DLL files
  6 | __pycache__/
  7 | *.py[cod]
  8 | *$py.class
  9 | 
 10 | # C extensions
 11 | *.so
 12 | 
 13 | # Distribution / packaging
 14 | .Python
 15 | build/
 16 | develop-eggs/
 17 | dist/
 18 | downloads/
 19 | eggs/
 20 | .eggs/
 21 | lib/
 22 | lib64/
 23 | parts/
 24 | sdist/
 25 | var/
 26 | wheels/
 27 | share/python-wheels/
 28 | *.egg-info/
 29 | .installed.cfg
 30 | *.egg
 31 | MANIFEST
 32 | 
 33 | # PyInstaller
 34 | #  Usually these files are written by a python script from a template
 35 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 36 | *.manifest
 37 | *.spec
 38 | 
 39 | # Installer logs
 40 | pip-log.txt
 41 | pip-delete-this-directory.txt
 42 | 
 43 | # Unit test / coverage reports
 44 | htmlcov/
 45 | .tox/
 46 | .nox/
 47 | .coverage
 48 | .coverage.*
 49 | .cache
 50 | nosetests.xml
 51 | coverage.xml
 52 | *.cover
 53 | *.py,cover
 54 | .hypothesis/
 55 | .pytest_cache/
 56 | cover/
 57 | 
 58 | # Translations
 59 | *.mo
 60 | *.pot
 61 | 
 62 | # Django stuff:
 63 | *.log
 64 | local_settings.py
 65 | db.sqlite3
 66 | db.sqlite3-journal
 67 | 
 68 | # Flask stuff:
 69 | instance/
 70 | .webassets-cache
 71 | 
 72 | # Scrapy stuff:
 73 | .scrapy
 74 | 
 75 | # Sphinx documentation
 76 | docs/_build/
 77 | 
 78 | # PyBuilder
 79 | .pybuilder/
 80 | target/
 81 | 
 82 | # Jupyter Notebook
 83 | .ipynb_checkpoints
 84 | 
 85 | # IPython
 86 | profile_default/
 87 | ipython_config.py
 88 | 
 89 | # pyenv
 90 | #   For a library or package, you might want to ignore these files since the code is
 91 | #   intended to run in multiple environments; otherwise, check them in:
 92 | # .python-version
 93 | 
 94 | # pipenv
 95 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 96 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 97 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 98 | #   install all needed dependencies.
 99 | #Pipfile.lock
100 | 
101 | # poetry
102 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
103 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
104 | #   commonly ignored for libraries.
105 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
106 | #poetry.lock
107 | 
108 | # pdm
109 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
110 | #pdm.lock
111 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
112 | #   in version control.
113 | #   https://pdm.fming.dev/#use-with-ide
114 | .pdm.toml
115 | 
116 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
117 | __pypackages__/
118 | 
119 | # Celery stuff
120 | celerybeat-schedule
121 | celerybeat.pid
122 | 
123 | # SageMath parsed files
124 | *.sage.py
125 | 
126 | # Environments
127 | .env
128 | .venv
129 | env/
130 | venv/
131 | ENV/
132 | env.bak/
133 | venv.bak/
134 | 
135 | # Spyder project settings
136 | .spyderproject
137 | .spyproject
138 | 
139 | # Rope project settings
140 | .ropeproject
141 | 
142 | # mkdocs documentation
143 | /site
144 | 
145 | # mypy
146 | .mypy_cache/
147 | .dmypy.json
148 | dmypy.json
149 | 
150 | # Pyre type checker
151 | .pyre/
152 | 
153 | # pytype static type analyzer
154 | .pytype/
155 | 
156 | # Cython debug symbols
157 | cython_debug/
158 | 
159 | # PyCharm
160 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
161 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
162 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
163 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
164 | #.idea/
165 | 
166 | ### Python Patch ###
167 | # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
168 | poetry.toml
169 | 
170 | # ruff
171 | .ruff_cache/
172 | 
173 | # LSP config files
174 | pyrightconfig.json
175 | 
176 | # End of https://www.toptal.com/developers/gitignore/api/python
177 | 
178 | *.db
179 | config.json
180 | metadata.json
181 | 
182 | launch.json
183 | 


--------------------------------------------------------------------------------
/.python-version:
--------------------------------------------------------------------------------
1 | 3.11.0
2 | 3.10.0
3 | 3.9.16
4 | 3.8.16
5 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # Changelog
 2 | 
 3 | This project uses [SemVer](https://semver.org/) for versioning. Its public APIs, runtime support, and documented file locations won't change incompatibly outside of major versions (once version 1.0.0 has been released). There may be breaking schema changes in minor releases before 1.0.0 and will be noted in these release notes.
 4 | 
 5 | ## 0.4.2
 6 | 
 7 | _released `2023-07-22`_
 8 | 
 9 | - handle [new rate limiting](https://support.reddithelp.com/hc/en-us/articles/16160319875092-Reddit-Data-API-Wiki) more gracefully (fixes [#23](https://github.com/xavdid/reddit-user-to-sqlite/issues/23) via [#24](https://github.com/xavdid/reddit-user-to-sqlite/pull/24) (by [@piyh](https://github.com/piyh)) and [#25](https://github.com/xavdid/reddit-user-to-sqlite/pull/25))
10 | 
11 | ## 0.4.1
12 | 
13 | _released `2023-06-25`_
14 | 
15 | - specify `utf-8` as the default character encoding, improving windows compatibility (fixes [#10](https://github.com/xavdid/reddit-user-to-sqlite/issues/10))
16 | 
17 | ## 0.4.0
18 | 
19 | _released `2023-06-14`_
20 | 
21 | - the `archive` command includes saved posts / comments by default (in their own table). Use the `--skip-saved` flag to opt out of this behavior ([#16](https://github.com/xavdid/reddit-user-to-sqlite/pull/16))
22 | - add support for Python 3.9, verified using `tox` ([#19](https://github.com/xavdid/reddit-user-to-sqlite/pull/19))
23 | - add `num_awards` column to comments (was omitted by accident) ([#18](https://github.com/xavdid/reddit-user-to-sqlite/pull/18))
24 | - added support for disabling the progress bars via the `DISABLE_PROGRESS` env var. Set it to `1` to disable progress bars ([#16](https://github.com/xavdid/reddit-user-to-sqlite/pull/16))
25 | 
26 | ## 0.3.1
27 | 
28 | _released `2023-06-09`_
29 | 
30 | - remove dependency on 3.11 by adding `typing-extensions` ([#3](https://github.com/xavdid/reddit-user-to-sqlite/pull/3) by [@changlinli](https://github.com/changlinli))
31 | 
32 | ## 0.3.0
33 | 
34 | _released `2023-05-23`_
35 | 
36 | - adds the `archive` command, which loads data from a Reddit GDPR archive ([#1](https://github.com/xavdid/reddit-user-to-sqlite/pull/1))
37 | - added more help text to both commands
38 | - provide more info about the counts of comments/posts saved/updated
39 | 
40 | ## 0.2.0
41 | 
42 | _released `2023-05-07`_
43 | 
44 | - improves the `user` command to also fetch submitted posts and store them in a corresponding `posts` table.
45 | 
46 | ## 0.1.0
47 | 
48 | _released `2023-05-06`_
49 | 
50 | - Initial public release!
51 | - Adds the `user` command, which currently only fetches comments
52 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 David Brownman
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # reddit-user-to-sqlite
  2 | 
  3 | Stores all the content from a specific user in a SQLite database. This includes their comments and their posts.
  4 | 
  5 | ## Install
  6 | 
  7 | The best way to install the package is by using [pipx](https://pypa.github.io/pipx/):
  8 | 
  9 | ```bash
 10 | pipx install reddit-user-to-sqlite
 11 | ```
 12 | 
 13 | It's also available via [brew](https://brew.sh/):
 14 | 
 15 | ```bash
 16 | brew install xavdid/projects/reddit-user-to-sqlite
 17 | ```
 18 | 
 19 | ## Usage
 20 | 
 21 | The CLI currently exposes two commands: `user` and `archive`. They allow you to archive recent comments/posts from the API or _all_ posts (as read from a CSV file).
 22 | 
 23 | ### user
 24 | 
 25 | Fetches all comments and posts for a specific user.
 26 | 
 27 | ```bash
 28 | reddit-user-to-sqlite user your_username
 29 | reddit-user-to-sqlite user your_username --db my-reddit-data.db
 30 | ```
 31 | 
 32 | #### Params
 33 | 
 34 | > Note: the argument order is reversed from most dogsheep packages (which take db_path first). This method allows for use of a default db name, so I prefer it.
 35 | 
 36 | 1. `username`: a case-insensitive string. The leading `/u/` is optional (and ignored if supplied).
 37 | 2. (optional) `--db`: the path to a sqlite file, which will be created or updated as needed. Defaults to `reddit.db`.
 38 | 
 39 | ### archive
 40 | 
 41 | Reads the output of a [Reddit GDPR archive](https://support.reddithelp.com/hc/en-us/articles/360043048352-How-do-I-request-a-copy-of-my-Reddit-data-and-information-) and fetches additional info from the Reddit API (where possible). This allows you to store more than 1k posts/comments.
 42 | 
 43 | > FYI: this behavior is built with the assumption that the archive that Reddit provides has the same format regardless of if you select `GDPR` or `CCPA` as the request type. But, just to be on the safe side, I recommend selecting `GDPR` during the export process until I'm able to confirm.
 44 | 
 45 | #### Params
 46 | 
 47 | > Note: the argument order is reversed from most dogsheep packages (which take db_path first). This method allows for use of a default db name, so I prefer it.
 48 | 
 49 | 1. `archive_path`: the path to the (unzipped) archive directory on your machine. Don't rename/move the files that Reddit gives you.
 50 | 2. (optional) `--db`: the path to a sqlite file, which will be created or updated as needed. Defaults to `reddit.db`.
 51 | 3. (optional) `--skip-saved`: a flag for skipping the inclusion of loading saved comments/posts from the archive.
 52 | 
 53 | ## Viewing Data
 54 | 
 55 | The resulting SQLite database pairs well with [Datasette](https://datasette.io/), a tool for viewing SQLite in the web. Below is my recommended configuration.
 56 | 
 57 | First, install `datasette`:
 58 | 
 59 | ```bash
 60 | pipx install datasette
 61 | ```
 62 | 
 63 | Then, add the recommended plugins (for rendering timestamps and markdown):
 64 | 
 65 | ```bash
 66 | pipx inject datasette datasette-render-markdown datasette-render-timestamps
 67 | ```
 68 | 
 69 | Finally, create a `metadata.json` file next to your `reddit.db` with the following:
 70 | 
 71 | ```json
 72 | {
 73 |   "databases": {
 74 |     "reddit": {
 75 |       "tables": {
 76 |         "comments": {
 77 |           "sort_desc": "timestamp",
 78 |           "plugins": {
 79 |             "datasette-render-markdown": {
 80 |               "columns": ["text"]
 81 |             },
 82 |             "datasette-render-timestamps": {
 83 |               "columns": ["timestamp"]
 84 |             }
 85 |           }
 86 |         },
 87 |         "posts": {
 88 |           "sort_desc": "timestamp",
 89 |           "plugins": {
 90 |             "datasette-render-markdown": {
 91 |               "columns": ["text"]
 92 |             },
 93 |             "datasette-render-timestamps": {
 94 |               "columns": ["timestamp"]
 95 |             }
 96 |           }
 97 |         },
 98 |         "subreddits": {
 99 |           "sort": "name"
100 |         }
101 |       }
102 |     }
103 |   }
104 | }
105 | ```
106 | 
107 | Now when you run
108 | 
109 | ```bash
110 | datasette reddit.db --metadata metadata.json
111 | ```
112 | 
113 | You'll get a nice, formatted output:
114 | 
115 | ![](https://cdn.zappy.app/93b1760ab541a8b68c2ee2899be5e079.png)
116 | 
117 | ![](https://cdn.zappy.app/5850a782196d1c7a83a054400c0a5dc4.png)
118 | 
119 | ## Motivation
120 | 
121 | I got nervous when I saw Reddit's [notification of upcoming API changes](https://old.reddit.com/r/reddit/comments/12qwagm/an_update_regarding_reddits_api/). To ensure I could always access data I created, I wanted to make sure I had a backup in place before anything changed in a big way.
122 | 
123 | ## FAQs
124 | 
125 | ### Why does this post only show 1k recent comments / posts?
126 | 
127 | Reddit's paging API only shows 1000 items (page 11 is an empty list). If you have more comments (or posts) than than that, you can use the [GDPR archive import feature](#archive) feature to backfill your older data.
128 | 
129 | ### Why are my longer posts truncated in Datasette?
130 | 
131 | Datasette truncates long text fields by default. You can disable this behavior by using the `truncate_cells_html` flag when running `datasette` ([see the docs](https://docs.datasette.io/en/stable/settings.html#truncate-cells-html)):
132 | 
133 | ```shell
134 | datasette reddit.db --setting truncate_cells_html 0
135 | ```
136 | 
137 | ### How do I store a username that starts with `-`?
138 | 
139 | By default, [click](https://click.palletsprojects.com/en/8.1.x/) (the argument parser this uses) interprets leading dashes on argument as a flag. If you're fetching data for user `-asdf`, you'll get an error saying `Error: No such option: -a`. To ensure the last argument is interpreted positionally, put it after a `--`:
140 | 
141 | ```shell
142 | reddit-user-to-sqlite user -- -asdf
143 | ```
144 | 
145 | ### Why do some of my posts say `[removed]` even though I can see them on the web?
146 | 
147 | If a post is removed, only the mods and the user who posted it can see its text. Since this tool currently runs without any authentication, those removed posts can't be fetched via the API.
148 | 
149 | To load data about your own removed posts, use the [GDPR archive import feature](#archive).
150 | 
151 | ### Why is the database missing data returned by the Reddit API?
152 | 
153 | While most [Dogsheep](https://github.com/dogsheep) projects grab the raw JSON output of their source APIs, Reddit's API has a lot of junk in it. So, I opted for a slimmed down approach.
154 | 
155 | If there's a field missing that you think would be useful, feel free to open an issue!
156 | 
157 | ### Does this tool refetch old data?
158 | 
159 | When running the `user` command, yes. It fetches and updates up to 1k each of comments and posts and updates the local copy.
160 | 
161 | When running the `archive` command, no. To cut down on API requests, it only fetches data about comments/posts that aren't yet in the database (since the archive may include many items).
162 | 
163 | Both of these may change in the future to be more in line with [Reddit's per-subreddit archiving guidelines](https://www.reddit.com/r/modnews/comments/py2xy2/voting_commenting_on_archived_posts/).
164 | 
165 | ## Development
166 | 
167 | This section is people making changes to this package.
168 | 
169 | When in a virtual environment, run the following:
170 | 
171 | ```bash
172 | pip install -e '.[test]'
173 | ```
174 | 
175 | This installs the package in `--edit` mode and makes its dependencies available. You can now run `reddit-user-to-sqlite` to invoke the CLI.
176 | 
177 | ### Running Tests
178 | 
179 | In your virtual environment, a simple `pytest` should run the unit test suite. You can also run `pyright` for type checking.
180 | 
181 | ### Releasing New Versions
182 | 
183 | > these notes are mostly for myself (or other contributors)
184 | 
185 | 1. Run `just release` while your venv is active
186 | 2. paste the stored API key (If you're getting invalid password, verify that `~/.pypirc` is empty)
187 | 


--------------------------------------------------------------------------------
/justfile:
--------------------------------------------------------------------------------
 1 | _default:
 2 |     just --list
 3 | 
 4 | # error out if this isn't being run in a venv
 5 | _require-venv:
 6 |     #!/usr/bin/env python
 7 |     import sys
 8 |     sys.exit(sys.prefix == sys.base_prefix)
 9 | 
10 | # run test suite against multiple python versions
11 | @tox:
12 |     tox run-parallel
13 | 
14 | @lint:
15 |     ruff .
16 |     black --check --quiet .
17 | 
18 | # lint&fix files, useful for a pre-commit hook
19 | @lint-fix:
20 |     ruff . --fix
21 |     black --quiet .
22 | 
23 | @typecheck:
24 |     pyright -p pyproject.toml
25 | 
26 | # perform all checks, but don't change any files
27 | @validate: tox lint typecheck
28 | 
29 | @local: _require-venv validate
30 | 
31 | # run the full ci pipeline
32 | ci: && validate
33 |     pip install .[test,ci]
34 | 
35 | # useful for reinstalling after changing dependencies
36 | @reinstall: _require-venv
37 |     pip install -e .[test,ci]
38 | 
39 | @release: _require-venv validate
40 |     rm -rf dist
41 |     pip install -e .[release]
42 |     python -m build
43 |     # give upload api key at runtime
44 |     python -m twine upload --username __token__ dist/*
45 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | name = "reddit-user-to-sqlite"
 3 | version = "0.4.2"
 4 | 
 5 | authors = [{ name = "David Brownman", email = "beamneocube@gmail.com" }]
 6 | description = "Create a SQLite database containing data pulled from Reddit about a single user."
 7 | readme = "README.md"
 8 | license = { file = "LICENSE" }
 9 | 
10 | requires-python = ">=3.9"
11 | classifiers = [
12 |   "Programming Language :: Python :: 3",
13 |   "Development Status :: 3 - Alpha",
14 |   "Environment :: Console",
15 |   "License :: OSI Approved :: MIT License",
16 |   "Operating System :: OS Independent",
17 |   "Natural Language :: English",
18 | ]
19 | keywords = ["sqlite", "reddit", "dogsheep"]
20 | 
21 | dependencies = [
22 |   "sqlite-utils==3.32.1",
23 |   "click==8.1.3",
24 |   "requests==2.29.0",
25 |   "tqdm==4.65.0",
26 | ]
27 | 
28 | [project.optional-dependencies]
29 | test = ["pytest==7.3.1", "responses==0.23.1"]
30 | release = ["twine==4.0.2", "build==0.10.0"]
31 | ci = ["black==23.3.0", "pyright==1.1.318", "ruff==0.0.277"]
32 | 
33 | [project.urls]
34 | "Homepage" = "https://github.com/xavdid/reddit-user-to-sqlite"
35 | "Bug Tracker" = "https://github.com/xavdid/reddit-user-to-sqlite/issues"
36 | "Author" = "https://xavd.id"
37 | "Changelog" = "https://github.com/xavdid/reddit-user-to-sqlite/blob/main/CHANGELOG.md"
38 | 
39 | [project.scripts]
40 | reddit-user-to-sqlite = "reddit_user_to_sqlite.cli:cli"
41 | 
42 | [build-system]
43 | requires = ["flit_core>=3.4"]
44 | build-backend = "flit_core.buildapi"
45 | 
46 | # needed so the LSP performs typechecking
47 | [tool.pyright]
48 | 
49 | [tool.ruff]
50 | select = ["E", "F", "I001"] # defaults & isort
51 | ignore = ["E501"]
52 | 


--------------------------------------------------------------------------------
/reddit_user_to_sqlite/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xavdid/reddit-user-to-sqlite/e02fad746694f32ebc2ee2efce82652857682cc6/reddit_user_to_sqlite/__init__.py


--------------------------------------------------------------------------------
/reddit_user_to_sqlite/cli.py:
--------------------------------------------------------------------------------
  1 | from functools import partial
  2 | from pathlib import Path
  3 | from typing import Callable, Iterable, Optional, TypeVar, cast
  4 | 
  5 | import click
  6 | from sqlite_utils import Database
  7 | 
  8 | from reddit_user_to_sqlite.csv_helpers import (
  9 |     PrefixType,
 10 |     get_username_from_archive,
 11 |     load_unsaved_ids_from_file,
 12 | )
 13 | from reddit_user_to_sqlite.helpers import clean_username, find_user_details_from_items
 14 | from reddit_user_to_sqlite.reddit_api import (
 15 |     Comment,
 16 |     Post,
 17 |     add_missing_user_fragment,
 18 |     get_user_id,
 19 |     load_comments_for_user,
 20 |     load_info,
 21 |     load_posts_for_user,
 22 | )
 23 | from reddit_user_to_sqlite.sqlite_helpers import (
 24 |     ensure_fts,
 25 |     insert_users,
 26 |     upsert_comments,
 27 |     upsert_posts,
 28 |     upsert_subreddits,
 29 | )
 30 | 
 31 | 
 32 | @click.group()
 33 | @click.version_option()
 34 | def cli():
 35 |     "Save data from Reddit to a SQLite database"
 36 | 
 37 | 
 38 | DB_PATH_HELP = "A path to a SQLite database file. If it doesn't exist, it will be created. It can have any extension, `.db` or `.sqlite` is recommended."
 39 | DEFAULT_DB_NAME = "reddit.db"
 40 | 
 41 | DELETED_USERNAME = "__DeletedUser__"
 42 | DELETED_USER_FULLNAME = "t2_1234567"
 43 | 
 44 | T = TypeVar("T", Comment, Post)
 45 | 
 46 | 
 47 | def _save_items(
 48 |     db: Database,
 49 |     items: list[T],
 50 |     upsert_func: Callable[[Database, Iterable[T], Optional[PrefixType]], int],
 51 |     table_prefix: Optional[PrefixType] = None,
 52 | ) -> int:
 53 |     if not items:
 54 |         return 0
 55 | 
 56 |     insert_users(db, items)
 57 |     upsert_subreddits(db, items)
 58 |     return upsert_func(db, items, table_prefix)
 59 | 
 60 | 
 61 | save_comments = partial(_save_items, upsert_func=upsert_comments)
 62 | save_posts = partial(_save_items, upsert_func=upsert_posts)
 63 | 
 64 | 
 65 | def load_data_from_files(
 66 |     db: Database,
 67 |     archive_path: Path,
 68 |     own_data=True,
 69 |     tables_prefix: Optional[PrefixType] = None,
 70 | ):
 71 |     """
 72 |     if own data is true, requires a username to save. Otherwise, will add a placeholder
 73 |     (for external data)
 74 |     """
 75 |     new_comment_ids = load_unsaved_ids_from_file(
 76 |         db, archive_path, "comments", prefix=tables_prefix
 77 |     )
 78 |     click.echo(f"\nFetching info about {'your' if own_data else 'saved'} comments")
 79 |     comments = cast(list[Comment], load_info(new_comment_ids))
 80 | 
 81 |     post_ids = load_unsaved_ids_from_file(
 82 |         db, archive_path, "posts", prefix=tables_prefix
 83 |     )
 84 |     click.echo(f"\nFetching info about {'your' if own_data else 'saved'} posts")
 85 |     posts = cast(list[Post], load_info(post_ids))
 86 | 
 87 |     username = None
 88 |     user_fullname = None
 89 | 
 90 |     if own_data:
 91 |         # find the username, first from any of the loaded comments/posts
 92 |         if user_details := (
 93 |             find_user_details_from_items(comments)
 94 |             or find_user_details_from_items(posts)
 95 |         ):
 96 |             username, user_fullname = user_details
 97 |         # if all loaded posts are removed (which could be the case on subsequent runs),
 98 |         # then try to load from archive
 99 |         elif username := get_username_from_archive(archive_path):
100 |             user_fullname = f"t2_{get_user_id(username)}"
101 |         # otherwise, your posts without a username won't be saved;
102 |         # this only happens for malformed archives
103 |         else:
104 |             click.echo(
105 |                 "\nUnable to guess username from API content or archive; some data will not be saved.",
106 |                 err=True,
107 |             )
108 |     else:
109 |         username = DELETED_USERNAME
110 |         user_fullname = DELETED_USER_FULLNAME
111 | 
112 |     if username and user_fullname:
113 |         comments = add_missing_user_fragment(comments, username, user_fullname)
114 |         posts = add_missing_user_fragment(posts, username, user_fullname)
115 | 
116 |     num_comments_written = save_comments(db, comments, table_prefix=tables_prefix)
117 |     num_posts_written = save_posts(db, posts, table_prefix=tables_prefix)
118 | 
119 |     messages = [
120 |         "\nDone!",
121 |         f" - saved {num_comments_written} new comments",
122 |         f" - saved {num_posts_written} new posts",
123 |     ]
124 | 
125 |     if missing_comments := len(comments) - num_comments_written:
126 |         messages.append(
127 |             f" - failed to find {missing_comments} missing comments; ignored for now"
128 |         )
129 |     if missing_posts := len(post_ids) - num_posts_written:
130 |         messages.append(
131 |             f" - failed to find {missing_posts} missing posts; ignored for now"
132 |         )
133 | 
134 |     click.echo("\n".join(messages))
135 | 
136 | 
137 | @cli.command()
138 | @click.argument("username")
139 | @click.option(
140 |     "--db",
141 |     "db_path",
142 |     type=click.Path(file_okay=True, dir_okay=False, allow_dash=False),
143 |     default=DEFAULT_DB_NAME,
144 |     help=DB_PATH_HELP,
145 | )
146 | def user(db_path: str, username: str):
147 |     username = clean_username(username)
148 |     click.echo(f"loading data about /u/{username} into {db_path}")
149 | 
150 |     db = Database(db_path)
151 | 
152 |     click.echo("\nfetching (up to 10 pages of) comments")
153 |     comments = load_comments_for_user(username)
154 |     save_comments(db, comments)
155 |     click.echo(f"saved/updated {len(comments)} comments")
156 | 
157 |     click.echo("\nfetching (up to 10 pages of) posts")
158 |     posts = load_posts_for_user(username)
159 |     save_posts(db, posts)
160 |     click.echo(f"saved/updated {len(posts)} posts")
161 | 
162 |     if not (comments or posts):
163 |         raise click.ClickException(f"no data found for username: {username}")
164 | 
165 |     ensure_fts(db)
166 | 
167 | 
168 | @cli.command()
169 | @click.argument(
170 |     "archive_path",
171 |     type=click.Path(file_okay=False, dir_okay=True, allow_dash=False, path_type=Path),
172 | )
173 | @click.option(
174 |     "--db",
175 |     "db_path",
176 |     type=click.Path(file_okay=True, dir_okay=False, allow_dash=False),
177 |     default=DEFAULT_DB_NAME,
178 |     help=DB_PATH_HELP,
179 | )
180 | @click.option(
181 |     "--skip-saved",
182 |     is_flag=True,
183 |     default=False,
184 |     help="Skip hydrating data about your saved posts and comments.",
185 | )
186 | def archive(archive_path: Path, db_path: str, skip_saved: bool):
187 |     click.echo(f"loading data found in archive at {archive_path} into {db_path}")
188 | 
189 |     db = Database(db_path)
190 | 
191 |     load_data_from_files(db, archive_path)
192 | 
193 |     # I don't love this double negative, but it is what it is
194 |     if not skip_saved:
195 |         load_data_from_files(db, archive_path, own_data=False, tables_prefix="saved_")
196 | 
197 |     ensure_fts(db)
198 | 


--------------------------------------------------------------------------------
/reddit_user_to_sqlite/csv_helpers.py:
--------------------------------------------------------------------------------
 1 | from csv import DictReader
 2 | from pathlib import Path
 3 | from typing import Literal, Optional
 4 | 
 5 | from sqlite_utils import Database
 6 | 
 7 | ItemType = Literal["comments", "posts"]
 8 | PrefixType = Literal["saved_"]
 9 | 
10 | FULLNAME_PREFIX: dict[ItemType, str] = {
11 |     "comments": "t1",
12 |     "posts": "t3",
13 | }
14 | 
15 | 
16 | def build_table_name(
17 |     table_name: ItemType, table_prefix: Optional[PrefixType] = None
18 | ) -> str:
19 |     return f"{table_prefix or ''}{table_name}"
20 | 
21 | 
22 | def validate_and_build_path(archive_path: Path, item_type: str) -> Path:
23 |     filename = f"{item_type}.csv"
24 |     if not (file := archive_path / filename).exists():
25 |         # LOAD BEARING MESSAGE: the brew formula expects the phrase "unzipped GDPR archive folder" to be printed on error
26 |         raise ValueError(
27 |             f'Ensure path "{archive_path}" points to an unzipped Reddit GDPR archive folder; "{filename}" not found in the expected spot.'
28 |         )
29 |     return file
30 | 
31 | 
32 | def load_unsaved_ids_from_file(
33 |     db: Database,
34 |     archive_path: Path,
35 |     item_type: ItemType,
36 |     prefix: Optional[PrefixType] = None,
37 | ) -> list[str]:
38 |     filename = build_table_name(item_type, prefix)
39 |     # we save each file into a matching table
40 |     saved_ids = {row["id"] for row in db[filename].rows}
41 | 
42 |     with open(
43 |         validate_and_build_path(archive_path, filename), encoding="utf-8"
44 |     ) as archive_rows:
45 |         return [
46 |             f'{FULLNAME_PREFIX[item_type]}_{c["id"]}'
47 |             for c in DictReader(archive_rows)
48 |             if c["id"] not in saved_ids
49 |         ]
50 | 
51 | 
52 | def get_username_from_archive(archive_path: Path) -> Optional[str]:
53 |     with open(validate_and_build_path(archive_path, "statistics")) as stat_rows:
54 |         try:
55 |             return next(
56 |                 row["value"]
57 |                 for row in DictReader(stat_rows)
58 |                 if row["statistic"] == "account name"
59 |             )
60 |         except StopIteration:
61 |             pass
62 | 


--------------------------------------------------------------------------------
/reddit_user_to_sqlite/helpers.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | from itertools import islice
 3 | from typing import Iterable, Optional, TypeVar
 4 | 
 5 | T = TypeVar("T")
 6 | 
 7 | 
 8 | # https://docs.python.org/3.11/library/itertools.html#itertools-recipes
 9 | # available natively in 3.12
10 | def batched(iterable: Iterable[T], n: int) -> Iterable[tuple[T]]:
11 |     "Batch data into tuples of length n. The last batch may be shorter."
12 |     # batched('ABCDEFG', 3) --> ABC DEF G
13 |     if n < 1:
14 |         raise ValueError("n must be at least one")
15 |     it = iter(iterable)
16 |     while batch := tuple(islice(it, n)):
17 |         yield batch
18 | 
19 | 
20 | def clean_username(username: str) -> str:
21 |     """
22 |     strips the leading `/u/` off the front of a username, if present
23 |     """
24 |     if re.match(r"/?u/", username):
25 |         return username.strip().strip("/u")
26 |     return username
27 | 
28 | 
29 | def find_user_details_from_items(items) -> Optional[tuple[str, str]]:
30 |     """
31 |     Returns a 2-tuple of prefixed user_id and username if found, otherwise None
32 |     """
33 |     try:
34 |         return next(
35 |             (c["author"], c["author_fullname"]) for c in items if "author_fullname" in c
36 |         )
37 |     except StopIteration:
38 |         return None
39 | 


--------------------------------------------------------------------------------
/reddit_user_to_sqlite/reddit_api.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from typing import (
  3 |     TYPE_CHECKING,
  4 |     Any,
  5 |     Literal,
  6 |     Optional,
  7 |     Sequence,
  8 |     TypedDict,
  9 |     TypeVar,
 10 |     Union,
 11 |     cast,
 12 |     final,
 13 | )
 14 | 
 15 | import click
 16 | import requests
 17 | from tqdm import tqdm, trange
 18 | 
 19 | from reddit_user_to_sqlite.helpers import batched
 20 | 
 21 | if TYPE_CHECKING:
 22 |     from typing import NotRequired
 23 | 
 24 | USER_AGENT = "reddit-user-to-sqlite"
 25 | 
 26 | 
 27 | class SubredditFragment(TypedDict):
 28 |     ## SUBREDDIT
 29 |     # "consoledeals"
 30 |     subreddit: str
 31 |     # ID
 32 |     subreddit_id: str
 33 |     # "public"
 34 |     subreddit_type: str
 35 | 
 36 | 
 37 | class UserFragment(TypedDict):
 38 |     # comment author username
 39 |     author: str
 40 |     # comment author prefixed id
 41 |     author_fullname: "NotRequired[str]"
 42 | 
 43 | 
 44 | class Comment(SubredditFragment, UserFragment):
 45 |     # this is only the relevant fields from the response
 46 | 
 47 |     ## COMMENT
 48 |     # short ID
 49 |     id: str
 50 |     # full ID
 51 |     name: str
 52 | 
 53 |     total_awards_received: int
 54 |     gilded: int
 55 | 
 56 |     # the ID of a post or comment
 57 |     parent_id: str
 58 |     score: int
 59 | 
 60 |     # maybe always 0? or i'm just boring
 61 |     controversiality: int
 62 |     # plaintext (or markdown?)
 63 |     body: str
 64 |     body_html: str
 65 |     # is the commenter OP?
 66 |     is_submitter: bool
 67 |     # 1682464342.0,
 68 |     created: float
 69 |     # "/r/x/comments/...
 70 |     permalink: str
 71 | 
 72 |     ## POST
 73 |     # post title
 74 |     link_title: str
 75 |     num_comments: int
 76 |     # post ID
 77 |     link_id: str
 78 |     link_permalink: str
 79 |     # "r/consoledeals",
 80 |     subreddit_name_prefixed: str
 81 | 
 82 | 
 83 | class Post(SubredditFragment, UserFragment):
 84 |     # no prefix
 85 |     id: str
 86 | 
 87 |     title: str
 88 | 
 89 |     # markdown content of the post; could be empty
 90 |     selftext: str
 91 |     # external link (or self link)
 92 |     url: str
 93 |     # link to reddit thread (sans domain)
 94 |     permalink: str
 95 | 
 96 |     upvote_ratio: float
 97 |     score: int
 98 |     total_awards_received: int
 99 | 
100 |     num_comments: int
101 |     over_18: bool
102 | 
103 |     # timestamp
104 |     created: float
105 | 
106 | 
107 | # class Subreddit(TypedDict):
108 | #     should_archive_posts: bool
109 | 
110 | 
111 | @final
112 | class ResourceWrapper(TypedDict):
113 |     kind: str
114 |     data: Union[Comment, Post]
115 | 
116 | 
117 | class SuccessResponse(TypedDict):
118 |     kind: Literal["Listing", "t2"]
119 | 
120 | 
121 | @final
122 | class PagedResponseBody(TypedDict):
123 |     before: Optional[str]
124 |     after: Optional[str]
125 |     modhash: str
126 |     geo_filter: str
127 |     dist: int
128 |     children: Sequence[ResourceWrapper]
129 | 
130 | 
131 | @final
132 | class PagedResponse(SuccessResponse):
133 |     data: PagedResponseBody
134 | 
135 | 
136 | @final
137 | class UserData(TypedDict):
138 |     id: str
139 | 
140 | 
141 | @final
142 | class UserResponse(SuccessResponse):
143 |     data: UserData
144 | 
145 | 
146 | @final
147 | class ErorrResponse(TypedDict):
148 |     message: str
149 |     error: int
150 | 
151 | 
152 | ErrorHeaders = TypedDict(
153 |     "ErrorHeaders",
154 |     {
155 |         "x-ratelimit-used": str,
156 |         "x-ratelimit-remaining": str,
157 |         "x-ratelimit-reset": str,
158 |     },
159 | )
160 | 
161 | # max API page size is 100
162 | PAGE_SIZE = 100
163 | 
164 | 
165 | class RedditRateLimitException(Exception):
166 |     """
167 |     more info: https://support.reddithelp.com/hc/en-us/articles/16160319875092-Reddit-Data-API-Wiki
168 |     """
169 | 
170 |     def __init__(self, headers: ErrorHeaders) -> None:
171 |         super().__init__("Rate limited by Reddit")
172 | 
173 |         self.used = int(headers["x-ratelimit-used"])
174 |         self.remaining = int(headers["x-ratelimit-remaining"])
175 |         self.window_total = self.used + self.remaining
176 |         self.reset_after_seconds = int(headers["x-ratelimit-reset"])
177 | 
178 |     @property
179 |     def stats(self) -> str:
180 |         return f"Used {self.used}/{self.window_total} requests (resets in {self.reset_after_seconds} seconds)"
181 | 
182 | 
183 | def _unwrap_response_and_raise(response: requests.Response):
184 |     result = response.json()
185 | 
186 |     if "error" in result:
187 |         if result["error"] == 429:
188 |             raise RedditRateLimitException(cast(ErrorHeaders, response.headers))
189 | 
190 |         raise ValueError(
191 |             f'Received API error from Reddit (code {result["error"]}): {result["message"]}'
192 |         )
193 | 
194 |     return result
195 | 
196 | 
197 | def _call_reddit_api(url: str, params: Optional[dict[str, Any]] = None):
198 |     return _unwrap_response_and_raise(
199 |         requests.get(
200 |             url,
201 |             {"raw_json": 1, "limit": PAGE_SIZE, **(params or {})},  # type: ignore
202 |             headers={"user-agent": USER_AGENT},
203 |         )
204 |     )
205 | 
206 | 
207 | def _rate_limit_message(e: RedditRateLimitException) -> str:
208 |     return f"Rate limited by reddit; try again in {e.reset_after_seconds} seconds. Until then, saving what we have"
209 | 
210 | 
211 | def _load_paged_resource(resource: Literal["comments", "submitted"], username: str):
212 |     """
213 |     handles paging logic for arbitrary-length queries with an "after" param
214 |     """
215 |     result = []
216 |     after = None
217 |     # max number of pages we can fetch
218 |     for _ in trange(10):
219 |         try:
220 |             response: PagedResponse = _call_reddit_api(
221 |                 f"https://www.reddit.com/user/{username}/{resource}.json",
222 |                 params={"after": after},
223 |             )
224 | 
225 |             result += [c["data"] for c in response["data"]["children"]]
226 |             after = response["data"]["after"]
227 |             if len(response["data"]["children"]) < PAGE_SIZE:
228 |                 break
229 |         except RedditRateLimitException as e:
230 |             click.echo(_rate_limit_message(e), err=True)
231 |             break
232 | 
233 |     return result
234 | 
235 | 
236 | def load_comments_for_user(username: str) -> list[Comment]:
237 |     return _load_paged_resource("comments", username)
238 | 
239 | 
240 | def load_posts_for_user(username: str) -> list[Post]:
241 |     return _load_paged_resource("submitted", username)
242 | 
243 | 
244 | def load_info(resources: Sequence[str]) -> list[Union[Comment, Post]]:
245 |     """
246 |     calls the `/info` endpoint to fetch data about a sequence of resources that include the type prefix
247 |     """
248 |     result = []
249 |     for batch in batched(
250 |         tqdm(resources, disable=bool(os.environ.get("DISABLE_PROGRESS"))), PAGE_SIZE
251 |     ):
252 |         try:
253 |             response: PagedResponse = _call_reddit_api(
254 |                 "https://www.reddit.com/api/info.json",
255 |                 params={"id": ",".join(batch)},
256 |             )
257 |             result += [c["data"] for c in response["data"]["children"]]
258 |         except RedditRateLimitException as e:
259 |             click.echo(_rate_limit_message(e), err=True)
260 |             break
261 | 
262 |     return result
263 | 
264 | 
265 | def get_user_id(username: str) -> str:
266 |     response: UserResponse = _call_reddit_api(
267 |         f"https://www.reddit.com/user/{username}/about.json"
268 |     )
269 | 
270 |     return response["data"]["id"]
271 | 
272 | 
273 | T = TypeVar("T", Comment, Post)
274 | 
275 | 
276 | def add_missing_user_fragment(
277 |     items: list[T], username: str, user_fullname: str
278 | ) -> list[T]:
279 |     """
280 |     If an item lacks user details, this adds them. Otherwise the item passes through untouched.
281 |     """
282 |     return [
283 |         cast(T, {**i, "author": username, "author_fullname": user_fullname})
284 |         if "author_fullname" not in i
285 |         else i
286 |         for i in items
287 |     ]
288 | 


--------------------------------------------------------------------------------
/reddit_user_to_sqlite/sqlite_helpers.py:
--------------------------------------------------------------------------------
  1 | from typing import Callable, Iterable, Optional, Sequence, TypedDict, TypeVar
  2 | 
  3 | from sqlite_utils import Database
  4 | 
  5 | from reddit_user_to_sqlite.csv_helpers import PrefixType, build_table_name
  6 | from reddit_user_to_sqlite.reddit_api import (
  7 |     Comment,
  8 |     Post,
  9 |     SubredditFragment,
 10 |     UserFragment,
 11 | )
 12 | 
 13 | 
 14 | class SubredditRow(TypedDict):
 15 |     id: str
 16 |     name: str
 17 |     type: str
 18 |     # TODO: handle archiving and updating
 19 |     # archives_posts: bool
 20 | 
 21 | 
 22 | def item_to_subreddit_row(item: SubredditFragment) -> SubredditRow:
 23 |     return {
 24 |         "id": item["subreddit_id"][3:],
 25 |         "name": item["subreddit"],
 26 |         "type": item["subreddit_type"],
 27 |     }
 28 | 
 29 | 
 30 | def upsert_subreddits(db: Database, subreddits: Iterable[SubredditFragment]):
 31 |     # upserts are actually important here, since subs are going private/public a lot
 32 |     # https://github.com/simonw/sqlite-utils/issues/554
 33 |     db["subreddits"].upsert_all(  # type: ignore
 34 |         map(item_to_subreddit_row, subreddits),
 35 |         # ignore=True,  # type: ignore
 36 |         # only relevant if creating the table
 37 |         pk="id",  # type: ignore
 38 |         not_null=["id", "name"],  # type: ignore
 39 |     )
 40 | 
 41 | 
 42 | class UserRow(TypedDict):
 43 |     id: str
 44 |     username: str
 45 | 
 46 | 
 47 | def item_to_user_row(item: UserFragment) -> Optional[UserRow]:
 48 |     if "author_fullname" in item:
 49 |         return {"id": item["author_fullname"][3:], "username": item["author"]}
 50 | 
 51 | 
 52 | def insert_users(db: Database, users: Sequence[UserFragment]):
 53 |     existing_users = {u["id"] for u in db["users"].rows}
 54 | 
 55 |     unique_new_users = {
 56 |         # needs to be hashable so it's deduped
 57 |         (u["id"], u["username"])
 58 |         for user in users
 59 |         if (u := item_to_user_row(user)) and u["id"] not in existing_users
 60 |     }
 61 | 
 62 |     new_users = [{"id": user[0], "username": user[1]} for user in unique_new_users]
 63 | 
 64 |     db["users"].insert_all(  # type: ignore
 65 |         new_users,
 66 |         # ignore any write error
 67 |         # ignore=True,
 68 |         # only relevant if creating the table
 69 |         pk="id",  # type: ignore
 70 |         not_null=["id", "username"],  # type: ignore
 71 |     )
 72 | 
 73 | 
 74 | class CommentRow(TypedDict):
 75 |     id: str
 76 |     timestamp: int
 77 |     score: int
 78 |     text: str
 79 |     user: str
 80 |     is_submitter: int
 81 |     subreddit: str
 82 |     permalink: str
 83 |     controversiality: int
 84 |     num_awards: int
 85 | 
 86 | 
 87 | def comment_to_comment_row(comment: Comment) -> Optional[CommentRow]:
 88 |     if "author_fullname" not in comment:
 89 |         return
 90 | 
 91 |     return {
 92 |         "id": comment["id"],
 93 |         "timestamp": int(comment["created"]),
 94 |         "score": comment["score"],
 95 |         "text": comment["body"],
 96 |         "user": comment["author_fullname"][3:],  # strip leading t2_
 97 |         "subreddit": comment["subreddit_id"][3:],  # strip leading t5_
 98 |         "permalink": f'https://old.reddit.com{comment["permalink"]}?context=10',
 99 |         "is_submitter": int(comment["is_submitter"]),
100 |         "controversiality": comment["controversiality"],
101 |         "num_awards": comment["total_awards_received"],
102 |     }
103 | 
104 | 
105 | T = TypeVar("T")
106 | U = TypeVar("U")
107 | 
108 | 
109 | def apply_and_filter(
110 |     filterer: Callable[[T], Optional[U]], items: Iterable[T]
111 | ) -> list[U]:
112 |     return [c for c in map(filterer, items) if c]
113 | 
114 | 
115 | def upsert_comments(
116 |     db: Database, comments: Iterable[Comment], table_prefix: Optional[PrefixType] = None
117 | ) -> int:
118 |     comment_rows = apply_and_filter(comment_to_comment_row, comments)
119 |     db[build_table_name("comments", table_prefix)].upsert_all(  # type: ignore
120 |         comment_rows,
121 |         pk="id",  # type: ignore
122 |         # update the schema - needed if user does archive first
123 |         alter=True,  # type: ignore
124 |         foreign_keys=[  # type: ignore
125 |             (
126 |                 "subreddit",
127 |                 "subreddits",
128 |                 "id",
129 |             ),
130 |             (
131 |                 "user",
132 |                 "users",
133 |                 "id",
134 |             ),
135 |         ],
136 |         # can re-add or assert this later, but the rows aren't created if this is present
137 |         # see: https://github.com/simonw/sqlite-utils/issues/538
138 |         # not_null=["id", "timestamp", "text", "user", "subreddit", "permalink"],
139 |     )
140 |     return len(comment_rows)
141 | 
142 | 
143 | class PostRow(TypedDict):
144 |     id: str
145 |     timestamp: int
146 |     score: int
147 |     title: str
148 |     text: str
149 |     external_url: str
150 |     user: str
151 |     subreddit: str
152 |     permalink: str
153 |     upvote_ratio: float
154 |     score: int
155 |     num_comments: int
156 |     num_awards: int
157 |     is_removed: int
158 | 
159 | 
160 | def post_to_post_row(post: Post) -> Optional[PostRow]:
161 |     if "author_fullname" not in post:
162 |         return
163 | 
164 |     return {
165 |         "id": post["id"],
166 |         "timestamp": int(post["created"]),
167 |         "score": post["score"],
168 |         "num_comments": post["num_comments"],
169 |         "title": post["title"],
170 |         "text": post["selftext"],
171 |         "external_url": "" if "reddit.com" in post["url"] else post["url"],
172 |         "user": post["author_fullname"][3:],
173 |         "subreddit": post["subreddit_id"][3:],
174 |         "permalink": f'https://old.reddit.com{post["permalink"]}',
175 |         "upvote_ratio": post["upvote_ratio"],
176 |         "num_awards": post["total_awards_received"],
177 |         "is_removed": int(post["selftext"] == "[removed]"),
178 |     }
179 | 
180 | 
181 | def upsert_posts(
182 |     db: Database, posts: Iterable[Post], table_prefix: Optional[PrefixType] = None
183 | ) -> int:
184 |     post_rows = apply_and_filter(post_to_post_row, posts)
185 |     db[build_table_name("posts", table_prefix)].insert_all(  # type: ignore
186 |         post_rows,
187 |         upsert=True,
188 |         pk="id",  # type: ignore
189 |         alter=True,  # type: ignore
190 |         foreign_keys=[  # type: ignore
191 |             (
192 |                 "subreddit",
193 |                 "subreddits",
194 |                 "id",
195 |             ),
196 |             (
197 |                 "user",
198 |                 "users",
199 |                 "id",
200 |             ),
201 |         ],
202 |     )
203 |     return len(post_rows)
204 | 
205 | 
206 | FTS_INSTRUCTIONS: list[tuple[str, list[str]]] = [
207 |     ("comments", ["text"]),
208 |     ("posts", ["title", "text"]),
209 |     ("saved_comments", ["text"]),
210 |     ("saved_posts", ["title", "text"]),
211 | ]
212 | 
213 | 
214 | def ensure_fts(db: Database):
215 |     table_names = set(db.table_names())
216 |     for table, columns in FTS_INSTRUCTIONS:
217 |         if table in table_names and f"{table}_fts" not in table_names:
218 |             db[table].enable_fts(columns, create_triggers=True)
219 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xavdid/reddit-user-to-sqlite/e02fad746694f32ebc2ee2efce82652857682cc6/tests/__init__.py


--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
  1 | from pathlib import Path
  2 | from typing import Any, Literal, Optional, Protocol, Union
  3 | 
  4 | import pytest
  5 | import responses
  6 | from responses import BaseResponse, RequestsMock, matchers
  7 | from sqlite_utils import Database
  8 | 
  9 | from reddit_user_to_sqlite.reddit_api import (
 10 |     USER_AGENT,
 11 |     ErrorHeaders,
 12 |     PagedResponse,
 13 |     Post,
 14 | )
 15 | from reddit_user_to_sqlite.sqlite_helpers import CommentRow, PostRow, UserRow
 16 | 
 17 | 
 18 | @pytest.fixture
 19 | def tmp_db_path(tmp_path):
 20 |     """
 21 |     returns a Database path in a temp dir
 22 |     """
 23 |     return str(tmp_path / "test.db")
 24 | 
 25 | 
 26 | @pytest.fixture
 27 | def tmp_db(tmp_db_path):
 28 |     """
 29 |     returns a Database in a temp dir
 30 |     """
 31 |     return Database(tmp_db_path)
 32 | 
 33 | 
 34 | def _wrap_response(*children) -> PagedResponse:
 35 |     return {
 36 |         "kind": "Listing",
 37 |         "data": {
 38 |             "after": None,
 39 |             "dist": 1,
 40 |             "modhash": "whatever",
 41 |             "geo_filter": "",
 42 |             "children": [{"kind": "t_", "data": c} for c in children],
 43 |             "before": None,
 44 |         },
 45 |     }
 46 | 
 47 | 
 48 | @pytest.fixture
 49 | def comment():
 50 |     """
 51 |     A raw (unwrapped) comment object from the Reddit API
 52 |     """
 53 |     return {
 54 |         "subreddit_id": "t5_2t3ad",
 55 |         "approved_at_utc": None,
 56 |         "author_is_blocked": False,
 57 |         "comment_type": None,
 58 |         "link_title": "What games do you guys love to replay or never get bored with?",
 59 |         "mod_reason_by": None,
 60 |         "banned_by": None,
 61 |         "ups": 1,
 62 |         "num_reports": None,
 63 |         "author_flair_type": "text",
 64 |         "total_awards_received": 3,
 65 |         "subreddit": "patientgamers",
 66 |         "link_author": "DefinitionWest",
 67 |         "likes": None,
 68 |         "replies": "",
 69 |         "user_reports": [],
 70 |         "saved": False,
 71 |         "id": "jj0ti6f",
 72 |         "banned_at_utc": None,
 73 |         "mod_reason_title": None,
 74 |         "gilded": 0,
 75 |         "archived": False,
 76 |         "collapsed_reason_code": None,
 77 |         "no_follow": True,
 78 |         "author": "xavdid",
 79 |         "num_comments": 250,
 80 |         "can_mod_post": False,
 81 |         "send_replies": True,
 82 |         "parent_id": "t1_jirew06",
 83 |         "score": 1,
 84 |         "author_fullname": "t2_np8mb41h",
 85 |         "over_18": False,
 86 |         "report_reasons": None,
 87 |         "removal_reason": None,
 88 |         "approved_by": None,
 89 |         "controversiality": 0,
 90 |         "body": "Such a great game to pick up for a run every couple of months. Every time I think I'm done, it pulls be back in.",
 91 |         "edited": False,
 92 |         "top_awarded_type": None,
 93 |         "downs": 0,
 94 |         "author_flair_css_class": None,
 95 |         "is_submitter": False,
 96 |         "collapsed": False,
 97 |         "author_flair_richtext": [],
 98 |         "author_patreon_flair": False,
 99 |         "body_html": '&lt;div class="md"&gt;&lt;p&gt;Such a great game to pick up for a run every couple of months. Every time I think I&amp;#39;m done, it pulls be back in.&lt;/p&gt;\n&lt;/div&gt;',
100 |         "gildings": {},
101 |         "collapsed_reason": None,
102 |         "distinguished": None,
103 |         "associated_award": None,
104 |         "stickied": False,
105 |         "author_premium": False,
106 |         "can_gild": True,
107 |         "link_id": "t3_1371yrv",
108 |         "unrepliable_reason": None,
109 |         "author_flair_text_color": None,
110 |         "score_hidden": False,
111 |         "permalink": "/r/patientgamers/comments/1371yrv/what_games_do_you_guys_love_to_replay_or_never/jj0ti6f/",
112 |         "subreddit_type": "public",
113 |         "link_permalink": "https://www.reddit.com/r/patientgamers/comments/1371yrv/what_games_do_you_guys_love_to_replay_or_never/",
114 |         "name": "t1_jj0ti6f",
115 |         "author_flair_template_id": None,
116 |         "subreddit_name_prefixed": "r/patientgamers",
117 |         "author_flair_text": None,
118 |         "treatment_tags": [],
119 |         "created": 1683327131.0,
120 |         "created_utc": 1683327131.0,
121 |         "awarders": [],
122 |         "all_awardings": [],
123 |         "locked": False,
124 |         "author_flair_background_color": None,
125 |         "collapsed_because_crowd_control": None,
126 |         "mod_reports": [],
127 |         "quarantine": False,
128 |         "mod_note": None,
129 |         "link_url": "https://www.reddit.com/r/patientgamers/comments/1371yrv/what_games_do_you_guys_love_to_replay_or_never/",
130 |     }
131 | 
132 | 
133 | @pytest.fixture
134 | def modify_comment(comment):
135 |     def _modify(d):
136 |         return {**comment, **d}
137 | 
138 |     return _modify
139 | 
140 | 
141 | @pytest.fixture
142 | def modify_post(self_post):
143 |     def _modify(d):
144 |         return {**self_post, **d}
145 | 
146 |     return _modify
147 | 
148 | 
149 | @pytest.fixture
150 | def removed_comment():
151 |     return {
152 |         "total_awards_received": 0,
153 |         "approved_at_utc": None,
154 |         "author_is_blocked": False,
155 |         "comment_type": None,
156 |         "edited": False,
157 |         "mod_reason_by": None,
158 |         "banned_by": None,
159 |         "removal_reason": None,
160 |         "link_id": "t3_puwue",
161 |         "author_flair_template_id": None,
162 |         "likes": None,
163 |         "replies": "",
164 |         "user_reports": [],
165 |         "saved": False,
166 |         "id": "c3sgfl4",
167 |         "banned_at_utc": None,
168 |         "mod_reason_title": None,
169 |         "gilded": 0,
170 |         "archived": True,
171 |         "collapsed_reason_code": "DELETED",
172 |         "no_follow": True,
173 |         "author": "[deleted]",
174 |         "can_mod_post": False,
175 |         "created_utc": 1329550785.0,
176 |         "send_replies": True,
177 |         "parent_id": "t1_c3sgeij",
178 |         "score": -1,
179 |         "approved_by": None,
180 |         "mod_note": None,
181 |         "all_awardings": [],
182 |         "subreddit_id": "t5_2qm4e",
183 |         "body": "[removed]",
184 |         "awarders": [],
185 |         "author_flair_css_class": None,
186 |         "name": "t1_c3sgfl4",
187 |         "downs": 0,
188 |         "is_submitter": False,
189 |         "body_html": '<div class="md"><p>[removed]</p>\n</div>',
190 |         "gildings": {},
191 |         "collapsed_reason": None,
192 |         "distinguished": None,
193 |         "associated_award": None,
194 |         "stickied": False,
195 |         "can_gild": True,
196 |         "top_awarded_type": None,
197 |         "unrepliable_reason": None,
198 |         "author_flair_text_color": "dark",
199 |         "score_hidden": False,
200 |         "permalink": "/r/askscience/comments/asdf/why_do_birds_fly/",
201 |         "num_reports": None,
202 |         "locked": False,
203 |         "report_reasons": None,
204 |         "created": 1329550785.0,
205 |         "subreddit": "askscience",
206 |         "author_flair_text": None,
207 |         "treatment_tags": [],
208 |         "collapsed": True,
209 |         "subreddit_name_prefixed": "r/askscience",
210 |         "controversiality": 0,
211 |         "author_flair_background_color": "",
212 |         "collapsed_because_crowd_control": None,
213 |         "mod_reports": [],
214 |         "subreddit_type": "public",
215 |         "ups": -1,
216 |     }
217 | 
218 | 
219 | @pytest.fixture
220 | def removed_comment_response(removed_comment):
221 |     return _wrap_response(removed_comment)
222 | 
223 | 
224 | @pytest.fixture
225 | def all_comments_response(comment, removed_comment):
226 |     return _wrap_response(comment, removed_comment)
227 | 
228 | 
229 | @pytest.fixture
230 | def stored_comment() -> CommentRow:
231 |     """
232 |     a serialized comment row in the db
233 |     """
234 |     return {
235 |         "controversiality": 0,
236 |         "id": "jj0ti6f",
237 |         "is_submitter": 0,
238 |         "permalink": "https://old.reddit.com/r/patientgamers/comments/1371yrv/what_games_do_you_guys_love_to_replay_or_never/jj0ti6f/?context=10",
239 |         "score": 1,
240 |         "subreddit": "2t3ad",
241 |         "text": "Such a great game to pick up for a run every couple of months. Every time I think I'm done, it pulls be back in.",
242 |         "timestamp": 1683327131,
243 |         "user": "np8mb41h",
244 |         "num_awards": 3,
245 |     }
246 | 
247 | 
248 | @pytest.fixture
249 | def stored_removed_comment() -> CommentRow:
250 |     return {
251 |         "controversiality": 0,
252 |         "id": "c3sgfl4",
253 |         "is_submitter": 0,
254 |         "permalink": "https://old.reddit.com/r/askscience/comments/asdf/why_do_birds_fly/?context=10",
255 |         "score": -1,
256 |         "subreddit": "2qm4e",
257 |         "text": "[removed]",
258 |         "timestamp": 1329550785,
259 |         # manually added this - if it's stored, I must have found a user
260 |         "user": "np8mb41h",
261 |         "num_awards": 0,
262 |     }
263 | 
264 | 
265 | @pytest.fixture
266 | def stored_removed_comment_placeholder_user() -> CommentRow:
267 |     return {
268 |         "controversiality": 0,
269 |         "id": "c3sgfl4",
270 |         "is_submitter": 0,
271 |         "permalink": "https://old.reddit.com/r/askscience/comments/asdf/why_do_birds_fly/?context=10",
272 |         "score": -1,
273 |         "subreddit": "2qm4e",
274 |         "text": "[removed]",
275 |         "timestamp": 1329550785,
276 |         "user": "1234567",
277 |         "num_awards": 0,
278 |     }
279 | 
280 | 
281 | @pytest.fixture
282 | def comment_response(comment) -> PagedResponse:
283 |     """
284 |     The full response from Reddit with a comment child
285 |     """
286 |     return _wrap_response(comment)
287 | 
288 | 
289 | @pytest.fixture
290 | def self_post():
291 |     """
292 |     A raw (unwrapped) self post object from the Reddit API
293 |     """
294 |     return {
295 |         "all_awardings": [],
296 |         "allow_live_comments": False,
297 |         "approved_at_utc": None,
298 |         "approved_by": None,
299 |         "archived": False,
300 |         "author": "xavdid",
301 |         "author_flair_background_color": None,
302 |         "author_flair_css_class": None,
303 |         "author_flair_richtext": [],
304 |         "author_flair_template_id": None,
305 |         "author_flair_text": None,
306 |         "author_flair_text_color": None,
307 |         "author_flair_type": "text",
308 |         "author_fullname": "t2_np8mb41h",
309 |         "author_is_blocked": False,
310 |         "author_patreon_flair": False,
311 |         "author_premium": False,
312 |         "awarders": [],
313 |         "banned_at_utc": None,
314 |         "banned_by": None,
315 |         "can_gild": False,
316 |         "can_mod_post": False,
317 |         "category": None,
318 |         "clicked": False,
319 |         "content_categories": None,
320 |         "contest_mode": False,
321 |         "created": 1653623084,
322 |         "created_utc": 1653623084,
323 |         "discussion_type": None,
324 |         "distinguished": None,
325 |         "domain": "self.KeybaseProofs",
326 |         "downs": 0,
327 |         "edited": False,
328 |         "gilded": 0,
329 |         "gildings": {},
330 |         "hidden": False,
331 |         "hide_score": False,
332 |         "id": "uypaav",
333 |         "is_created_from_ads_ui": False,
334 |         "is_crosspostable": False,
335 |         "is_meta": False,
336 |         "is_original_content": False,
337 |         "is_reddit_media_domain": False,
338 |         "is_robot_indexable": True,
339 |         "is_self": True,
340 |         "is_video": False,
341 |         "likes": None,
342 |         "link_flair_background_color": "",
343 |         "link_flair_css_class": None,
344 |         "link_flair_richtext": [],
345 |         "link_flair_text": None,
346 |         "link_flair_text_color": "dark",
347 |         "link_flair_type": "text",
348 |         "locked": False,
349 |         "media": None,
350 |         "media_embed": {},
351 |         "media_only": False,
352 |         "mod_note": None,
353 |         "mod_reason_by": None,
354 |         "mod_reason_title": None,
355 |         "mod_reports": [],
356 |         "name": "t3_uypaav",
357 |         "no_follow": True,
358 |         "num_comments": 0,
359 |         "num_crossposts": 0,
360 |         "num_reports": None,
361 |         "over_18": False,
362 |         "parent_whitelist_status": "all_ads",
363 |         "permalink": "/r/KeybaseProofs/comments/uypaav/my_keybase_proof_redditxavdid_keybasexavdid/",
364 |         "pinned": False,
365 |         "post_hint": "self",
366 |         "preview": {
367 |             "enabled": False,
368 |             "images": [
369 |                 {
370 |                     "id": "-YTScuArtOT7VGFuDeGCZvRtPZZ6N8YNPBBjDIA6KiQ",
371 |                     "resolutions": [
372 |                         {
373 |                             "height": 108,
374 |                             "url": "https://external-preview.redd.it/d8t5K0qquzpFUYxW8QDLgM8lFUUyu6zo_KM_cFv2JjY.jpg?width=108&crop=smart&auto=webp&v=enabled&s=3076e81be7310fd25b111faa85f33dcd722e3e07",
375 |                             "width": 108,
376 |                         },
377 |                         {
378 |                             "height": 216,
379 |                             "url": "https://external-preview.redd.it/d8t5K0qquzpFUYxW8QDLgM8lFUUyu6zo_KM_cFv2JjY.jpg?width=216&crop=smart&auto=webp&v=enabled&s=80217a00e40d70bdf57ebd1510d5ff49a1b1b5a4",
380 |                             "width": 216,
381 |                         },
382 |                         {
383 |                             "height": 320,
384 |                             "url": "https://external-preview.redd.it/d8t5K0qquzpFUYxW8QDLgM8lFUUyu6zo_KM_cFv2JjY.jpg?width=320&crop=smart&auto=webp&v=enabled&s=547611bba1890b9b67fc84e2d31badb682bd25bb",
385 |                             "width": 320,
386 |                         },
387 |                     ],
388 |                     "source": {
389 |                         "height": 360,
390 |                         "url": "https://external-preview.redd.it/d8t5K0qquzpFUYxW8QDLgM8lFUUyu6zo_KM_cFv2JjY.jpg?auto=webp&v=enabled&s=ff41e339b6994c953c13eb917d562e7b0793831e",
391 |                         "width": 360,
392 |                     },
393 |                     "variants": {},
394 |                 }
395 |             ],
396 |         },
397 |         "pwls": 6,
398 |         "quarantine": False,
399 |         "removal_reason": None,
400 |         "removed_by": None,
401 |         "removed_by_category": None,
402 |         "report_reasons": None,
403 |         "saved": False,
404 |         "score": 1,
405 |         "secure_media": None,
406 |         "secure_media_embed": {},
407 |         "selftext": "### Keybase proof\n...-----END PGP MESSAGE-----\n",
408 |         "selftext_html": '<!-- SC_OFF --><div class="md"><h3>Keybase proof</h3>\n-----END PGP MESSAGE-----\n</code></pre>\n</div><!-- SC_ON -->',
409 |         "send_replies": True,
410 |         "spoiler": False,
411 |         "stickied": False,
412 |         "subreddit": "KeybaseProofs",
413 |         "subreddit_id": "t5_32u6q",
414 |         "subreddit_name_prefixed": "r/KeybaseProofs",
415 |         "subreddit_subscribers": 7428,
416 |         "subreddit_type": "public",
417 |         "suggested_sort": None,
418 |         "thumbnail": "self",
419 |         "thumbnail_height": None,
420 |         "thumbnail_width": None,
421 |         "title": "My Keybase proof [reddit:xavdid = keybase:xavdid]",
422 |         "top_awarded_type": None,
423 |         "total_awards_received": 0,
424 |         "treatment_tags": [],
425 |         "ups": 1,
426 |         "upvote_ratio": 1,
427 |         "url": "https://www.reddit.com/r/KeybaseProofs/comments/uypaav/my_keybase_proof_redditxavdid_keybasexavdid/",
428 |         "user_reports": [],
429 |         "view_count": None,
430 |         "visited": False,
431 |         "whitelist_status": "all_ads",
432 |         "wls": 6,
433 |     }
434 | 
435 | 
436 | @pytest.fixture
437 | def stored_self_post() -> PostRow:
438 |     return {
439 |         "external_url": "",
440 |         "id": "uypaav",
441 |         "is_removed": 0,
442 |         "num_awards": 0,
443 |         "permalink": "https://old.reddit.com/r/KeybaseProofs/comments/uypaav/my_keybase_proof_redditxavdid_keybasexavdid/",
444 |         "score": 1,
445 |         "subreddit": "32u6q",
446 |         "text": "### Keybase proof\n...-----END PGP MESSAGE-----\n",
447 |         "timestamp": 1653623084,
448 |         "num_comments": 0,
449 |         "title": "My Keybase proof [reddit:xavdid = keybase:xavdid]",
450 |         "upvote_ratio": 1,
451 |         "user": "np8mb41h",
452 |     }
453 | 
454 | 
455 | @pytest.fixture
456 | def self_post_response(self_post):
457 |     return _wrap_response(self_post)
458 | 
459 | 
460 | @pytest.fixture
461 | def removed_post():
462 |     """
463 |     A raw (unwrapped) removed post object from the Reddit API
464 |     """
465 |     return {
466 |         "approved_at_utc": None,
467 |         "subreddit": "videos",
468 |         "selftext": "[deleted]",
469 |         "user_reports": [],
470 |         "saved": False,
471 |         "mod_reason_title": None,
472 |         "gilded": 0,
473 |         "clicked": False,
474 |         "title": "Tommy Wiseau Wishes YOU A Happy Memorial Day! — Urban Outfitters",
475 |         "link_flair_richtext": [],
476 |         "subreddit_name_prefixed": "r/videos",
477 |         "hidden": False,
478 |         "pwls": 6,
479 |         "link_flair_css_class": None,
480 |         "downs": 0,
481 |         "thumbnail_height": 52,
482 |         "top_awarded_type": None,
483 |         "hide_score": False,
484 |         "name": "t3_1f55rr",
485 |         "quarantine": False,
486 |         "link_flair_text_color": "dark",
487 |         "upvote_ratio": 1,
488 |         "author_flair_background_color": "",
489 |         "subreddit_type": "public",
490 |         "ups": 1,
491 |         "total_awards_received": 0,
492 |         "media_embed": {},
493 |         "thumbnail_width": 70,
494 |         "author_flair_template_id": None,
495 |         "is_original_content": False,
496 |         "secure_media": None,
497 |         "is_reddit_media_domain": False,
498 |         "is_meta": False,
499 |         "category": None,
500 |         "secure_media_embed": {},
501 |         "link_flair_text": None,
502 |         "can_mod_post": False,
503 |         "score": 1,
504 |         "approved_by": None,
505 |         "is_created_from_ads_ui": False,
506 |         "thumbnail": "default",
507 |         "edited": False,
508 |         "author_flair_css_class": None,
509 |         "gildings": {},
510 |         "content_categories": None,
511 |         "is_self": False,
512 |         "mod_note": None,
513 |         "created": 1369671390.0,
514 |         "link_flair_type": "text",
515 |         "wls": 6,
516 |         "removed_by_category": None,
517 |         "banned_by": None,
518 |         "domain": "",
519 |         "allow_live_comments": False,
520 |         "selftext_html": '<!-- SC_OFF --><div class="md"><p>[deleted]</p>\n</div><!-- SC_ON -->',
521 |         "likes": None,
522 |         "suggested_sort": None,
523 |         "banned_at_utc": None,
524 |         "url_overridden_by_dest": "",
525 |         "view_count": None,
526 |         "archived": False,
527 |         "no_follow": True,
528 |         "is_crosspostable": False,
529 |         "pinned": False,
530 |         "over_18": False,
531 |         "all_awardings": [],
532 |         "awarders": [],
533 |         "media_only": False,
534 |         "can_gild": False,
535 |         "spoiler": False,
536 |         "locked": False,
537 |         "author_flair_text": None,
538 |         "treatment_tags": [],
539 |         "visited": False,
540 |         "removed_by": None,
541 |         "num_reports": None,
542 |         "distinguished": None,
543 |         "subreddit_id": "t5_2qh1e",
544 |         "author_is_blocked": False,
545 |         "mod_reason_by": None,
546 |         "removal_reason": None,
547 |         "link_flair_background_color": "",
548 |         "id": "1f55rr",
549 |         "is_robot_indexable": False,
550 |         "report_reasons": None,
551 |         "author": "[deleted]",
552 |         "discussion_type": None,
553 |         "num_comments": 0,
554 |         "send_replies": False,
555 |         "whitelist_status": "all_ads",
556 |         "contest_mode": False,
557 |         "mod_reports": [],
558 |         "author_flair_text_color": "dark",
559 |         "permalink": "/r/videos/comments/1f55rr/tommy_wiseau_wishes_you_a_happy_memorial_day/",
560 |         "parent_whitelist_status": "all_ads",
561 |         "stickied": False,
562 |         "url": "",
563 |         "subreddit_subscribers": 26688085,
564 |         "created_utc": 1369671390.0,
565 |         "num_crossposts": 0,
566 |         "media": None,
567 |         "is_video": False,
568 |     }
569 | 
570 | 
571 | @pytest.fixture
572 | def stored_removed_post() -> PostRow:
573 |     return {
574 |         "external_url": "",
575 |         "id": "1f55rr",
576 |         "is_removed": 0,
577 |         "num_awards": 0,
578 |         "num_comments": 0,
579 |         "permalink": "https://old.reddit.com/r/videos/comments/1f55rr/tommy_wiseau_wishes_you_a_happy_memorial_day/",
580 |         "score": 1,
581 |         "subreddit": "2qh1e",
582 |         "text": "[deleted]",
583 |         "timestamp": 1369671390,
584 |         "title": "Tommy Wiseau Wishes YOU A Happy Memorial Day! — Urban Outfitters",
585 |         "upvote_ratio": 1,
586 |         # manually added this - if it's stored, I must have found a user
587 |         "user": "np8mb41h",
588 |     }
589 | 
590 | 
591 | @pytest.fixture
592 | def stored_removed_post_placeholder_user() -> PostRow:
593 |     return {
594 |         "external_url": "",
595 |         "id": "1f55rr",
596 |         "is_removed": 0,
597 |         "num_awards": 0,
598 |         "num_comments": 0,
599 |         "permalink": "https://old.reddit.com/r/videos/comments/1f55rr/tommy_wiseau_wishes_you_a_happy_memorial_day/",
600 |         "score": 1,
601 |         "subreddit": "2qh1e",
602 |         "text": "[deleted]",
603 |         "timestamp": 1369671390,
604 |         "title": "Tommy Wiseau Wishes YOU A Happy Memorial Day! — Urban Outfitters",
605 |         "upvote_ratio": 1,
606 |         "user": "1234567",
607 |     }
608 | 
609 | 
610 | @pytest.fixture
611 | def removed_post_response(removed_post):
612 |     return _wrap_response(removed_post)
613 | 
614 | 
615 | @pytest.fixture
616 | def external_post(self_post: Post) -> Post:
617 |     """
618 |     A raw (unwrapped) external post object from the Reddit API
619 |     """
620 |     return {
621 |         **self_post,
622 |         "selftext": "",
623 |         "url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
624 |         "id": "qwer",
625 |     }
626 | 
627 | 
628 | @pytest.fixture
629 | def stored_external_post(stored_self_post: PostRow) -> PostRow:
630 |     return {
631 |         **stored_self_post,
632 |         "text": "",
633 |         "external_url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
634 |         "id": "qwer",
635 |     }
636 | 
637 | 
638 | @pytest.fixture
639 | def all_posts_response(self_post, removed_post, external_post):
640 |     return _wrap_response(self_post, removed_post, external_post)
641 | 
642 | 
643 | @pytest.fixture
644 | def empty_response():
645 |     return _wrap_response()
646 | 
647 | 
648 | @pytest.fixture()
649 | def mock():
650 |     with responses.RequestsMock() as mock_requests:
651 |         yield mock_requests
652 | 
653 | 
654 | class MockPagedFunc(Protocol):
655 |     def __call__(
656 |         self,
657 |         resource: Literal["comments", "submitted"],
658 |         json: Any,
659 |         params: Optional[dict[str, Union[str, int]]] = None,
660 |         headers: Optional[dict[str, str]] = None,
661 |     ) -> BaseResponse:
662 |         ...
663 | 
664 | 
665 | @pytest.fixture
666 | def mock_paged_request(mock: RequestsMock) -> MockPagedFunc:
667 |     """
668 |     call this to mock a list of items for a user
669 |     """
670 | 
671 |     def _mock_request(
672 |         resource: Literal["comments", "submitted"],
673 |         json: Any,
674 |         params: Optional[dict[str, Union[str, int]]] = None,
675 |         headers: Optional[dict[str, str]] = None,
676 |     ):
677 |         params = {"limit": 100, "raw_json": 1, **(params or {})}
678 | 
679 |         return mock.get(
680 |             f"https://www.reddit.com/user/xavdid/{resource}.json",
681 |             match=[
682 |                 matchers.query_param_matcher(params),
683 |                 matchers.header_matcher({"user-agent": USER_AGENT}),
684 |             ],
685 |             json=json,
686 |             headers=headers,
687 |         )
688 | 
689 |     return _mock_request
690 | 
691 | 
692 | class MockInfoFunc(Protocol):
693 |     def __call__(
694 |         self, ids: str, json: Any, headers: Optional[dict[str, str]] = None, limit=100
695 |     ) -> BaseResponse:
696 |         ...
697 | 
698 | 
699 | # need to extract this so I can call it manually
700 | # def _build_mock_info_req(mock: RequestsMock) -> MockInfoFunc:
701 | 
702 | 
703 | @pytest.fixture
704 | def mock_info_request(mock: RequestsMock) -> MockInfoFunc:
705 |     """
706 |     call this to mirror loading info about a sequence of fullnames (type-prefixed ids)
707 |     """
708 | 
709 |     def _mock_request(
710 |         ids: str,
711 |         json: Any,
712 |         headers: Optional[dict[str, str]] = None,
713 |         limit=100,
714 |     ):
715 |         params = {"limit": limit, "raw_json": 1, "id": ids}
716 | 
717 |         return mock.get(
718 |             "https://www.reddit.com/api/info.json",
719 |             match=[
720 |                 matchers.query_param_matcher(params),
721 |                 matchers.header_matcher({"user-agent": USER_AGENT}),
722 |             ],
723 |             json=json,
724 |             headers=headers,
725 |         )
726 | 
727 |     return _mock_request
728 | 
729 | 
730 | @pytest.fixture
731 | def comment_info_response(modify_comment):
732 |     return _wrap_response(*(modify_comment({"id": i}) for i in "ac"))
733 | 
734 | 
735 | @pytest.fixture
736 | def post_info_response(modify_post):
737 |     return _wrap_response(*(modify_post({"id": i}) for i in "df"))
738 | 
739 | 
740 | @pytest.fixture
741 | def stored_user() -> UserRow:
742 |     return {"id": "np8mb41h", "username": "xavdid"}
743 | 
744 | 
745 | @pytest.fixture
746 | def deleted_user() -> UserRow:
747 |     return {"id": "1234567", "username": "__DeletedUser__"}
748 | 
749 | 
750 | @pytest.fixture
751 | def user_response():
752 |     return {
753 |         "kind": "t2",
754 |         "data": {
755 |             "is_employee": False,
756 |             "is_friend": False,
757 |             "subreddit": {
758 |                 "default_set": True,
759 |                 "user_is_contributor": None,
760 |                 "banner_img": "",
761 |                 "allowed_media_in_comments": [],
762 |                 "user_is_banned": None,
763 |                 "free_form_reports": True,
764 |                 "community_icon": None,
765 |                 "show_media": True,
766 |                 "icon_color": "#51E9F4",
767 |                 "user_is_muted": None,
768 |                 "display_name": "u_xavdid",
769 |                 "header_img": None,
770 |                 "title": "",
771 |                 "previous_names": [],
772 |                 "over_18": False,
773 |                 "icon_size": [256, 256],
774 |                 "primary_color": "",
775 |                 "icon_img": "https://www.redditstatic.com/avatars/defaults/v2/avatar_default_5.png",
776 |                 "description": "",
777 |                 "submit_link_label": "",
778 |                 "header_size": None,
779 |                 "restrict_posting": True,
780 |                 "restrict_commenting": False,
781 |                 "subscribers": 0,
782 |                 "submit_text_label": "",
783 |                 "is_default_icon": True,
784 |                 "link_flair_position": "",
785 |                 "display_name_prefixed": "u/xavdid",
786 |                 "key_color": "",
787 |                 "name": "t5_6fndvc",
788 |                 "is_default_banner": True,
789 |                 "url": "/user/xavdid/",
790 |                 "quarantine": False,
791 |                 "banner_size": None,
792 |                 "user_is_moderator": None,
793 |                 "accept_followers": True,
794 |                 "public_description": "",
795 |                 "link_flair_enabled": False,
796 |                 "disable_contributor_requests": False,
797 |                 "subreddit_type": "user",
798 |                 "user_is_subscriber": None,
799 |             },
800 |             "snoovatar_size": None,
801 |             "awardee_karma": 0,
802 |             "id": "np8mb41h",
803 |             "verified": True,
804 |             "is_gold": False,
805 |             "is_mod": False,
806 |             "awarder_karma": 0,
807 |             "has_verified_email": True,
808 |             "icon_img": "https://www.redditstatic.com/avatars/defaults/v2/avatar_default_5.png",
809 |             "hide_from_robots": False,
810 |             "link_karma": 1,
811 |             "is_blocked": False,
812 |             "total_karma": 3,
813 |             "pref_show_snoovatar": False,
814 |             "name": "xavdid",
815 |             "created": 1653622688.0,
816 |             "created_utc": 1653622688.0,
817 |             "snoovatar_img": "",
818 |             "comment_karma": 2,
819 |             "accept_followers": True,
820 |             "has_subscribed": False,
821 |         },
822 |     }
823 | 
824 | 
825 | @pytest.fixture
826 | def rate_limit_headers() -> ErrorHeaders:
827 |     return {
828 |         "x-ratelimit-used": "4",
829 |         "x-ratelimit-remaining": "6",
830 |         "x-ratelimit-reset": "20",
831 |     }
832 | 
833 | 
834 | class MockUserFunc(Protocol):
835 |     def __call__(self, username: str, json: Any) -> BaseResponse:
836 |         ...
837 | 
838 | 
839 | @pytest.fixture
840 | def mock_user_request(mock: RequestsMock) -> MockUserFunc:
841 |     """
842 |     call this to mirror loading info about a sequence of fullnames (type-prefixed ids)
843 |     """
844 | 
845 |     def _mock_request(username: str, json: Any):
846 |         return mock.get(
847 |             f"https://www.reddit.com/user/{username}/about.json",
848 |             match=[
849 |                 matchers.header_matcher({"user-agent": USER_AGENT}),
850 |             ],
851 |             json=json,
852 |         )
853 | 
854 |     return _mock_request
855 | 
856 | 
857 | @pytest.fixture
858 | def archive_dir(tmp_path: Path):
859 |     (archive_dir := tmp_path / "archive").mkdir()
860 |     return archive_dir
861 | 
862 | 
863 | class WriteArchiveFileFunc(Protocol):
864 |     def __call__(self, filename: str, lines: list[str]) -> Path:
865 |         ...
866 | 
867 | 
868 | @pytest.fixture
869 | def write_archive_file(archive_dir: Path) -> WriteArchiveFileFunc:
870 |     """
871 |     write `lines` into `archive_dir/filename`.
872 |     """
873 | 
874 |     def _write_file(filename: str, lines: list[str]):
875 |         (new_file := archive_dir / filename).write_text("\n".join(lines))
876 |         return new_file
877 | 
878 |     return _write_file
879 | 
880 | 
881 | @pytest.fixture
882 | def stats_file(write_archive_file: WriteArchiveFileFunc):
883 |     """
884 |     write a basic statistics file into the archive directory
885 |     """
886 | 
887 |     return write_archive_file(
888 |         "statistics.csv",
889 |         [
890 |             "statistic,value",
891 |             "account name,xavdid",
892 |             "export time,2023-05-02 06:57:14 UTC",
893 |             "is_deleted,False",
894 |             "registration date,2014-05-19 22:02:20 UTC",
895 |             "email verified,True",
896 |             "email address,whatever@gmail.com",
897 |         ],
898 |     )
899 | 
900 | 
901 | @pytest.fixture
902 | def comments_file(write_archive_file: WriteArchiveFileFunc):
903 |     return write_archive_file("comments.csv", ["id", "a", "c"])
904 | 
905 | 
906 | @pytest.fixture
907 | def saved_comments_file(write_archive_file: WriteArchiveFileFunc):
908 |     return write_archive_file("saved_comments.csv", ["id", "g", "h"])
909 | 
910 | 
911 | @pytest.fixture
912 | def posts_file(write_archive_file: WriteArchiveFileFunc):
913 |     return write_archive_file("posts.csv", ["id", "d", "f"])
914 | 
915 | 
916 | @pytest.fixture
917 | def saved_posts_file(write_archive_file: WriteArchiveFileFunc):
918 |     return write_archive_file("saved_posts.csv", ["id", "j", "k"])
919 | 
920 | 
921 | @pytest.fixture
922 | def empty_file_at_path(write_archive_file: WriteArchiveFileFunc):
923 |     def _empty_file(filename: str):
924 |         return write_archive_file(filename, [])
925 | 
926 |     return _empty_file
927 | 
928 | 
929 | # ---
930 | 
931 | 
932 | # https://docs.pytest.org/en/latest/example/simple.html#control-skipping-of-tests-according-to-command-line-option
933 | 
934 | 
935 | def pytest_addoption(parser):
936 |     parser.addoption(
937 |         "--include-live", action="store_true", default=False, help="run live API tests"
938 |     )
939 | 
940 | 
941 | def pytest_configure(config):
942 |     config.addinivalue_line("markers", "live: mark test as hitting the live API")
943 | 
944 | 
945 | def pytest_collection_modifyitems(config, items):
946 |     if config.getoption("--include-live"):
947 |         # include-live flag given in cli; do not skip slow tests
948 |         return
949 | 
950 |     skip_live = pytest.mark.skip(reason="need --include-live flag to run")
951 |     for item in items:
952 |         if "live" in item.keywords:
953 |             item.add_marker(skip_live)
954 | 


--------------------------------------------------------------------------------
/tests/test_cli.py:
--------------------------------------------------------------------------------
  1 | from traceback import print_tb
  2 | 
  3 | import pytest
  4 | from click.testing import CliRunner
  5 | from sqlite_utils import Database
  6 | 
  7 | from reddit_user_to_sqlite.cli import cli
  8 | from tests.conftest import (
  9 |     MockInfoFunc,
 10 |     MockPagedFunc,
 11 |     MockUserFunc,
 12 |     WriteArchiveFileFunc,
 13 | )
 14 | 
 15 | 
 16 | @pytest.mark.parametrize("username", ["xavdid", "/u/xavdid", "u/xavdid"])
 17 | def test_load_data_for_user(
 18 |     tmp_db_path: str,
 19 |     tmp_db: Database,
 20 |     mock_paged_request: MockPagedFunc,
 21 |     username,
 22 |     all_posts_response,
 23 |     stored_comment,
 24 |     stored_self_post,
 25 |     stored_external_post,
 26 |     stored_user,
 27 |     all_comments_response,
 28 | ):
 29 |     comment_response = mock_paged_request(
 30 |         resource="comments", json=all_comments_response
 31 |     )
 32 |     post_response = mock_paged_request(resource="submitted", json=all_posts_response)
 33 | 
 34 |     result = CliRunner().invoke(cli, ["user", username, "--db", tmp_db_path])
 35 |     assert not result.exception, result.exception
 36 | 
 37 |     assert {
 38 |         "subreddits",
 39 |         "users",
 40 |         "comments",
 41 |         "comments_fts",
 42 |         "posts",
 43 |         "posts_fts",
 44 |     }.issubset(tmp_db.table_names())
 45 | 
 46 |     assert list(tmp_db["subreddits"].rows) == [
 47 |         {"id": "2t3ad", "name": "patientgamers", "type": "public"},
 48 |         {"id": "2qm4e", "name": "askscience", "type": "public"},
 49 |         {"id": "32u6q", "name": "KeybaseProofs", "type": "public"},
 50 |         {"id": "2qh1e", "name": "videos", "type": "public"},
 51 |     ]
 52 |     assert list(tmp_db["users"].rows) == [stored_user]
 53 |     assert list(tmp_db["comments"].rows) == [stored_comment]
 54 |     assert list(tmp_db["posts"].rows) == [
 55 |         stored_self_post,
 56 |         stored_external_post,
 57 |     ]
 58 | 
 59 |     assert comment_response.call_count == 1
 60 |     assert post_response.call_count == 1
 61 | 
 62 | 
 63 | @pytest.mark.live
 64 | def test_load_live_data(
 65 |     tmp_db_path: str, tmp_db: Database, stored_comment, stored_self_post, stored_user
 66 | ):
 67 |     result = CliRunner().invoke(cli, ["user", "xavdid", "--db", tmp_db_path])
 68 |     assert not result.exception, result.exception
 69 | 
 70 |     assert {"subreddits", "users", "comments", "comments_fts"} <= set(
 71 |         tmp_db.table_names()
 72 |     )
 73 | 
 74 |     assert {"id": "2t3ad", "name": "patientgamers", "type": "public"} in list(
 75 |         tmp_db["subreddits"].rows
 76 |     )
 77 |     assert list(tmp_db["users"].rows) == [stored_user]
 78 | 
 79 |     comments = list(tmp_db["comments"].rows)
 80 |     assert (
 81 |         len(comments) <= 1000
 82 |     ), "this test will start to fail if/when I've made 1k comments on this account"
 83 |     assert stored_comment in comments
 84 | 
 85 |     posts = list(tmp_db["posts"].rows)
 86 |     assert (
 87 |         len(posts) <= 1000
 88 |     ), "this test will start to fail if/when I've made 1k posts on this account"
 89 |     assert stored_self_post["id"] in {p["id"] for p in posts}
 90 | 
 91 | 
 92 | def test_missing_user_errors(tmp_db_path: str, mock_paged_request: MockPagedFunc):
 93 |     mock_paged_request(
 94 |         resource="comments", json={"error": 404, "message": "no user by that name"}
 95 |     )
 96 |     result = CliRunner().invoke(cli, ["user", "xavdid", "--db", tmp_db_path])
 97 | 
 98 |     assert result.exception
 99 |     assert (
100 |         str(result.exception)
101 |         == "Received API error from Reddit (code 404): no user by that name"
102 |     )
103 | 
104 | 
105 | def test_no_data(tmp_db_path: str, mock_paged_request: MockPagedFunc, empty_response):
106 |     mock_paged_request(resource="comments", json=empty_response)
107 |     mock_paged_request(resource="submitted", json=empty_response)
108 | 
109 |     result = CliRunner().invoke(cli, ["user", "xavdid", "--db", tmp_db_path])
110 | 
111 |     assert result.exit_code == 1
112 |     assert result.stdout  # not sure why it's in "out" not "err"
113 |     assert "Error: no data found for username: xavdid" in result.stdout
114 | 
115 | 
116 | def test_comments_but_no_posts(
117 |     tmp_db_path: str,
118 |     tmp_db: Database,
119 |     mock_paged_request: MockPagedFunc,
120 |     empty_response,
121 |     comment_response,
122 |     stored_comment,
123 |     stored_user,
124 | ):
125 |     mock_paged_request(resource="comments", json=comment_response)
126 |     mock_paged_request(resource="submitted", json=empty_response)
127 | 
128 |     result = CliRunner().invoke(cli, ["user", "xavdid", "--db", tmp_db_path])
129 |     assert not result.exception, result.exception
130 | 
131 |     assert list(tmp_db["users"].rows) == [stored_user]
132 |     assert list(tmp_db["posts"].rows) == []
133 |     assert list(tmp_db["comments"].rows) == [stored_comment]
134 | 
135 | 
136 | def test_posts_but_no_comments(
137 |     tmp_db_path: str,
138 |     tmp_db: Database,
139 |     mock_paged_request: MockPagedFunc,
140 |     empty_response,
141 |     self_post_response,
142 |     stored_self_post,
143 |     stored_user,
144 | ):
145 |     mock_paged_request(resource="comments", json=empty_response)
146 |     mock_paged_request(resource="submitted", json=self_post_response)
147 | 
148 |     result = CliRunner().invoke(cli, ["user", "xavdid", "--db", tmp_db_path])
149 |     assert not result.exception, result.exception
150 | 
151 |     assert list(tmp_db["users"].rows) == [stored_user]
152 |     assert list(tmp_db["comments"].rows) == []
153 |     assert list(tmp_db["posts"].rows) == [stored_self_post]
154 | 
155 | 
156 | @pytest.mark.usefixtures("comments_file", "posts_file")
157 | def test_cold_load_data_from_archive(
158 |     tmp_db_path,
159 |     mock_info_request: MockInfoFunc,
160 |     archive_dir,
161 |     tmp_db: Database,
162 |     stored_user,
163 |     stored_comment,
164 |     stored_self_post,
165 |     comment_info_response,
166 |     post_info_response,
167 |     empty_file_at_path,
168 | ):
169 |     empty_file_at_path("saved_comments.csv")
170 |     empty_file_at_path("saved_posts.csv")
171 | 
172 |     mock_info_request("t1_a,t1_c", json=comment_info_response)
173 |     mock_info_request("t3_d,t3_f", json=post_info_response)
174 | 
175 |     result = CliRunner().invoke(cli, ["archive", str(archive_dir), "--db", tmp_db_path])
176 |     assert not result.exception, print(result.exception)
177 | 
178 |     assert {
179 |         "subreddits",
180 |         "users",
181 |         "comments",
182 |         "comments_fts",
183 |         "posts",
184 |         "posts_fts",
185 |     } <= set(tmp_db.table_names())
186 | 
187 |     assert list(tmp_db["subreddits"].rows) == [
188 |         {"id": "2t3ad", "name": "patientgamers", "type": "public"},
189 |         {"id": "32u6q", "name": "KeybaseProofs", "type": "public"},
190 |     ]
191 |     assert list(tmp_db["users"].rows) == [stored_user]
192 |     assert list(tmp_db["comments"].rows) == [{**stored_comment, "id": i} for i in "ac"]
193 |     assert list(tmp_db["posts"].rows) == [{**stored_self_post, "id": i} for i in "df"]
194 | 
195 | 
196 | @pytest.mark.usefixtures("comments_file")
197 | def test_cold_load_comments_only_from_archive(
198 |     tmp_db_path,
199 |     mock_info_request: MockInfoFunc,
200 |     empty_file_at_path,
201 |     archive_dir,
202 |     tmp_db: Database,
203 |     stored_comment,
204 |     stored_user,
205 |     comment_info_response,
206 | ):
207 |     mock_info_request("t1_a,t1_c", json=comment_info_response)
208 |     empty_file_at_path("posts.csv")
209 |     empty_file_at_path("saved_comments.csv")
210 |     empty_file_at_path("saved_posts.csv")
211 | 
212 |     result = CliRunner().invoke(cli, ["archive", str(archive_dir), "--db", tmp_db_path])
213 |     assert not result.exception
214 | 
215 |     assert {"subreddits", "users", "comments", "comments_fts"} <= set(
216 |         tmp_db.table_names()
217 |     )
218 |     assert list(tmp_db["subreddits"].rows) == [
219 |         {"id": "2t3ad", "name": "patientgamers", "type": "public"}
220 |     ]
221 |     assert list(tmp_db["users"].rows) == [stored_user]
222 |     assert list(tmp_db["comments"].rows) == [{**stored_comment, "id": i} for i in "ac"]
223 |     assert list(tmp_db["posts"].rows) == []
224 | 
225 | 
226 | @pytest.mark.usefixtures("posts_file")
227 | def test_cold_load_posts_only_from_archive(
228 |     tmp_db_path,
229 |     mock_info_request: MockInfoFunc,
230 |     empty_file_at_path,
231 |     archive_dir,
232 |     tmp_db: Database,
233 |     stored_self_post,
234 |     stored_user,
235 |     post_info_response,
236 | ):
237 |     empty_file_at_path("comments.csv")
238 |     empty_file_at_path("saved_comments.csv")
239 |     empty_file_at_path("saved_posts.csv")
240 | 
241 |     mock_info_request("t3_d,t3_f", json=post_info_response)
242 | 
243 |     result = CliRunner().invoke(cli, ["archive", str(archive_dir), "--db", tmp_db_path])
244 |     assert not result.exception
245 | 
246 |     assert {"subreddits", "users", "posts", "posts_fts"} <= set(tmp_db.table_names())
247 |     assert list(tmp_db["subreddits"].rows) == [
248 |         {"id": "32u6q", "name": "KeybaseProofs", "type": "public"}
249 |     ]
250 |     assert list(tmp_db["users"].rows) == [stored_user]
251 |     assert list(tmp_db["comments"].rows) == []
252 |     assert list(tmp_db["posts"].rows) == [{**stored_self_post, "id": i} for i in "df"]
253 | 
254 | 
255 | def test_loads_data_from_both_sources_api_first(
256 |     tmp_db_path,
257 |     mock_info_request: MockInfoFunc,
258 |     mock_paged_request: MockPagedFunc,
259 |     comment_response,
260 |     self_post_response,
261 |     archive_dir,
262 |     tmp_db: Database,
263 |     stored_comment,
264 |     stored_self_post,
265 |     stored_user,
266 |     comment_info_response,
267 |     post_info_response,
268 |     write_archive_file: WriteArchiveFileFunc,
269 |     empty_file_at_path,
270 | ):
271 |     empty_file_at_path("saved_comments.csv")
272 |     empty_file_at_path("saved_posts.csv")
273 | 
274 |     mock_paged_request("comments", json=comment_response)
275 |     mock_paged_request("submitted", json=self_post_response)
276 | 
277 |     api_result = CliRunner().invoke(cli, ["user", "xavdid", "--db", tmp_db_path])
278 |     assert not api_result.exception
279 | 
280 |     assert {
281 |         "subreddits",
282 |         "users",
283 |         "comments",
284 |         "comments_fts",
285 |         "posts",
286 |         "posts_fts",
287 |     } <= set(tmp_db.table_names())
288 |     assert list(tmp_db["subreddits"].rows) == [
289 |         {"id": "2t3ad", "name": "patientgamers", "type": "public"},
290 |         {"id": "32u6q", "name": "KeybaseProofs", "type": "public"},
291 |     ]
292 |     assert list(tmp_db["comments"].rows) == [stored_comment]
293 |     assert list(tmp_db["posts"].rows) == [stored_self_post]
294 | 
295 |     # second pass
296 |     mock_info_request("t1_a,t1_c", json=comment_info_response)
297 |     mock_info_request("t3_d,t3_f", json=post_info_response)
298 | 
299 |     write_archive_file("comments.csv", ["id", "a", "c", stored_comment["id"]])
300 |     write_archive_file("posts.csv", ["id", "d", "f", stored_self_post["id"]])
301 | 
302 |     archive_result = CliRunner().invoke(
303 |         cli, ["archive", str(archive_dir), "--db", tmp_db_path]
304 |     )
305 |     assert not archive_result.exception, print(archive_result.exception)
306 | 
307 |     assert list(tmp_db["users"].rows) == [stored_user]
308 |     assert list(tmp_db["comments"].rows) == [
309 |         stored_comment,
310 |         *({**stored_comment, "id": i} for i in "ac"),
311 |     ]
312 |     assert list(tmp_db["posts"].rows) == [
313 |         stored_self_post,
314 |         *({**stored_self_post, "id": i} for i in "df"),
315 |     ]
316 | 
317 | 
318 | def test_loads_data_from_both_sources_archive_first(
319 |     tmp_db_path,
320 |     mock_info_request: MockInfoFunc,
321 |     mock_paged_request: MockPagedFunc,
322 |     comment_response,
323 |     self_post_response,
324 |     archive_dir,
325 |     tmp_db: Database,
326 |     stored_comment,
327 |     stored_self_post,
328 |     stored_user,
329 |     comment_info_response,
330 |     post_info_response,
331 |     write_archive_file: WriteArchiveFileFunc,
332 |     empty_file_at_path,
333 | ):
334 |     # second pass
335 |     mock_info_request("t1_a,t1_c", json=comment_info_response)
336 |     mock_info_request("t3_d,t3_f", json=post_info_response)
337 | 
338 |     write_archive_file("comments.csv", ["id", "a", "c"])
339 |     write_archive_file("posts.csv", ["id", "d", "f"])
340 | 
341 |     empty_file_at_path("saved_comments.csv")
342 |     empty_file_at_path("saved_posts.csv")
343 | 
344 |     archive_result = CliRunner().invoke(
345 |         cli, ["archive", str(archive_dir), "--db", tmp_db_path]
346 |     )
347 |     assert not archive_result.exception, print(archive_result.exception)
348 | 
349 |     assert {
350 |         "subreddits",
351 |         "users",
352 |         "comments",
353 |         "comments_fts",
354 |         "posts",
355 |         "posts_fts",
356 |     } <= set(tmp_db.table_names())
357 | 
358 |     assert list(tmp_db["users"].rows) == [stored_user]
359 |     assert list(tmp_db["comments"].rows) == [{**stored_comment, "id": i} for i in "ac"]
360 |     assert list(tmp_db["posts"].rows) == [{**stored_self_post, "id": i} for i in "df"]
361 | 
362 |     mock_paged_request("comments", json=comment_response)
363 |     mock_paged_request("submitted", json=self_post_response)
364 | 
365 |     api_result = CliRunner().invoke(cli, ["user", "xavdid", "--db", tmp_db_path])
366 |     assert not api_result.exception, print_tb(api_result.exception.__traceback__)
367 | 
368 |     assert list(tmp_db["subreddits"].rows) == [
369 |         {"id": "2t3ad", "name": "patientgamers", "type": "public"},
370 |         {"id": "32u6q", "name": "KeybaseProofs", "type": "public"},
371 |     ]
372 |     assert list(tmp_db["comments"].rows) == [
373 |         *({**stored_comment, "id": i} for i in "ac"),
374 |         stored_comment,
375 |     ]
376 |     assert list(tmp_db["posts"].rows) == [
377 |         *({**stored_self_post, "id": i} for i in "df"),
378 |         stored_self_post,
379 |     ]
380 | 
381 | 
382 | def test_adds_username_to_removed_posts_in_mixed_archive(
383 |     archive_dir,
384 |     tmp_db_path,
385 |     tmp_db: Database,
386 |     stored_user,
387 |     stored_comment,
388 |     stored_removed_comment,
389 |     stored_self_post,
390 |     stored_removed_post,
391 |     mock_info_request: MockInfoFunc,
392 |     write_archive_file: WriteArchiveFileFunc,
393 |     all_comments_response,
394 |     all_posts_response,
395 |     stored_external_post,
396 |     empty_file_at_path,
397 | ):
398 |     mock_info_request("t1_jj0ti6f,t1_c3sgfl4", json=all_comments_response)
399 |     mock_info_request("t3_uypaav,t3_1f55rr,t3_qwer", json=all_posts_response)
400 | 
401 |     write_archive_file("comments.csv", ["id", "jj0ti6f", "c3sgfl4"])
402 |     write_archive_file("posts.csv", ["id", "uypaav", "1f55rr", "qwer"])
403 |     empty_file_at_path("saved_comments.csv")
404 |     empty_file_at_path("saved_posts.csv")
405 | 
406 |     api_result = CliRunner().invoke(
407 |         cli, ["archive", str(archive_dir), "--db", tmp_db_path]
408 |     )
409 |     assert not api_result.exception, print(api_result.exception)
410 | 
411 |     assert list(tmp_db["subreddits"].rows) == [
412 |         {"id": "2t3ad", "name": "patientgamers", "type": "public"},
413 |         {"id": "2qm4e", "name": "askscience", "type": "public"},
414 |         {"id": "32u6q", "name": "KeybaseProofs", "type": "public"},
415 |         {"id": "2qh1e", "name": "videos", "type": "public"},
416 |     ]
417 |     assert list(tmp_db["users"].rows) == [stored_user]
418 |     assert list(tmp_db["comments"].rows) == [stored_comment, stored_removed_comment]
419 |     assert list(tmp_db["posts"].rows) == [
420 |         stored_self_post,
421 |         stored_removed_post,
422 |         stored_external_post,
423 |     ]
424 | 
425 | 
426 | @pytest.mark.usefixtures("stats_file")
427 | def test_load_username_from_file(
428 |     tmp_db: Database,
429 |     tmp_db_path,
430 |     user_response,
431 |     archive_dir,
432 |     removed_post_response,
433 |     stored_removed_comment,
434 |     stored_removed_post,
435 |     stored_user,
436 |     mock_info_request: MockInfoFunc,
437 |     mock_user_request: MockUserFunc,
438 |     write_archive_file: WriteArchiveFileFunc,
439 |     removed_comment_response,
440 |     empty_file_at_path,
441 | ):
442 |     mock_info_request("t1_c3sgfl4", json=removed_comment_response)
443 |     mock_info_request("t3_1f55rr", json=removed_post_response)
444 | 
445 |     mock_user_request("xavdid", json=user_response)
446 | 
447 |     write_archive_file("comments.csv", ["id", "c3sgfl4"])
448 |     write_archive_file("posts.csv", ["id", "1f55rr"])
449 |     empty_file_at_path("saved_comments.csv")
450 |     empty_file_at_path("saved_posts.csv")
451 | 
452 |     api_result = CliRunner().invoke(
453 |         cli, ["archive", str(archive_dir), "--db", tmp_db_path]
454 |     )
455 |     assert not api_result.exception, print(api_result.exception)
456 | 
457 |     assert {
458 |         "subreddits",
459 |         "users",
460 |         "comments",
461 |         "comments_fts",
462 |         "posts",
463 |         "posts_fts",
464 |     } <= set(tmp_db.table_names())
465 | 
466 |     assert list(tmp_db["subreddits"].rows) == [
467 |         {"id": "2qm4e", "name": "askscience", "type": "public"},
468 |         {"id": "2qh1e", "name": "videos", "type": "public"},
469 |     ]
470 |     assert list(tmp_db["users"].rows) == [stored_user]
471 |     assert list(tmp_db["comments"].rows) == [stored_removed_comment]
472 |     assert list(tmp_db["posts"].rows) == [stored_removed_post]
473 | 
474 | 
475 | def test_missing_username_entirely(
476 |     tmp_db: Database,
477 |     tmp_db_path,
478 |     archive_dir,
479 |     removed_post_response,
480 |     empty_file_at_path,
481 |     mock_info_request: MockInfoFunc,
482 |     write_archive_file: WriteArchiveFileFunc,
483 |     removed_comment_response,
484 | ):
485 |     mock_info_request("t1_c3sgfl4", json=removed_comment_response)
486 |     mock_info_request("t3_1f55rr", json=removed_post_response)
487 | 
488 |     empty_file_at_path("statistics.csv")
489 | 
490 |     write_archive_file("comments.csv", ["id", "c3sgfl4"])
491 |     write_archive_file("posts.csv", ["id", "1f55rr"])
492 |     empty_file_at_path("saved_comments.csv")
493 |     empty_file_at_path("saved_posts.csv")
494 | 
495 |     api_result = CliRunner().invoke(
496 |         cli, ["archive", str(archive_dir), "--db", tmp_db_path]
497 |     )
498 |     assert not api_result.exception, print(api_result.exception)
499 | 
500 |     assert "Unable to guess username" in api_result.output
501 |     assert "some data will not be saved." in api_result.output
502 |     assert "ignored for now" in api_result.output
503 | 
504 |     assert tmp_db.table_names() == ["subreddits"]
505 | 
506 |     assert list(tmp_db["subreddits"].rows) == [
507 |         {"id": "2qm4e", "name": "askscience", "type": "public"},
508 |         {"id": "2qh1e", "name": "videos", "type": "public"},
509 |     ]
510 |     assert list(tmp_db["users"].rows) == []
511 |     assert list(tmp_db["comments"].rows) == []
512 |     assert list(tmp_db["posts"].rows) == []
513 | 
514 | 
515 | def test_load_saved_data(
516 |     tmp_db: Database,
517 |     tmp_db_path,
518 |     archive_dir,
519 |     empty_file_at_path,
520 |     mock_info_request: MockInfoFunc,
521 |     write_archive_file: WriteArchiveFileFunc,
522 |     all_comments_response,
523 |     stored_user,
524 |     deleted_user,
525 |     stored_removed_comment_placeholder_user,
526 |     stored_comment,
527 |     all_posts_response,
528 |     stored_self_post,
529 |     stored_removed_post_placeholder_user,
530 |     stored_external_post,
531 | ):
532 |     empty_file_at_path("comments.csv")
533 |     empty_file_at_path("posts.csv")
534 |     empty_file_at_path("statistics.csv")
535 |     write_archive_file("saved_comments.csv", ["id", "jj0ti6f", "c3sgfl4"])
536 |     write_archive_file("saved_posts.csv", ["id", "uypaav", "1f55rr", "qwer"])
537 | 
538 |     mock_info_request("t1_jj0ti6f,t1_c3sgfl4", json=all_comments_response)
539 |     mock_info_request("t3_uypaav,t3_1f55rr,t3_qwer", json=all_posts_response)
540 | 
541 |     result = CliRunner().invoke(
542 |         cli,
543 |         ["archive", str(archive_dir), "--db", tmp_db_path],
544 |     )
545 |     assert not result.exception, result.exception
546 |     assert result.stdout  # not sure why it's in "out" not "err"
547 |     assert "saved 2 new comments" in result.stdout
548 | 
549 |     assert {
550 |         "subreddits",
551 |         "users",
552 |         "saved_comments",
553 |         "saved_comments_fts",
554 |         "saved_posts",
555 |         "saved_posts_fts",
556 |     }.issubset(tmp_db.table_names())
557 |     assert "comments" not in tmp_db.table_names()
558 |     assert "posts" not in tmp_db.table_names()
559 | 
560 |     assert list(tmp_db["subreddits"].rows) == [
561 |         {"id": "2t3ad", "name": "patientgamers", "type": "public"},
562 |         {"id": "2qm4e", "name": "askscience", "type": "public"},
563 |         {"id": "32u6q", "name": "KeybaseProofs", "type": "public"},
564 |         {"id": "2qh1e", "name": "videos", "type": "public"},
565 |     ]
566 |     # for some reason, .rows was returning rows inconsistently, so I ordered it
567 |     assert list(tmp_db["users"].rows_where(order_by="id")) == [
568 |         deleted_user,
569 |         stored_user,
570 |     ]
571 |     assert list(tmp_db["saved_comments"].rows) == [
572 |         stored_comment,
573 |         stored_removed_comment_placeholder_user,
574 |     ]
575 |     assert list(tmp_db["saved_posts"].rows) == [
576 |         stored_self_post,
577 |         stored_removed_post_placeholder_user,
578 |         stored_external_post,
579 |     ]
580 | 
581 | 
582 | def test_load_data_skip_saved(
583 |     tmp_db: Database,
584 |     tmp_db_path,
585 |     archive_dir,
586 |     empty_file_at_path,
587 |     write_archive_file: WriteArchiveFileFunc,
588 | ):
589 |     empty_file_at_path("comments.csv")
590 |     empty_file_at_path("posts.csv")
591 |     empty_file_at_path("statistics.csv")
592 |     write_archive_file("saved_comments.csv", ["id", "jj0ti6f", "c3sgfl4"])
593 |     write_archive_file("saved_posts.csv", ["id", "uypaav", "1f55rr", "qwer"])
594 | 
595 |     result = CliRunner().invoke(
596 |         cli,
597 |         ["archive", str(archive_dir), "--db", tmp_db_path, "--skip-saved"],
598 |     )
599 |     assert not result.exception, result.exception
600 |     assert result.stdout  # not sure why it's in "out" not "err"
601 |     assert "saved 0 new comments" in result.stdout
602 | 
603 |     table_names = tmp_db.table_names()
604 |     for s in {
605 |         "subreddits",
606 |         "users",
607 |         "saved_comments",
608 |         "saved_comments_fts",
609 |         "saved_posts",
610 |         "saved_posts_fts",
611 |         "comments",
612 |         "posts",
613 |     }:
614 |         assert s not in table_names
615 | 
616 |     assert list(tmp_db["subreddits"].rows) == []
617 |     assert list(tmp_db["users"].rows) == []
618 |     assert list(tmp_db["saved_comments"].rows) == []
619 |     assert list(tmp_db["saved_posts"].rows) == []
620 | 


--------------------------------------------------------------------------------
/tests/test_csv_helpers.py:
--------------------------------------------------------------------------------
  1 | from pathlib import Path
  2 | 
  3 | import pytest
  4 | from sqlite_utils import Database
  5 | 
  6 | from reddit_user_to_sqlite.csv_helpers import (
  7 |     build_table_name,
  8 |     get_username_from_archive,
  9 |     load_unsaved_ids_from_file,
 10 |     validate_and_build_path,
 11 | )
 12 | 
 13 | 
 14 | def test_validate_and_build_path(archive_dir, stats_file):
 15 |     assert validate_and_build_path(archive_dir, "statistics") == stats_file
 16 | 
 17 | 
 18 | def test_validate_and_build_fails(archive_dir: Path):
 19 |     with pytest.raises(ValueError) as err:
 20 |         validate_and_build_path(archive_dir, "posts")
 21 | 
 22 |     err_msg = str(err.value)
 23 | 
 24 |     assert str(archive_dir) in err_msg
 25 |     assert 'posts.csv" not found' in err_msg
 26 | 
 27 | 
 28 | @pytest.mark.usefixtures("comments_file")
 29 | def test_load_comment_ids_from_file_empty_db(tmp_db: Database, archive_dir):
 30 |     assert load_unsaved_ids_from_file(tmp_db, archive_dir, "comments") == [
 31 |         "t1_a",
 32 |         "t1_c",
 33 |     ]
 34 | 
 35 | 
 36 | @pytest.mark.usefixtures("comments_file")
 37 | def test_load_comment_ids_from_file_non_db(tmp_db: Database, archive_dir):
 38 |     tmp_db["comments"].insert({"id": "a"})  # type: ignore
 39 | 
 40 |     assert load_unsaved_ids_from_file(tmp_db, archive_dir, "comments") == [
 41 |         "t1_c",
 42 |     ]
 43 | 
 44 | 
 45 | @pytest.mark.usefixtures("saved_comments_file")
 46 | def test_load_saved_comment_ids_from_file_empty_db(tmp_db: Database, archive_dir):
 47 |     assert load_unsaved_ids_from_file(
 48 |         tmp_db, archive_dir, "comments", prefix="saved_"
 49 |     ) == [
 50 |         "t1_g",
 51 |         "t1_h",
 52 |     ]
 53 | 
 54 | 
 55 | @pytest.mark.usefixtures("saved_comments_file")
 56 | def test_load_saved_comment_ids_from_file_non_empty_db(tmp_db: Database, archive_dir):
 57 |     tmp_db["saved_comments"].insert({"id": "h"})  # type: ignore
 58 | 
 59 |     assert load_unsaved_ids_from_file(
 60 |         tmp_db, archive_dir, "comments", prefix="saved_"
 61 |     ) == ["t1_g"]
 62 | 
 63 | 
 64 | def test_load_comment_ids_missing_files(tmp_db: Database, archive_dir):
 65 |     with pytest.raises(ValueError) as err:
 66 |         load_unsaved_ids_from_file(tmp_db, archive_dir, "comments")
 67 | 
 68 |     err_msg = str(err)
 69 |     assert 'comments.csv" not found' in err_msg
 70 |     # LOAD BEARING TEST, DO NOT REMOVE
 71 |     assert "unzipped Reddit GDPR archive" in err_msg
 72 | 
 73 | 
 74 | @pytest.mark.usefixtures("posts_file")
 75 | def test_load_post_ids_from_file_empty_db(tmp_db: Database, archive_dir):
 76 |     assert load_unsaved_ids_from_file(tmp_db, archive_dir, "posts") == [
 77 |         "t3_d",
 78 |         "t3_f",
 79 |     ]
 80 | 
 81 | 
 82 | @pytest.mark.usefixtures("posts_file")
 83 | def test_load_post_ids_from_file_some_db(tmp_db: Database, archive_dir):
 84 |     tmp_db["posts"].insert({"id": "d"})  # type: ignore
 85 | 
 86 |     assert load_unsaved_ids_from_file(tmp_db, archive_dir, "posts") == [
 87 |         "t3_f",
 88 |     ]
 89 | 
 90 | 
 91 | @pytest.mark.usefixtures("saved_posts_file")
 92 | def test_load_saved_post_ids_from_file_empty_db(tmp_db: Database, archive_dir):
 93 |     assert load_unsaved_ids_from_file(
 94 |         tmp_db, archive_dir, "posts", prefix="saved_"
 95 |     ) == [
 96 |         "t3_j",
 97 |         "t3_k",
 98 |     ]
 99 | 
100 | 
101 | @pytest.mark.usefixtures("saved_posts_file")
102 | def test_load_saved_post_ids_from_file_non_empty_db(tmp_db: Database, archive_dir):
103 |     tmp_db["saved_posts"].insert({"id": "j"})  # type: ignore
104 | 
105 |     assert load_unsaved_ids_from_file(
106 |         tmp_db, archive_dir, "posts", prefix="saved_"
107 |     ) == ["t3_k"]
108 | 
109 | 
110 | def test_load_post_ids_missing_files(tmp_db: Database, archive_dir):
111 |     with pytest.raises(ValueError) as err:
112 |         load_unsaved_ids_from_file(tmp_db, archive_dir, "posts")
113 | 
114 |     assert 'posts.csv" not found' in str(err.value)
115 | 
116 | 
117 | @pytest.mark.usefixtures("stats_file")
118 | def test_get_username_from_archive(archive_dir):
119 |     assert get_username_from_archive(archive_dir) == "xavdid"
120 | 
121 | 
122 | def test_get_username_from_archive_no_name(archive_dir: Path):
123 |     (archive_dir / "statistics.csv").touch()
124 |     assert get_username_from_archive(archive_dir) is None
125 | 
126 | 
127 | def test_get_username_from_archive_missing_file(archive_dir):
128 |     with pytest.raises(ValueError) as err:
129 |         get_username_from_archive(archive_dir)
130 | 
131 |     assert 'statistics.csv" not found' in str(err.value)
132 | 
133 | 
134 | @pytest.mark.parametrize(
135 |     ["table_name", "table_prefix", "expected"],
136 |     [
137 |         ("comments", None, "comments"),
138 |         ("posts", None, "posts"),
139 |         ("comments", "saved_", "saved_comments"),
140 |         ("posts", "saved_", "saved_posts"),
141 |     ],
142 | )
143 | def test_build_table_name(table_name, table_prefix, expected):
144 |     assert build_table_name(table_name, table_prefix) == expected
145 | 


--------------------------------------------------------------------------------
/tests/test_helpers.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from reddit_user_to_sqlite.helpers import clean_username, find_user_details_from_items
 4 | 
 5 | 
 6 | @pytest.mark.parametrize(
 7 |     "username, expected",
 8 |     [
 9 |         ("/u/xavdid", "xavdid"),
10 |         ("u/xavdid", "xavdid"),
11 |         ("xavdid", "xavdid"),
12 |         ("unbelievable", "unbelievable"),
13 |     ],
14 | )
15 | def test_clean_username(username, expected):
16 |     assert clean_username(username) == expected
17 | 
18 | 
19 | # to verify that fixtures that modify previous fixture results don't mutate them
20 | def test_fixture_modifications(self_post, removed_post):
21 |     assert self_post != removed_post
22 | 
23 | 
24 | def test_unique_fixture_ids(self_post, removed_post, external_post):
25 |     # all post types should have unique ids
26 |     assert len({p["id"] for p in [self_post, removed_post, external_post]}) == 3
27 | 
28 | 
29 | def test_find_user_details_from_items():
30 |     assert find_user_details_from_items(
31 |         [
32 |             {"asdf": 1},
33 |             {"author_fullname": "t2_abc123", "author": "xavdid"},
34 |         ]
35 |     ) == ("xavdid", "t2_abc123")
36 | 
37 | 
38 | def test_fail_to_find_user_details_from_items():
39 |     assert find_user_details_from_items([{"asdf": 1}, {"author": "xavdid"}]) is None
40 | 


--------------------------------------------------------------------------------
/tests/test_reddit_api.py:
--------------------------------------------------------------------------------
  1 | from unittest.mock import MagicMock, patch
  2 | 
  3 | import pytest
  4 | 
  5 | from reddit_user_to_sqlite.reddit_api import (
  6 |     PagedResponse,
  7 |     RedditRateLimitException,
  8 |     _unwrap_response_and_raise,
  9 |     add_missing_user_fragment,
 10 |     get_user_id,
 11 |     load_comments_for_user,
 12 |     load_info,
 13 |     load_posts_for_user,
 14 | )
 15 | from tests.conftest import MockInfoFunc, MockPagedFunc, MockUserFunc
 16 | 
 17 | 
 18 | def test_load_comments(mock_paged_request: MockPagedFunc, comment_response, comment):
 19 |     response = mock_paged_request(resource="comments", json=comment_response)
 20 | 
 21 |     assert load_comments_for_user("xavdid") == [comment]
 22 | 
 23 |     assert response.call_count == 1
 24 | 
 25 | 
 26 | @patch("reddit_user_to_sqlite.reddit_api.PAGE_SIZE", new=1)
 27 | def test_load_comments_rate_limited(
 28 |     mock_paged_request: MockPagedFunc, comment_response, comment, rate_limit_headers
 29 | ):
 30 |     good_response = mock_paged_request(
 31 |         resource="comments", params={"limit": 1}, json=comment_response
 32 |     )
 33 |     bad_response = mock_paged_request(
 34 |         resource="comments",
 35 |         params={"limit": 1},
 36 |         json={"error": 429},
 37 |         headers=rate_limit_headers,
 38 |     )
 39 | 
 40 |     # despite getting an error, we still got the first comment
 41 |     assert load_comments_for_user("xavdid") == [comment]
 42 | 
 43 |     assert good_response.call_count == 1
 44 |     assert bad_response.call_count == 1
 45 | 
 46 | 
 47 | def test_load_posts(mock_paged_request: MockPagedFunc, self_post_response, self_post):
 48 |     response = mock_paged_request(resource="submitted", json=self_post_response)
 49 | 
 50 |     assert load_posts_for_user("xavdid") == [self_post]
 51 |     assert response.call_count == 1
 52 | 
 53 | 
 54 | @patch("reddit_user_to_sqlite.reddit_api.PAGE_SIZE", new=1)
 55 | def test_loads_10_pages(mock_paged_request: MockPagedFunc, comment_response, comment):
 56 |     response = mock_paged_request(
 57 |         resource="comments", params={"limit": 1}, json=comment_response
 58 |     )
 59 | 
 60 |     assert load_comments_for_user("xavdid") == [comment] * 10
 61 | 
 62 |     assert response.call_count == 10
 63 | 
 64 | 
 65 | @patch("reddit_user_to_sqlite.reddit_api.PAGE_SIZE", new=1)
 66 | def test_loads_multiple_pages(
 67 |     mock_paged_request: MockPagedFunc, comment_response: PagedResponse, comment
 68 | ):
 69 |     comment_response["data"]["after"] = "abc"
 70 |     first_request = mock_paged_request(
 71 |         resource="comments", params={"limit": 1}, json=comment_response
 72 |     )
 73 | 
 74 |     comment_response["data"]["after"] = "def"
 75 |     second_request = mock_paged_request(
 76 |         resource="comments", params={"limit": 1, "after": "abc"}, json=comment_response
 77 |     )
 78 | 
 79 |     comment_response["data"]["children"] = []
 80 |     third_request = mock_paged_request(
 81 |         resource="comments", params={"limit": 1, "after": "def"}, json=comment_response
 82 |     )
 83 | 
 84 |     comments = load_comments_for_user("xavdid")
 85 | 
 86 |     assert first_request.call_count == 1
 87 |     assert second_request.call_count == 1
 88 |     assert third_request.call_count == 1
 89 | 
 90 |     assert comments == [comment, comment]
 91 | 
 92 | 
 93 | def test_error_response(mock_paged_request: MockPagedFunc):
 94 |     mock_paged_request(
 95 |         resource="comments", json={"error": 500, "message": "you broke reddit"}
 96 |     )
 97 | 
 98 |     with pytest.raises(ValueError) as err:
 99 |         load_comments_for_user("xavdid")
100 | 
101 |     assert (
102 |         str(err.value) == "Received API error from Reddit (code 500): you broke reddit"
103 |     )
104 | 
105 | 
106 | def test_load_info(mock_info_request: MockInfoFunc, comment_response, comment):
107 |     mock_info_request("a,b,c", json=comment_response)
108 | 
109 |     assert load_info(["a", "b", "c"]) == [comment]
110 | 
111 | 
112 | @patch("reddit_user_to_sqlite.reddit_api.PAGE_SIZE", new=2)
113 | def test_load_info_pages(mock_info_request: MockInfoFunc, comment_response, comment):
114 |     mock_info_request("a,b", json=comment_response, limit=2)
115 |     mock_info_request("c,d", json=comment_response, limit=2)
116 |     mock_info_request("e", json=comment_response, limit=2)
117 | 
118 |     assert load_info(["a", "b", "c", "d", "e"]) == [comment] * 3
119 | 
120 | 
121 | @patch("reddit_user_to_sqlite.reddit_api.PAGE_SIZE", new=2)
122 | def test_load_info_pages_with_rate_limit(
123 |     mock_info_request: MockInfoFunc, comment_response, comment, rate_limit_headers
124 | ):
125 |     mock_info_request("a,b", json=comment_response, limit=2)
126 |     mock_info_request("c,d", json=comment_response, limit=2)
127 |     mock_info_request("e", json={"error": 429}, limit=2, headers=rate_limit_headers)
128 | 
129 |     # call for e fails, but we still got the first ones
130 |     assert load_info(["a", "b", "c", "d", "e"]) == [comment] * 2
131 | 
132 | 
133 | def test_load_info_empty(mock_info_request: MockInfoFunc, empty_response):
134 |     mock_info_request("a,b,c,d,e,f,g,h", json=empty_response)
135 | 
136 |     assert load_info(["a", "b", "c", "d", "e", "f", "g", "h"]) == []
137 | 
138 | 
139 | def test_unwrap_and_raise_passes_good_responses_through():
140 |     response = {"neat": True}
141 |     assert _unwrap_response_and_raise(MagicMock(json=lambda: response)) == response
142 | 
143 | 
144 | def test_unwrap_and_raise_raises_unknown_errors():
145 |     with pytest.raises(ValueError) as err:
146 |         _unwrap_response_and_raise(
147 |             MagicMock(json=lambda: {"error": 123, "message": "cool"})
148 |         )
149 |     assert str(err.value) == "Received API error from Reddit (code 123): cool"
150 | 
151 | 
152 | def test_unwrap_and_raise_raises_rate_limit_errors(rate_limit_headers):
153 |     with pytest.raises(RedditRateLimitException) as err:
154 |         _unwrap_response_and_raise(
155 |             MagicMock(
156 |                 json=lambda: {"error": 429, "message": "cool"},
157 |                 headers=rate_limit_headers,
158 |             )
159 |         )
160 | 
161 |     e = err.value
162 | 
163 |     assert e.used == 4
164 |     assert e.remaining == 6
165 |     assert e.window_total == 10
166 |     assert e.reset_after_seconds == 20
167 |     assert e.stats == "Used 4/10 requests (resets in 20 seconds)"
168 | 
169 | 
170 | def test_get_user_id(mock_user_request: MockUserFunc, user_response):
171 |     mock_user_request("xavdid", json=user_response)
172 | 
173 |     assert get_user_id("xavdid") == "np8mb41h"
174 | 
175 | 
176 | def test_get_user_id_unknown_user(mock_user_request: MockUserFunc):
177 |     mock_user_request("xavdid", json={"message": "Not Found", "error": 404})
178 |     with pytest.raises(ValueError):
179 |         get_user_id("xavdid")
180 | 
181 | 
182 | def test_add_missing_user_fragment():
183 |     items = [{"a": 1}, {"a": 2}, {"a": 3}]
184 |     assert add_missing_user_fragment(items, "xavdid", "t2_abc123") == [  # type: ignore
185 |         {"a": 1, "author": "xavdid", "author_fullname": "t2_abc123"},
186 |         {"a": 2, "author": "xavdid", "author_fullname": "t2_abc123"},
187 |         {"a": 3, "author": "xavdid", "author_fullname": "t2_abc123"},
188 |     ]
189 | 
190 | 
191 | def test_add_missing_user_fragment_no_overwrite():
192 |     items = [{"a": 1}, {"author": "david", "author_fullname": "t2_def456"}]
193 | 
194 |     assert add_missing_user_fragment(items, "xavdid", "t2_abc123") == [  # type: ignore
195 |         {"a": 1, "author": "xavdid", "author_fullname": "t2_abc123"},
196 |         {"author": "david", "author_fullname": "t2_def456"},
197 |     ]
198 | 


--------------------------------------------------------------------------------
/tests/test_sqlite_helpers.py:
--------------------------------------------------------------------------------
  1 | from typing import Callable, Optional
  2 | 
  3 | import pytest
  4 | from pytest import FixtureRequest
  5 | from sqlite_utils import Database
  6 | from sqlite_utils.db import ForeignKey, NotFoundError
  7 | 
  8 | from reddit_user_to_sqlite.reddit_api import (
  9 |     Comment,
 10 |     Post,
 11 |     SubredditFragment,
 12 |     UserFragment,
 13 | )
 14 | from reddit_user_to_sqlite.sqlite_helpers import (
 15 |     CommentRow,
 16 |     comment_to_comment_row,
 17 |     insert_users,
 18 |     item_to_subreddit_row,
 19 |     item_to_user_row,
 20 |     post_to_post_row,
 21 |     upsert_comments,
 22 |     upsert_posts,
 23 |     upsert_subreddits,
 24 | )
 25 | 
 26 | 
 27 | @pytest.fixture
 28 | def make_sr():
 29 |     def make_subreddit(name: str, id_=None, type_="public") -> SubredditFragment:
 30 |         # returns the relevant sub-portions of
 31 |         return {
 32 |             "subreddit": name,
 33 |             "subreddit_id": f"t5_{id_ or name}",
 34 |             "subreddit_type": type_,
 35 |         }
 36 | 
 37 |     return make_subreddit
 38 | 
 39 | 
 40 | MakeUserFunc = Callable[[str], UserFragment]
 41 | 
 42 | 
 43 | @pytest.fixture
 44 | def make_user() -> MakeUserFunc:
 45 |     def _make_user(name: str, id_: Optional[str] = None) -> UserFragment:
 46 |         return {"author_fullname": f"t2_{id_ or name[::-1]}", "author": name}
 47 | 
 48 |     return _make_user
 49 | 
 50 | 
 51 | def test_insert_subreddits(tmp_db: Database, make_sr):
 52 |     upsert_subreddits(
 53 |         tmp_db,
 54 |         [
 55 |             make_sr("Games"),
 56 |             make_sr("JRPG", type_="private"),
 57 |         ],
 58 |     )
 59 | 
 60 |     assert "subreddits" in tmp_db.table_names()
 61 |     assert list(tmp_db["subreddits"].rows) == [
 62 |         {"id": "Games", "name": "Games", "type": "public"},
 63 |         {"id": "JRPG", "name": "JRPG", "type": "private"},
 64 |     ]
 65 | 
 66 | 
 67 | @pytest.mark.skip(
 68 |     "skipped because of a sqlite-utils bug; subreddits to get upserted right now"
 69 | )
 70 | def test_repeat_subs_ignored(tmp_db: Database, make_sr):
 71 |     upsert_subreddits(
 72 |         tmp_db,
 73 |         [
 74 |             make_sr("Games"),
 75 |             make_sr("JRPG", type_="private"),
 76 |         ],
 77 |     )
 78 | 
 79 |     # updates are ignored
 80 |     upsert_subreddits(
 81 |         tmp_db,
 82 |         [
 83 |             make_sr("ames", id_="Games"),
 84 |             make_sr("RPG", id_="JRPG"),
 85 |             make_sr("Apple"),
 86 |         ],
 87 |     )
 88 | 
 89 |     assert "subreddits" in tmp_db.table_names()
 90 |     assert list(tmp_db["subreddits"].rows) == [
 91 |         {"id": "Games", "name": "Games", "type": "public"},
 92 |         {"id": "JRPG", "name": "JRPG", "type": "private"},
 93 |         {"id": "Apple", "name": "Apple", "type": "public"},
 94 |     ]
 95 | 
 96 | 
 97 | def test_insert_user(tmp_db: Database, make_user: MakeUserFunc):
 98 |     insert_users(tmp_db, [make_user("xavdid")])
 99 | 
100 |     assert "users" in tmp_db.table_names()
101 |     assert list(tmp_db["users"].rows) == [
102 |         {"id": "didvax", "username": "xavdid"},
103 |     ]
104 | 
105 | 
106 | def test_insert_user_missing(tmp_db: Database, make_user: MakeUserFunc):
107 |     user = make_user("xavdid")
108 |     user.pop("author_fullname")
109 |     insert_users(tmp_db, [user])
110 | 
111 |     assert "users" not in tmp_db.table_names()
112 | 
113 | 
114 | def test_insert_comments(
115 |     tmp_db: Database, comment: Comment, stored_comment: CommentRow
116 | ):
117 |     upsert_subreddits(tmp_db, [comment])
118 |     insert_users(tmp_db, [comment])
119 | 
120 |     comment_without_user = comment.copy()
121 |     comment.pop("author_fullname")
122 | 
123 |     upsert_comments(tmp_db, [comment, comment_without_user])
124 | 
125 |     assert {"subreddits", "users", "comments"}.issubset(tmp_db.table_names())
126 | 
127 |     assert list(tmp_db["comments"].rows) == [stored_comment]
128 | 
129 |     assert tmp_db["comments"].foreign_keys == [  # type: ignore
130 |         ForeignKey("comments", "subreddit", "subreddits", "id"),
131 |         ForeignKey("comments", "user", "users", "id"),
132 |     ]
133 | 
134 |     failure_reasons = []
135 |     for k in ["user", "subreddit"]:
136 |         try:
137 |             tmp_db[f"{k}s"].get(stored_comment[k])  # type: ignore
138 |         except NotFoundError:
139 |             failure_reasons.append(f"broken foreign key relationship for comment.{k}")
140 | 
141 |     if failure_reasons:
142 |         pytest.fail(", ".join(failure_reasons))
143 | 
144 | 
145 | def test_update_comments(tmp_db: Database, comment: Comment, stored_comment):
146 |     upsert_subreddits(tmp_db, [comment])
147 |     insert_users(tmp_db, [comment])
148 |     upsert_comments(tmp_db, [comment])
149 | 
150 |     assert list(tmp_db["comments"].rows) == [stored_comment]
151 | 
152 |     assert comment["score"] != 10
153 |     comment["score"] = 10
154 |     upsert_comments(tmp_db, [comment])
155 | 
156 |     updated_comment = tmp_db["comments"].get(comment["id"])  # type: ignore
157 |     assert updated_comment["score"] == 10
158 | 
159 | 
160 | # https://engineeringfordatascience.com/posts/pytest_fixtures_with_parameterize/
161 | @pytest.mark.parametrize(
162 |     ["post_type", "stored_post_type"],
163 |     [
164 |         ("self_post", "stored_self_post"),
165 |         # ("removed_post", "stored_removed_post"),
166 |         ("external_post", "stored_external_post"),
167 |     ],
168 | )
169 | def test_insert_posts(
170 |     tmp_db: Database, request: FixtureRequest, post_type: str, stored_post_type: str
171 | ):
172 |     post: Post = request.getfixturevalue(post_type)
173 |     stored_post = request.getfixturevalue(stored_post_type)
174 | 
175 |     no_user_post = post.copy()
176 |     no_user_post.pop("author_fullname")
177 | 
178 |     upsert_subreddits(tmp_db, [post])
179 |     insert_users(tmp_db, [post])
180 | 
181 |     upsert_posts(tmp_db, [post, no_user_post])
182 | 
183 |     assert {"subreddits", "users", "posts"}.issubset(tmp_db.table_names())
184 | 
185 |     assert list(tmp_db["posts"].rows) == [stored_post]
186 | 
187 |     assert tmp_db["posts"].foreign_keys == [  # type: ignore
188 |         ForeignKey("posts", "subreddit", "subreddits", "id"),
189 |         ForeignKey("posts", "user", "users", "id"),
190 |     ]
191 | 
192 |     failure_reasons = []
193 |     for k in ["user", "subreddit"]:
194 |         try:
195 |             tmp_db[f"{k}s"].get(stored_post[k])  # type: ignore
196 |         except NotFoundError:
197 |             failure_reasons.append(f"broken foreign key relationship for comment.{k}")
198 | 
199 |     if failure_reasons:
200 |         pytest.fail(", ".join(failure_reasons))
201 | 
202 | 
203 | @pytest.mark.parametrize(
204 |     ["item", "expected"],
205 |     [
206 |         (
207 |             {"author_fullname": "t1_abc123", "author": "xavdid"},
208 |             {"id": "abc123", "username": "xavdid"},
209 |         ),
210 |         ({"author": "xavdid"}, None),
211 |     ],
212 | )
213 | def test_item_to_user_row(item, expected):
214 |     assert item_to_user_row(item) == expected
215 | 
216 | 
217 | @pytest.mark.parametrize(
218 |     ["item", "expected"],
219 |     [
220 |         (
221 |             {
222 |                 "subreddit_id": "t3_abc123",
223 |                 "subreddit": "Games",
224 |                 "subreddit_type": "public",
225 |             },
226 |             {"id": "abc123", "name": "Games", "type": "public"},
227 |         ),
228 |         # ({}, None),
229 |     ],
230 | )
231 | def test_item_to_subreddit_row(item, expected):
232 |     assert item_to_subreddit_row(item) == expected
233 | 
234 | 
235 | def test_comment_to_comment_row(comment, stored_comment):
236 |     assert comment_to_comment_row(comment) == stored_comment
237 | 
238 | 
239 | def test_comment_to_comment_row_missing_user(comment):
240 |     comment.pop("author_fullname")
241 |     assert comment_to_comment_row(comment) is None
242 | 
243 | 
244 | def test_post_to_post_row(self_post, stored_self_post):
245 |     assert post_to_post_row(self_post) == stored_self_post
246 | 
247 | 
248 | def test_post_to_post_row_missing_user(self_post):
249 |     self_post.pop("author_fullname")
250 |     assert post_to_post_row(self_post) is None
251 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | [tox]
 2 | env_list =
 3 |     py3{9,10,11}
 4 | minversion = 4.6.0
 5 | isolated_build = True
 6 | 
 7 | [testenv]
 8 | description = run the tests with pytest
 9 | package = wheel
10 | wheel_build_env = .pkg
11 | 
12 | commands =
13 |     pip install .[test]
14 |     python -m pytest {tty:--color=yes} {posargs}
15 | 


--------------------------------------------------------------------------------