├── .gitignore ├── .python-version ├── CHANGELOG.md ├── LICENSE ├── README.md ├── justfile ├── pyproject.toml ├── reddit_user_to_sqlite ├── __init__.py ├── cli.py ├── csv_helpers.py ├── helpers.py ├── reddit_api.py └── sqlite_helpers.py ├── tests ├── __init__.py ├── conftest.py ├── test_cli.py ├── test_csv_helpers.py ├── test_helpers.py ├── test_reddit_api.py └── test_sqlite_helpers.py └── tox.ini /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.toptal.com/developers/gitignore/api/python 2 | # Edit at https://www.toptal.com/developers/gitignore?templates=python 3 | 4 | ### Python ### 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | cover/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | .pybuilder/ 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | # For a library or package, you might want to ignore these files since the code is 91 | # intended to run in multiple environments; otherwise, check them in: 92 | # .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # poetry 102 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 103 | # This is especially recommended for binary packages to ensure reproducibility, and is more 104 | # commonly ignored for libraries. 105 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 106 | #poetry.lock 107 | 108 | # pdm 109 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 110 | #pdm.lock 111 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 112 | # in version control. 113 | # https://pdm.fming.dev/#use-with-ide 114 | .pdm.toml 115 | 116 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 117 | __pypackages__/ 118 | 119 | # Celery stuff 120 | celerybeat-schedule 121 | celerybeat.pid 122 | 123 | # SageMath parsed files 124 | *.sage.py 125 | 126 | # Environments 127 | .env 128 | .venv 129 | env/ 130 | venv/ 131 | ENV/ 132 | env.bak/ 133 | venv.bak/ 134 | 135 | # Spyder project settings 136 | .spyderproject 137 | .spyproject 138 | 139 | # Rope project settings 140 | .ropeproject 141 | 142 | # mkdocs documentation 143 | /site 144 | 145 | # mypy 146 | .mypy_cache/ 147 | .dmypy.json 148 | dmypy.json 149 | 150 | # Pyre type checker 151 | .pyre/ 152 | 153 | # pytype static type analyzer 154 | .pytype/ 155 | 156 | # Cython debug symbols 157 | cython_debug/ 158 | 159 | # PyCharm 160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 162 | # and can be added to the global gitignore or merged into this file. For a more nuclear 163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 164 | #.idea/ 165 | 166 | ### Python Patch ### 167 | # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration 168 | poetry.toml 169 | 170 | # ruff 171 | .ruff_cache/ 172 | 173 | # LSP config files 174 | pyrightconfig.json 175 | 176 | # End of https://www.toptal.com/developers/gitignore/api/python 177 | 178 | *.db 179 | config.json 180 | metadata.json 181 | 182 | launch.json 183 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.11.0 2 | 3.10.0 3 | 3.9.16 4 | 3.8.16 5 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | This project uses [SemVer](https://semver.org/) for versioning. Its public APIs, runtime support, and documented file locations won't change incompatibly outside of major versions (once version 1.0.0 has been released). There may be breaking schema changes in minor releases before 1.0.0 and will be noted in these release notes. 4 | 5 | ## 0.4.2 6 | 7 | _released `2023-07-22`_ 8 | 9 | - handle [new rate limiting](https://support.reddithelp.com/hc/en-us/articles/16160319875092-Reddit-Data-API-Wiki) more gracefully (fixes [#23](https://github.com/xavdid/reddit-user-to-sqlite/issues/23) via [#24](https://github.com/xavdid/reddit-user-to-sqlite/pull/24) (by [@piyh](https://github.com/piyh)) and [#25](https://github.com/xavdid/reddit-user-to-sqlite/pull/25)) 10 | 11 | ## 0.4.1 12 | 13 | _released `2023-06-25`_ 14 | 15 | - specify `utf-8` as the default character encoding, improving windows compatibility (fixes [#10](https://github.com/xavdid/reddit-user-to-sqlite/issues/10)) 16 | 17 | ## 0.4.0 18 | 19 | _released `2023-06-14`_ 20 | 21 | - the `archive` command includes saved posts / comments by default (in their own table). 
Use the `--skip-saved` flag to opt out of this behavior ([#16](https://github.com/xavdid/reddit-user-to-sqlite/pull/16)) 22 | - add support for Python 3.9, verified using `tox` ([#19](https://github.com/xavdid/reddit-user-to-sqlite/pull/19)) 23 | - add `num_awards` column to comments (was omitted by accident) ([#18](https://github.com/xavdid/reddit-user-to-sqlite/pull/18)) 24 | - added support for disabling the progress bars via the `DISABLE_PROGRESS` env var. Set it to `1` to disable progress bars ([#16](https://github.com/xavdid/reddit-user-to-sqlite/pull/16)) 25 | 26 | ## 0.3.1 27 | 28 | _released `2023-06-09`_ 29 | 30 | - remove dependency on 3.11 by adding `typing-extensions` ([#3](https://github.com/xavdid/reddit-user-to-sqlite/pull/3) by [@changlinli](https://github.com/changlinli)) 31 | 32 | ## 0.3.0 33 | 34 | _released `2023-05-23`_ 35 | 36 | - adds the `archive` command, which loads data from a Reddit GDPR archive ([#1](https://github.com/xavdid/reddit-user-to-sqlite/pull/1)) 37 | - added more help text to both commands 38 | - provide more info about the counts of comments/posts saved/updated 39 | 40 | ## 0.2.0 41 | 42 | _released `2023-05-07`_ 43 | 44 | - improves the `user` command to also fetch submitted posts and store them in a corresponding `posts` table. 45 | 46 | ## 0.1.0 47 | 48 | _released `2023-05-06`_ 49 | 50 | - Initial public release! 51 | - Adds the `user` command, which currently only fetches comments 52 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 David Brownman 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # reddit-user-to-sqlite 2 | 3 | Stores all the content from a specific user in a SQLite database. This includes their comments and their posts. 4 | 5 | ## Install 6 | 7 | The best way to install the package is by using [pipx](https://pypa.github.io/pipx/): 8 | 9 | ```bash 10 | pipx install reddit-user-to-sqlite 11 | ``` 12 | 13 | It's also available via [brew](https://brew.sh/): 14 | 15 | ```bash 16 | brew install xavdid/projects/reddit-user-to-sqlite 17 | ``` 18 | 19 | ## Usage 20 | 21 | The CLI currently exposes two commands: `user` and `archive`. 
They allow you to archive recent comments/posts from the API or _all_ posts (as read from a CSV file). 22 | 23 | ### user 24 | 25 | Fetches all comments and posts for a specific user. 26 | 27 | ```bash 28 | reddit-user-to-sqlite user your_username 29 | reddit-user-to-sqlite user your_username --db my-reddit-data.db 30 | ``` 31 | 32 | #### Params 33 | 34 | > Note: the argument order is reversed from most dogsheep packages (which take db_path first). This method allows for use of a default db name, so I prefer it. 35 | 36 | 1. `username`: a case-insensitive string. The leading `/u/` is optional (and ignored if supplied). 37 | 2. (optional) `--db`: the path to a sqlite file, which will be created or updated as needed. Defaults to `reddit.db`. 38 | 39 | ### archive 40 | 41 | Reads the output of a [Reddit GDPR archive](https://support.reddithelp.com/hc/en-us/articles/360043048352-How-do-I-request-a-copy-of-my-Reddit-data-and-information-) and fetches additional info from the Reddit API (where possible). This allows you to store more than 1k posts/comments. 42 | 43 | > FYI: this behavior is built with the assumption that the archive that Reddit provides has the same format regardless of if you select `GDPR` or `CCPA` as the request type. But, just to be on the safe side, I recommend selecting `GDPR` during the export process until I'm able to confirm. 44 | 45 | #### Params 46 | 47 | > Note: the argument order is reversed from most dogsheep packages (which take db_path first). This method allows for use of a default db name, so I prefer it. 48 | 49 | 1. `archive_path`: the path to the (unzipped) archive directory on your machine. Don't rename/move the files that Reddit gives you. 50 | 2. (optional) `--db`: the path to a sqlite file, which will be created or updated as needed. Defaults to `reddit.db`. 51 | 3. (optional) `--skip-saved`: a flag for skipping the inclusion of loading saved comments/posts from the archive. 52 | 53 | ## Viewing Data 54 | 55 | The resulting SQLite database pairs well with [Datasette](https://datasette.io/), a tool for viewing SQLite in the web. Below is my recommended configuration. 
56 | 57 | First, install `datasette`: 58 | 59 | ```bash 60 | pipx install datasette 61 | ``` 62 | 63 | Then, add the recommended plugins (for rendering timestamps and markdown): 64 | 65 | ```bash 66 | pipx inject datasette datasette-render-markdown datasette-render-timestamps 67 | ``` 68 | 69 | Finally, create a `metadata.json` file next to your `reddit.db` with the following: 70 | 71 | ```json 72 | { 73 | "databases": { 74 | "reddit": { 75 | "tables": { 76 | "comments": { 77 | "sort_desc": "timestamp", 78 | "plugins": { 79 | "datasette-render-markdown": { 80 | "columns": ["text"] 81 | }, 82 | "datasette-render-timestamps": { 83 | "columns": ["timestamp"] 84 | } 85 | } 86 | }, 87 | "posts": { 88 | "sort_desc": "timestamp", 89 | "plugins": { 90 | "datasette-render-markdown": { 91 | "columns": ["text"] 92 | }, 93 | "datasette-render-timestamps": { 94 | "columns": ["timestamp"] 95 | } 96 | } 97 | }, 98 | "subreddits": { 99 | "sort": "name" 100 | } 101 | } 102 | } 103 | } 104 | } 105 | ``` 106 | 107 | Now when you run 108 | 109 | ```bash 110 | datasette reddit.db --metadata metadata.json 111 | ``` 112 | 113 | You'll get a nice, formatted output: 114 | 115 | ![](https://cdn.zappy.app/93b1760ab541a8b68c2ee2899be5e079.png) 116 | 117 | ![](https://cdn.zappy.app/5850a782196d1c7a83a054400c0a5dc4.png) 118 | 119 | ## Motivation 120 | 121 | I got nervous when I saw Reddit's [notification of upcoming API changes](https://old.reddit.com/r/reddit/comments/12qwagm/an_update_regarding_reddits_api/). To ensure I could always access data I created, I wanted to make sure I had a backup in place before anything changed in a big way. 122 | 123 | ## FAQs 124 | 125 | ### Why does this post only show 1k recent comments / posts? 126 | 127 | Reddit's paging API only shows 1000 items (page 11 is an empty list). If you have more comments (or posts) than than that, you can use the [GDPR archive import feature](#archive) feature to backfill your older data. 128 | 129 | ### Why are my longer posts truncated in Datasette? 130 | 131 | Datasette truncates long text fields by default. You can disable this behavior by using the `truncate_cells_html` flag when running `datasette` ([see the docs](https://docs.datasette.io/en/stable/settings.html#truncate-cells-html)): 132 | 133 | ```shell 134 | datasette reddit.db --setting truncate_cells_html 0 135 | ``` 136 | 137 | ### How do I store a username that starts with `-`? 138 | 139 | By default, [click](https://click.palletsprojects.com/en/8.1.x/) (the argument parser this uses) interprets leading dashes on argument as a flag. If you're fetching data for user `-asdf`, you'll get an error saying `Error: No such option: -a`. To ensure the last argument is interpreted positionally, put it after a `--`: 140 | 141 | ```shell 142 | reddit-user-to-sqlite user -- -asdf 143 | ``` 144 | 145 | ### Why do some of my posts say `[removed]` even though I can see them on the web? 146 | 147 | If a post is removed, only the mods and the user who posted it can see its text. Since this tool currently runs without any authentication, those removed posts can't be fetched via the API. 148 | 149 | To load data about your own removed posts, use the [GDPR archive import feature](#archive). 150 | 151 | ### Why is the database missing data returned by the Reddit API? 152 | 153 | While most [Dogsheep](https://github.com/dogsheep) projects grab the raw JSON output of their source APIs, Reddit's API has a lot of junk in it. So, I opted for a slimmed down approach. 
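To see exactly which columns made the cut, you can inspect the schema directly. Here's a minimal sketch using the `sqlite-utils` Python API (already a dependency of this package); it assumes your database is the default `reddit.db` in the current directory:

```python
from sqlite_utils import Database

db = Database("reddit.db")

# print the column names stored for each content table
for table in ("comments", "posts", "subreddits", "users"):
    if table in db.table_names():
        print(table, [column.name for column in db[table].columns])
```
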
154 | 155 | If there's a field missing that you think would be useful, feel free to open an issue! 156 | 157 | ### Does this tool refetch old data? 158 | 159 | When running the `user` command, yes. It fetches and updates up to 1k each of comments and posts and updates the local copy. 160 | 161 | When running the `archive` command, no. To cut down on API requests, it only fetches data about comments/posts that aren't yet in the database (since the archive may include many items). 162 | 163 | Both of these may change in the future to be more in line with [Reddit's per-subreddit archiving guidelines](https://www.reddit.com/r/modnews/comments/py2xy2/voting_commenting_on_archived_posts/). 164 | 165 | ## Development 166 | 167 | This section is people making changes to this package. 168 | 169 | When in a virtual environment, run the following: 170 | 171 | ```bash 172 | pip install -e '.[test]' 173 | ``` 174 | 175 | This installs the package in `--edit` mode and makes its dependencies available. You can now run `reddit-user-to-sqlite` to invoke the CLI. 176 | 177 | ### Running Tests 178 | 179 | In your virtual environment, a simple `pytest` should run the unit test suite. You can also run `pyright` for type checking. 180 | 181 | ### Releasing New Versions 182 | 183 | > these notes are mostly for myself (or other contributors) 184 | 185 | 1. Run `just release` while your venv is active 186 | 2. paste the stored API key (If you're getting invalid password, verify that `~/.pypirc` is empty) 187 | -------------------------------------------------------------------------------- /justfile: -------------------------------------------------------------------------------- 1 | _default: 2 | just --list 3 | 4 | # error out if this isn't being run in a venv 5 | _require-venv: 6 | #!/usr/bin/env python 7 | import sys 8 | sys.exit(sys.prefix == sys.base_prefix) 9 | 10 | # run test suite against multiple python versions 11 | @tox: 12 | tox run-parallel 13 | 14 | @lint: 15 | ruff . 16 | black --check --quiet . 17 | 18 | # lint&fix files, useful for a pre-commit hook 19 | @lint-fix: 20 | ruff . --fix 21 | black --quiet . 22 | 23 | @typecheck: 24 | pyright -p pyproject.toml 25 | 26 | # perform all checks, but don't change any files 27 | @validate: tox lint typecheck 28 | 29 | @local: _require-venv validate 30 | 31 | # run the full ci pipeline 32 | ci: && validate 33 | pip install .[test,ci] 34 | 35 | # useful for reinstalling after changing dependencies 36 | @reinstall: _require-venv 37 | pip install -e .[test,ci] 38 | 39 | @release: _require-venv validate 40 | rm -rf dist 41 | pip install -e .[release] 42 | python -m build 43 | # give upload api key at runtime 44 | python -m twine upload --username __token__ dist/* 45 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "reddit-user-to-sqlite" 3 | version = "0.4.2" 4 | 5 | authors = [{ name = "David Brownman", email = "beamneocube@gmail.com" }] 6 | description = "Create a SQLite database containing data pulled from Reddit about a single user." 
7 | readme = "README.md" 8 | license = { file = "LICENSE" } 9 | 10 | requires-python = ">=3.9" 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | "Development Status :: 3 - Alpha", 14 | "Environment :: Console", 15 | "License :: OSI Approved :: MIT License", 16 | "Operating System :: OS Independent", 17 | "Natural Language :: English", 18 | ] 19 | keywords = ["sqlite", "reddit", "dogsheep"] 20 | 21 | dependencies = [ 22 | "sqlite-utils==3.32.1", 23 | "click==8.1.3", 24 | "requests==2.29.0", 25 | "tqdm==4.65.0", 26 | ] 27 | 28 | [project.optional-dependencies] 29 | test = ["pytest==7.3.1", "responses==0.23.1"] 30 | release = ["twine==4.0.2", "build==0.10.0"] 31 | ci = ["black==23.3.0", "pyright==1.1.318", "ruff==0.0.277"] 32 | 33 | [project.urls] 34 | "Homepage" = "https://github.com/xavdid/reddit-user-to-sqlite" 35 | "Bug Tracker" = "https://github.com/xavdid/reddit-user-to-sqlite/issues" 36 | "Author" = "https://xavd.id" 37 | "Changelog" = "https://github.com/xavdid/reddit-user-to-sqlite/blob/main/CHANGELOG.md" 38 | 39 | [project.scripts] 40 | reddit-user-to-sqlite = "reddit_user_to_sqlite.cli:cli" 41 | 42 | [build-system] 43 | requires = ["flit_core>=3.4"] 44 | build-backend = "flit_core.buildapi" 45 | 46 | # needed so the LSP performs typechecking 47 | [tool.pyright] 48 | 49 | [tool.ruff] 50 | select = ["E", "F", "I001"] # defaults & isort 51 | ignore = ["E501"] 52 | -------------------------------------------------------------------------------- /reddit_user_to_sqlite/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavdid/reddit-user-to-sqlite/e02fad746694f32ebc2ee2efce82652857682cc6/reddit_user_to_sqlite/__init__.py -------------------------------------------------------------------------------- /reddit_user_to_sqlite/cli.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | from pathlib import Path 3 | from typing import Callable, Iterable, Optional, TypeVar, cast 4 | 5 | import click 6 | from sqlite_utils import Database 7 | 8 | from reddit_user_to_sqlite.csv_helpers import ( 9 | PrefixType, 10 | get_username_from_archive, 11 | load_unsaved_ids_from_file, 12 | ) 13 | from reddit_user_to_sqlite.helpers import clean_username, find_user_details_from_items 14 | from reddit_user_to_sqlite.reddit_api import ( 15 | Comment, 16 | Post, 17 | add_missing_user_fragment, 18 | get_user_id, 19 | load_comments_for_user, 20 | load_info, 21 | load_posts_for_user, 22 | ) 23 | from reddit_user_to_sqlite.sqlite_helpers import ( 24 | ensure_fts, 25 | insert_users, 26 | upsert_comments, 27 | upsert_posts, 28 | upsert_subreddits, 29 | ) 30 | 31 | 32 | @click.group() 33 | @click.version_option() 34 | def cli(): 35 | "Save data from Reddit to a SQLite database" 36 | 37 | 38 | DB_PATH_HELP = "A path to a SQLite database file. If it doesn't exist, it will be created. It can have any extension, `.db` or `.sqlite` is recommended." 
39 | DEFAULT_DB_NAME = "reddit.db" 40 | 41 | DELETED_USERNAME = "__DeletedUser__" 42 | DELETED_USER_FULLNAME = "t2_1234567" 43 | 44 | T = TypeVar("T", Comment, Post) 45 | 46 | 47 | def _save_items( 48 | db: Database, 49 | items: list[T], 50 | upsert_func: Callable[[Database, Iterable[T], Optional[PrefixType]], int], 51 | table_prefix: Optional[PrefixType] = None, 52 | ) -> int: 53 | if not items: 54 | return 0 55 | 56 | insert_users(db, items) 57 | upsert_subreddits(db, items) 58 | return upsert_func(db, items, table_prefix) 59 | 60 | 61 | save_comments = partial(_save_items, upsert_func=upsert_comments) 62 | save_posts = partial(_save_items, upsert_func=upsert_posts) 63 | 64 | 65 | def load_data_from_files( 66 | db: Database, 67 | archive_path: Path, 68 | own_data=True, 69 | tables_prefix: Optional[PrefixType] = None, 70 | ): 71 | """ 72 | if own data is true, requires a username to save. Otherwise, will add a placeholder 73 | (for external data) 74 | """ 75 | new_comment_ids = load_unsaved_ids_from_file( 76 | db, archive_path, "comments", prefix=tables_prefix 77 | ) 78 | click.echo(f"\nFetching info about {'your' if own_data else 'saved'} comments") 79 | comments = cast(list[Comment], load_info(new_comment_ids)) 80 | 81 | post_ids = load_unsaved_ids_from_file( 82 | db, archive_path, "posts", prefix=tables_prefix 83 | ) 84 | click.echo(f"\nFetching info about {'your' if own_data else 'saved'} posts") 85 | posts = cast(list[Post], load_info(post_ids)) 86 | 87 | username = None 88 | user_fullname = None 89 | 90 | if own_data: 91 | # find the username, first from any of the loaded comments/posts 92 | if user_details := ( 93 | find_user_details_from_items(comments) 94 | or find_user_details_from_items(posts) 95 | ): 96 | username, user_fullname = user_details 97 | # if all loaded posts are removed (which could be the case on subsequent runs), 98 | # then try to load from archive 99 | elif username := get_username_from_archive(archive_path): 100 | user_fullname = f"t2_{get_user_id(username)}" 101 | # otherwise, your posts without a username won't be saved; 102 | # this only happens for malformed archives 103 | else: 104 | click.echo( 105 | "\nUnable to guess username from API content or archive; some data will not be saved.", 106 | err=True, 107 | ) 108 | else: 109 | username = DELETED_USERNAME 110 | user_fullname = DELETED_USER_FULLNAME 111 | 112 | if username and user_fullname: 113 | comments = add_missing_user_fragment(comments, username, user_fullname) 114 | posts = add_missing_user_fragment(posts, username, user_fullname) 115 | 116 | num_comments_written = save_comments(db, comments, table_prefix=tables_prefix) 117 | num_posts_written = save_posts(db, posts, table_prefix=tables_prefix) 118 | 119 | messages = [ 120 | "\nDone!", 121 | f" - saved {num_comments_written} new comments", 122 | f" - saved {num_posts_written} new posts", 123 | ] 124 | 125 | if missing_comments := len(comments) - num_comments_written: 126 | messages.append( 127 | f" - failed to find {missing_comments} missing comments; ignored for now" 128 | ) 129 | if missing_posts := len(post_ids) - num_posts_written: 130 | messages.append( 131 | f" - failed to find {missing_posts} missing posts; ignored for now" 132 | ) 133 | 134 | click.echo("\n".join(messages)) 135 | 136 | 137 | @cli.command() 138 | @click.argument("username") 139 | @click.option( 140 | "--db", 141 | "db_path", 142 | type=click.Path(file_okay=True, dir_okay=False, allow_dash=False), 143 | default=DEFAULT_DB_NAME, 144 | help=DB_PATH_HELP, 145 | ) 146 | def 
user(db_path: str, username: str): 147 | username = clean_username(username) 148 | click.echo(f"loading data about /u/{username} into {db_path}") 149 | 150 | db = Database(db_path) 151 | 152 | click.echo("\nfetching (up to 10 pages of) comments") 153 | comments = load_comments_for_user(username) 154 | save_comments(db, comments) 155 | click.echo(f"saved/updated {len(comments)} comments") 156 | 157 | click.echo("\nfetching (up to 10 pages of) posts") 158 | posts = load_posts_for_user(username) 159 | save_posts(db, posts) 160 | click.echo(f"saved/updated {len(posts)} posts") 161 | 162 | if not (comments or posts): 163 | raise click.ClickException(f"no data found for username: {username}") 164 | 165 | ensure_fts(db) 166 | 167 | 168 | @cli.command() 169 | @click.argument( 170 | "archive_path", 171 | type=click.Path(file_okay=False, dir_okay=True, allow_dash=False, path_type=Path), 172 | ) 173 | @click.option( 174 | "--db", 175 | "db_path", 176 | type=click.Path(file_okay=True, dir_okay=False, allow_dash=False), 177 | default=DEFAULT_DB_NAME, 178 | help=DB_PATH_HELP, 179 | ) 180 | @click.option( 181 | "--skip-saved", 182 | is_flag=True, 183 | default=False, 184 | help="Skip hydrating data about your saved posts and comments.", 185 | ) 186 | def archive(archive_path: Path, db_path: str, skip_saved: bool): 187 | click.echo(f"loading data found in archive at {archive_path} into {db_path}") 188 | 189 | db = Database(db_path) 190 | 191 | load_data_from_files(db, archive_path) 192 | 193 | # I don't love this double negative, but it is what it is 194 | if not skip_saved: 195 | load_data_from_files(db, archive_path, own_data=False, tables_prefix="saved_") 196 | 197 | ensure_fts(db) 198 | -------------------------------------------------------------------------------- /reddit_user_to_sqlite/csv_helpers.py: -------------------------------------------------------------------------------- 1 | from csv import DictReader 2 | from pathlib import Path 3 | from typing import Literal, Optional 4 | 5 | from sqlite_utils import Database 6 | 7 | ItemType = Literal["comments", "posts"] 8 | PrefixType = Literal["saved_"] 9 | 10 | FULLNAME_PREFIX: dict[ItemType, str] = { 11 | "comments": "t1", 12 | "posts": "t3", 13 | } 14 | 15 | 16 | def build_table_name( 17 | table_name: ItemType, table_prefix: Optional[PrefixType] = None 18 | ) -> str: 19 | return f"{table_prefix or ''}{table_name}" 20 | 21 | 22 | def validate_and_build_path(archive_path: Path, item_type: str) -> Path: 23 | filename = f"{item_type}.csv" 24 | if not (file := archive_path / filename).exists(): 25 | # LOAD BEARING MESSAGE: the brew formula expects the phrase "unzipped GDPR archive folder" to be printed on error 26 | raise ValueError( 27 | f'Ensure path "{archive_path}" points to an unzipped Reddit GDPR archive folder; "{filename}" not found in the expected spot.' 
28 | ) 29 | return file 30 | 31 | 32 | def load_unsaved_ids_from_file( 33 | db: Database, 34 | archive_path: Path, 35 | item_type: ItemType, 36 | prefix: Optional[PrefixType] = None, 37 | ) -> list[str]: 38 | filename = build_table_name(item_type, prefix) 39 | # we save each file into a matching table 40 | saved_ids = {row["id"] for row in db[filename].rows} 41 | 42 | with open( 43 | validate_and_build_path(archive_path, filename), encoding="utf-8" 44 | ) as archive_rows: 45 | return [ 46 | f'{FULLNAME_PREFIX[item_type]}_{c["id"]}' 47 | for c in DictReader(archive_rows) 48 | if c["id"] not in saved_ids 49 | ] 50 | 51 | 52 | def get_username_from_archive(archive_path: Path) -> Optional[str]: 53 | with open(validate_and_build_path(archive_path, "statistics")) as stat_rows: 54 | try: 55 | return next( 56 | row["value"] 57 | for row in DictReader(stat_rows) 58 | if row["statistic"] == "account name" 59 | ) 60 | except StopIteration: 61 | pass 62 | -------------------------------------------------------------------------------- /reddit_user_to_sqlite/helpers.py: -------------------------------------------------------------------------------- 1 | import re 2 | from itertools import islice 3 | from typing import Iterable, Optional, TypeVar 4 | 5 | T = TypeVar("T") 6 | 7 | 8 | # https://docs.python.org/3.11/library/itertools.html#itertools-recipes 9 | # available natively in 3.12 10 | def batched(iterable: Iterable[T], n: int) -> Iterable[tuple[T]]: 11 | "Batch data into tuples of length n. The last batch may be shorter." 12 | # batched('ABCDEFG', 3) --> ABC DEF G 13 | if n < 1: 14 | raise ValueError("n must be at least one") 15 | it = iter(iterable) 16 | while batch := tuple(islice(it, n)): 17 | yield batch 18 | 19 | 20 | def clean_username(username: str) -> str: 21 | """ 22 | strips the leading `/u/` off the front of a username, if present 23 | """ 24 | if re.match(r"/?u/", username): 25 | return username.strip().strip("/u") 26 | return username 27 | 28 | 29 | def find_user_details_from_items(items) -> Optional[tuple[str, str]]: 30 | """ 31 | Returns a 2-tuple of prefixed user_id and username if found, otherwise None 32 | """ 33 | try: 34 | return next( 35 | (c["author"], c["author_fullname"]) for c in items if "author_fullname" in c 36 | ) 37 | except StopIteration: 38 | return None 39 | -------------------------------------------------------------------------------- /reddit_user_to_sqlite/reddit_api.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import ( 3 | TYPE_CHECKING, 4 | Any, 5 | Literal, 6 | Optional, 7 | Sequence, 8 | TypedDict, 9 | TypeVar, 10 | Union, 11 | cast, 12 | final, 13 | ) 14 | 15 | import click 16 | import requests 17 | from tqdm import tqdm, trange 18 | 19 | from reddit_user_to_sqlite.helpers import batched 20 | 21 | if TYPE_CHECKING: 22 | from typing import NotRequired 23 | 24 | USER_AGENT = "reddit-user-to-sqlite" 25 | 26 | 27 | class SubredditFragment(TypedDict): 28 | ## SUBREDDIT 29 | # "consoledeals" 30 | subreddit: str 31 | # ID 32 | subreddit_id: str 33 | # "public" 34 | subreddit_type: str 35 | 36 | 37 | class UserFragment(TypedDict): 38 | # comment author username 39 | author: str 40 | # comment author prefixed id 41 | author_fullname: "NotRequired[str]" 42 | 43 | 44 | class Comment(SubredditFragment, UserFragment): 45 | # this is only the relevant fields from the response 46 | 47 | ## COMMENT 48 | # short ID 49 | id: str 50 | # full ID 51 | name: str 52 | 53 | total_awards_received: int 54 | 
gilded: int 55 | 56 | # the ID of a post or comment 57 | parent_id: str 58 | score: int 59 | 60 | # maybe always 0? or i'm just boring 61 | controversiality: int 62 | # plaintext (or markdown?) 63 | body: str 64 | body_html: str 65 | # is the commenter OP? 66 | is_submitter: bool 67 | # 1682464342.0, 68 | created: float 69 | # "/r/x/comments/... 70 | permalink: str 71 | 72 | ## POST 73 | # post title 74 | link_title: str 75 | num_comments: int 76 | # post ID 77 | link_id: str 78 | link_permalink: str 79 | # "r/consoledeals", 80 | subreddit_name_prefixed: str 81 | 82 | 83 | class Post(SubredditFragment, UserFragment): 84 | # no prefix 85 | id: str 86 | 87 | title: str 88 | 89 | # markdown content of the post; could be empty 90 | selftext: str 91 | # external link (or self link) 92 | url: str 93 | # link to reddit thread (sans domain) 94 | permalink: str 95 | 96 | upvote_ratio: float 97 | score: int 98 | total_awards_received: int 99 | 100 | num_comments: int 101 | over_18: bool 102 | 103 | # timestamp 104 | created: float 105 | 106 | 107 | # class Subreddit(TypedDict): 108 | # should_archive_posts: bool 109 | 110 | 111 | @final 112 | class ResourceWrapper(TypedDict): 113 | kind: str 114 | data: Union[Comment, Post] 115 | 116 | 117 | class SuccessResponse(TypedDict): 118 | kind: Literal["Listing", "t2"] 119 | 120 | 121 | @final 122 | class PagedResponseBody(TypedDict): 123 | before: Optional[str] 124 | after: Optional[str] 125 | modhash: str 126 | geo_filter: str 127 | dist: int 128 | children: Sequence[ResourceWrapper] 129 | 130 | 131 | @final 132 | class PagedResponse(SuccessResponse): 133 | data: PagedResponseBody 134 | 135 | 136 | @final 137 | class UserData(TypedDict): 138 | id: str 139 | 140 | 141 | @final 142 | class UserResponse(SuccessResponse): 143 | data: UserData 144 | 145 | 146 | @final 147 | class ErorrResponse(TypedDict): 148 | message: str 149 | error: int 150 | 151 | 152 | ErrorHeaders = TypedDict( 153 | "ErrorHeaders", 154 | { 155 | "x-ratelimit-used": str, 156 | "x-ratelimit-remaining": str, 157 | "x-ratelimit-reset": str, 158 | }, 159 | ) 160 | 161 | # max API page size is 100 162 | PAGE_SIZE = 100 163 | 164 | 165 | class RedditRateLimitException(Exception): 166 | """ 167 | more info: https://support.reddithelp.com/hc/en-us/articles/16160319875092-Reddit-Data-API-Wiki 168 | """ 169 | 170 | def __init__(self, headers: ErrorHeaders) -> None: 171 | super().__init__("Rate limited by Reddit") 172 | 173 | self.used = int(headers["x-ratelimit-used"]) 174 | self.remaining = int(headers["x-ratelimit-remaining"]) 175 | self.window_total = self.used + self.remaining 176 | self.reset_after_seconds = int(headers["x-ratelimit-reset"]) 177 | 178 | @property 179 | def stats(self) -> str: 180 | return f"Used {self.used}/{self.window_total} requests (resets in {self.reset_after_seconds} seconds)" 181 | 182 | 183 | def _unwrap_response_and_raise(response: requests.Response): 184 | result = response.json() 185 | 186 | if "error" in result: 187 | if result["error"] == 429: 188 | raise RedditRateLimitException(cast(ErrorHeaders, response.headers)) 189 | 190 | raise ValueError( 191 | f'Received API error from Reddit (code {result["error"]}): {result["message"]}' 192 | ) 193 | 194 | return result 195 | 196 | 197 | def _call_reddit_api(url: str, params: Optional[dict[str, Any]] = None): 198 | return _unwrap_response_and_raise( 199 | requests.get( 200 | url, 201 | {"raw_json": 1, "limit": PAGE_SIZE, **(params or {})}, # type: ignore 202 | headers={"user-agent": USER_AGENT}, 203 | ) 204 | ) 205 | 
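# Illustrative sketch only (not used by the CLI): one way a caller could wait out
# Reddit's rate-limit window and retry, instead of stopping early and saving what
# it already has. The helper name, `max_retries`, and the use of `time.sleep` are
# assumptions for this example, not part of the package's public behavior.
def _call_reddit_api_with_retry(
    url: str, params: Optional[dict[str, Any]] = None, max_retries: int = 1
):
    import time  # local import keeps this illustrative helper self-contained

    for _ in range(max_retries):
        try:
            return _call_reddit_api(url, params)
        except RedditRateLimitException as e:
            # sleep until Reddit says the window resets, then try again
            time.sleep(e.reset_after_seconds)
    # final attempt; if this is also rate limited, the exception propagates
    return _call_reddit_api(url, params)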
206 | 207 | def _rate_limit_message(e: RedditRateLimitException) -> str: 208 | return f"Rate limited by reddit; try again in {e.reset_after_seconds} seconds. Until then, saving what we have" 209 | 210 | 211 | def _load_paged_resource(resource: Literal["comments", "submitted"], username: str): 212 | """ 213 | handles paging logic for arbitrary-length queries with an "after" param 214 | """ 215 | result = [] 216 | after = None 217 | # max number of pages we can fetch 218 | for _ in trange(10): 219 | try: 220 | response: PagedResponse = _call_reddit_api( 221 | f"https://www.reddit.com/user/{username}/{resource}.json", 222 | params={"after": after}, 223 | ) 224 | 225 | result += [c["data"] for c in response["data"]["children"]] 226 | after = response["data"]["after"] 227 | if len(response["data"]["children"]) < PAGE_SIZE: 228 | break 229 | except RedditRateLimitException as e: 230 | click.echo(_rate_limit_message(e), err=True) 231 | break 232 | 233 | return result 234 | 235 | 236 | def load_comments_for_user(username: str) -> list[Comment]: 237 | return _load_paged_resource("comments", username) 238 | 239 | 240 | def load_posts_for_user(username: str) -> list[Post]: 241 | return _load_paged_resource("submitted", username) 242 | 243 | 244 | def load_info(resources: Sequence[str]) -> list[Union[Comment, Post]]: 245 | """ 246 | calls the `/info` endpoint to fetch data about a sequence of resources that include the type prefix 247 | """ 248 | result = [] 249 | for batch in batched( 250 | tqdm(resources, disable=bool(os.environ.get("DISABLE_PROGRESS"))), PAGE_SIZE 251 | ): 252 | try: 253 | response: PagedResponse = _call_reddit_api( 254 | "https://www.reddit.com/api/info.json", 255 | params={"id": ",".join(batch)}, 256 | ) 257 | result += [c["data"] for c in response["data"]["children"]] 258 | except RedditRateLimitException as e: 259 | click.echo(_rate_limit_message(e), err=True) 260 | break 261 | 262 | return result 263 | 264 | 265 | def get_user_id(username: str) -> str: 266 | response: UserResponse = _call_reddit_api( 267 | f"https://www.reddit.com/user/{username}/about.json" 268 | ) 269 | 270 | return response["data"]["id"] 271 | 272 | 273 | T = TypeVar("T", Comment, Post) 274 | 275 | 276 | def add_missing_user_fragment( 277 | items: list[T], username: str, user_fullname: str 278 | ) -> list[T]: 279 | """ 280 | If an item lacks user details, this adds them. Otherwise the item passes through untouched. 
281 | """ 282 | return [ 283 | cast(T, {**i, "author": username, "author_fullname": user_fullname}) 284 | if "author_fullname" not in i 285 | else i 286 | for i in items 287 | ] 288 | -------------------------------------------------------------------------------- /reddit_user_to_sqlite/sqlite_helpers.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Iterable, Optional, Sequence, TypedDict, TypeVar 2 | 3 | from sqlite_utils import Database 4 | 5 | from reddit_user_to_sqlite.csv_helpers import PrefixType, build_table_name 6 | from reddit_user_to_sqlite.reddit_api import ( 7 | Comment, 8 | Post, 9 | SubredditFragment, 10 | UserFragment, 11 | ) 12 | 13 | 14 | class SubredditRow(TypedDict): 15 | id: str 16 | name: str 17 | type: str 18 | # TODO: handle archiving and updating 19 | # archives_posts: bool 20 | 21 | 22 | def item_to_subreddit_row(item: SubredditFragment) -> SubredditRow: 23 | return { 24 | "id": item["subreddit_id"][3:], 25 | "name": item["subreddit"], 26 | "type": item["subreddit_type"], 27 | } 28 | 29 | 30 | def upsert_subreddits(db: Database, subreddits: Iterable[SubredditFragment]): 31 | # upserts are actually important here, since subs are going private/public a lot 32 | # https://github.com/simonw/sqlite-utils/issues/554 33 | db["subreddits"].upsert_all( # type: ignore 34 | map(item_to_subreddit_row, subreddits), 35 | # ignore=True, # type: ignore 36 | # only relevant if creating the table 37 | pk="id", # type: ignore 38 | not_null=["id", "name"], # type: ignore 39 | ) 40 | 41 | 42 | class UserRow(TypedDict): 43 | id: str 44 | username: str 45 | 46 | 47 | def item_to_user_row(item: UserFragment) -> Optional[UserRow]: 48 | if "author_fullname" in item: 49 | return {"id": item["author_fullname"][3:], "username": item["author"]} 50 | 51 | 52 | def insert_users(db: Database, users: Sequence[UserFragment]): 53 | existing_users = {u["id"] for u in db["users"].rows} 54 | 55 | unique_new_users = { 56 | # needs to be hashable so it's deduped 57 | (u["id"], u["username"]) 58 | for user in users 59 | if (u := item_to_user_row(user)) and u["id"] not in existing_users 60 | } 61 | 62 | new_users = [{"id": user[0], "username": user[1]} for user in unique_new_users] 63 | 64 | db["users"].insert_all( # type: ignore 65 | new_users, 66 | # ignore any write error 67 | # ignore=True, 68 | # only relevant if creating the table 69 | pk="id", # type: ignore 70 | not_null=["id", "username"], # type: ignore 71 | ) 72 | 73 | 74 | class CommentRow(TypedDict): 75 | id: str 76 | timestamp: int 77 | score: int 78 | text: str 79 | user: str 80 | is_submitter: int 81 | subreddit: str 82 | permalink: str 83 | controversiality: int 84 | num_awards: int 85 | 86 | 87 | def comment_to_comment_row(comment: Comment) -> Optional[CommentRow]: 88 | if "author_fullname" not in comment: 89 | return 90 | 91 | return { 92 | "id": comment["id"], 93 | "timestamp": int(comment["created"]), 94 | "score": comment["score"], 95 | "text": comment["body"], 96 | "user": comment["author_fullname"][3:], # strip leading t2_ 97 | "subreddit": comment["subreddit_id"][3:], # strip leading t5_ 98 | "permalink": f'https://old.reddit.com{comment["permalink"]}?context=10', 99 | "is_submitter": int(comment["is_submitter"]), 100 | "controversiality": comment["controversiality"], 101 | "num_awards": comment["total_awards_received"], 102 | } 103 | 104 | 105 | T = TypeVar("T") 106 | U = TypeVar("U") 107 | 108 | 109 | def apply_and_filter( 110 | filterer: Callable[[T], 
Optional[U]], items: Iterable[T] 111 | ) -> list[U]: 112 | return [c for c in map(filterer, items) if c] 113 | 114 | 115 | def upsert_comments( 116 | db: Database, comments: Iterable[Comment], table_prefix: Optional[PrefixType] = None 117 | ) -> int: 118 | comment_rows = apply_and_filter(comment_to_comment_row, comments) 119 | db[build_table_name("comments", table_prefix)].upsert_all( # type: ignore 120 | comment_rows, 121 | pk="id", # type: ignore 122 | # update the schema - needed if user does archive first 123 | alter=True, # type: ignore 124 | foreign_keys=[ # type: ignore 125 | ( 126 | "subreddit", 127 | "subreddits", 128 | "id", 129 | ), 130 | ( 131 | "user", 132 | "users", 133 | "id", 134 | ), 135 | ], 136 | # can re-add or assert this later, but the rows aren't created if this is present 137 | # see: https://github.com/simonw/sqlite-utils/issues/538 138 | # not_null=["id", "timestamp", "text", "user", "subreddit", "permalink"], 139 | ) 140 | return len(comment_rows) 141 | 142 | 143 | class PostRow(TypedDict): 144 | id: str 145 | timestamp: int 146 | score: int 147 | title: str 148 | text: str 149 | external_url: str 150 | user: str 151 | subreddit: str 152 | permalink: str 153 | upvote_ratio: float 154 | score: int 155 | num_comments: int 156 | num_awards: int 157 | is_removed: int 158 | 159 | 160 | def post_to_post_row(post: Post) -> Optional[PostRow]: 161 | if "author_fullname" not in post: 162 | return 163 | 164 | return { 165 | "id": post["id"], 166 | "timestamp": int(post["created"]), 167 | "score": post["score"], 168 | "num_comments": post["num_comments"], 169 | "title": post["title"], 170 | "text": post["selftext"], 171 | "external_url": "" if "reddit.com" in post["url"] else post["url"], 172 | "user": post["author_fullname"][3:], 173 | "subreddit": post["subreddit_id"][3:], 174 | "permalink": f'https://old.reddit.com{post["permalink"]}', 175 | "upvote_ratio": post["upvote_ratio"], 176 | "num_awards": post["total_awards_received"], 177 | "is_removed": int(post["selftext"] == "[removed]"), 178 | } 179 | 180 | 181 | def upsert_posts( 182 | db: Database, posts: Iterable[Post], table_prefix: Optional[PrefixType] = None 183 | ) -> int: 184 | post_rows = apply_and_filter(post_to_post_row, posts) 185 | db[build_table_name("posts", table_prefix)].insert_all( # type: ignore 186 | post_rows, 187 | upsert=True, 188 | pk="id", # type: ignore 189 | alter=True, # type: ignore 190 | foreign_keys=[ # type: ignore 191 | ( 192 | "subreddit", 193 | "subreddits", 194 | "id", 195 | ), 196 | ( 197 | "user", 198 | "users", 199 | "id", 200 | ), 201 | ], 202 | ) 203 | return len(post_rows) 204 | 205 | 206 | FTS_INSTRUCTIONS: list[tuple[str, list[str]]] = [ 207 | ("comments", ["text"]), 208 | ("posts", ["title", "text"]), 209 | ("saved_comments", ["text"]), 210 | ("saved_posts", ["title", "text"]), 211 | ] 212 | 213 | 214 | def ensure_fts(db: Database): 215 | table_names = set(db.table_names()) 216 | for table, columns in FTS_INSTRUCTIONS: 217 | if table in table_names and f"{table}_fts" not in table_names: 218 | db[table].enable_fts(columns, create_triggers=True) 219 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavdid/reddit-user-to-sqlite/e02fad746694f32ebc2ee2efce82652857682cc6/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: 
-------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Any, Literal, Optional, Protocol, Union 3 | 4 | import pytest 5 | import responses 6 | from responses import BaseResponse, RequestsMock, matchers 7 | from sqlite_utils import Database 8 | 9 | from reddit_user_to_sqlite.reddit_api import ( 10 | USER_AGENT, 11 | ErrorHeaders, 12 | PagedResponse, 13 | Post, 14 | ) 15 | from reddit_user_to_sqlite.sqlite_helpers import CommentRow, PostRow, UserRow 16 | 17 | 18 | @pytest.fixture 19 | def tmp_db_path(tmp_path): 20 | """ 21 | returns a Database path in a temp dir 22 | """ 23 | return str(tmp_path / "test.db") 24 | 25 | 26 | @pytest.fixture 27 | def tmp_db(tmp_db_path): 28 | """ 29 | returns a Database in a temp dir 30 | """ 31 | return Database(tmp_db_path) 32 | 33 | 34 | def _wrap_response(*children) -> PagedResponse: 35 | return { 36 | "kind": "Listing", 37 | "data": { 38 | "after": None, 39 | "dist": 1, 40 | "modhash": "whatever", 41 | "geo_filter": "", 42 | "children": [{"kind": "t_", "data": c} for c in children], 43 | "before": None, 44 | }, 45 | } 46 | 47 | 48 | @pytest.fixture 49 | def comment(): 50 | """ 51 | A raw (unwrapped) comment object from the Reddit API 52 | """ 53 | return { 54 | "subreddit_id": "t5_2t3ad", 55 | "approved_at_utc": None, 56 | "author_is_blocked": False, 57 | "comment_type": None, 58 | "link_title": "What games do you guys love to replay or never get bored with?", 59 | "mod_reason_by": None, 60 | "banned_by": None, 61 | "ups": 1, 62 | "num_reports": None, 63 | "author_flair_type": "text", 64 | "total_awards_received": 3, 65 | "subreddit": "patientgamers", 66 | "link_author": "DefinitionWest", 67 | "likes": None, 68 | "replies": "", 69 | "user_reports": [], 70 | "saved": False, 71 | "id": "jj0ti6f", 72 | "banned_at_utc": None, 73 | "mod_reason_title": None, 74 | "gilded": 0, 75 | "archived": False, 76 | "collapsed_reason_code": None, 77 | "no_follow": True, 78 | "author": "xavdid", 79 | "num_comments": 250, 80 | "can_mod_post": False, 81 | "send_replies": True, 82 | "parent_id": "t1_jirew06", 83 | "score": 1, 84 | "author_fullname": "t2_np8mb41h", 85 | "over_18": False, 86 | "report_reasons": None, 87 | "removal_reason": None, 88 | "approved_by": None, 89 | "controversiality": 0, 90 | "body": "Such a great game to pick up for a run every couple of months. Every time I think I'm done, it pulls be back in.", 91 | "edited": False, 92 | "top_awarded_type": None, 93 | "downs": 0, 94 | "author_flair_css_class": None, 95 | "is_submitter": False, 96 | "collapsed": False, 97 | "author_flair_richtext": [], 98 | "author_patreon_flair": False, 99 | "body_html": '<div class="md"><p>Such a great game to pick up for a run every couple of months. 
Every time I think I&#39;m done, it pulls be back in.</p>\n</div>', 100 | "gildings": {}, 101 | "collapsed_reason": None, 102 | "distinguished": None, 103 | "associated_award": None, 104 | "stickied": False, 105 | "author_premium": False, 106 | "can_gild": True, 107 | "link_id": "t3_1371yrv", 108 | "unrepliable_reason": None, 109 | "author_flair_text_color": None, 110 | "score_hidden": False, 111 | "permalink": "/r/patientgamers/comments/1371yrv/what_games_do_you_guys_love_to_replay_or_never/jj0ti6f/", 112 | "subreddit_type": "public", 113 | "link_permalink": "https://www.reddit.com/r/patientgamers/comments/1371yrv/what_games_do_you_guys_love_to_replay_or_never/", 114 | "name": "t1_jj0ti6f", 115 | "author_flair_template_id": None, 116 | "subreddit_name_prefixed": "r/patientgamers", 117 | "author_flair_text": None, 118 | "treatment_tags": [], 119 | "created": 1683327131.0, 120 | "created_utc": 1683327131.0, 121 | "awarders": [], 122 | "all_awardings": [], 123 | "locked": False, 124 | "author_flair_background_color": None, 125 | "collapsed_because_crowd_control": None, 126 | "mod_reports": [], 127 | "quarantine": False, 128 | "mod_note": None, 129 | "link_url": "https://www.reddit.com/r/patientgamers/comments/1371yrv/what_games_do_you_guys_love_to_replay_or_never/", 130 | } 131 | 132 | 133 | @pytest.fixture 134 | def modify_comment(comment): 135 | def _modify(d): 136 | return {**comment, **d} 137 | 138 | return _modify 139 | 140 | 141 | @pytest.fixture 142 | def modify_post(self_post): 143 | def _modify(d): 144 | return {**self_post, **d} 145 | 146 | return _modify 147 | 148 | 149 | @pytest.fixture 150 | def removed_comment(): 151 | return { 152 | "total_awards_received": 0, 153 | "approved_at_utc": None, 154 | "author_is_blocked": False, 155 | "comment_type": None, 156 | "edited": False, 157 | "mod_reason_by": None, 158 | "banned_by": None, 159 | "removal_reason": None, 160 | "link_id": "t3_puwue", 161 | "author_flair_template_id": None, 162 | "likes": None, 163 | "replies": "", 164 | "user_reports": [], 165 | "saved": False, 166 | "id": "c3sgfl4", 167 | "banned_at_utc": None, 168 | "mod_reason_title": None, 169 | "gilded": 0, 170 | "archived": True, 171 | "collapsed_reason_code": "DELETED", 172 | "no_follow": True, 173 | "author": "[deleted]", 174 | "can_mod_post": False, 175 | "created_utc": 1329550785.0, 176 | "send_replies": True, 177 | "parent_id": "t1_c3sgeij", 178 | "score": -1, 179 | "approved_by": None, 180 | "mod_note": None, 181 | "all_awardings": [], 182 | "subreddit_id": "t5_2qm4e", 183 | "body": "[removed]", 184 | "awarders": [], 185 | "author_flair_css_class": None, 186 | "name": "t1_c3sgfl4", 187 | "downs": 0, 188 | "is_submitter": False, 189 | "body_html": '

<div class="md"><p>[removed]</p>\n</div>
', 190 | "gildings": {}, 191 | "collapsed_reason": None, 192 | "distinguished": None, 193 | "associated_award": None, 194 | "stickied": False, 195 | "can_gild": True, 196 | "top_awarded_type": None, 197 | "unrepliable_reason": None, 198 | "author_flair_text_color": "dark", 199 | "score_hidden": False, 200 | "permalink": "/r/askscience/comments/asdf/why_do_birds_fly/", 201 | "num_reports": None, 202 | "locked": False, 203 | "report_reasons": None, 204 | "created": 1329550785.0, 205 | "subreddit": "askscience", 206 | "author_flair_text": None, 207 | "treatment_tags": [], 208 | "collapsed": True, 209 | "subreddit_name_prefixed": "r/askscience", 210 | "controversiality": 0, 211 | "author_flair_background_color": "", 212 | "collapsed_because_crowd_control": None, 213 | "mod_reports": [], 214 | "subreddit_type": "public", 215 | "ups": -1, 216 | } 217 | 218 | 219 | @pytest.fixture 220 | def removed_comment_response(removed_comment): 221 | return _wrap_response(removed_comment) 222 | 223 | 224 | @pytest.fixture 225 | def all_comments_response(comment, removed_comment): 226 | return _wrap_response(comment, removed_comment) 227 | 228 | 229 | @pytest.fixture 230 | def stored_comment() -> CommentRow: 231 | """ 232 | a serialized comment row in the db 233 | """ 234 | return { 235 | "controversiality": 0, 236 | "id": "jj0ti6f", 237 | "is_submitter": 0, 238 | "permalink": "https://old.reddit.com/r/patientgamers/comments/1371yrv/what_games_do_you_guys_love_to_replay_or_never/jj0ti6f/?context=10", 239 | "score": 1, 240 | "subreddit": "2t3ad", 241 | "text": "Such a great game to pick up for a run every couple of months. Every time I think I'm done, it pulls be back in.", 242 | "timestamp": 1683327131, 243 | "user": "np8mb41h", 244 | "num_awards": 3, 245 | } 246 | 247 | 248 | @pytest.fixture 249 | def stored_removed_comment() -> CommentRow: 250 | return { 251 | "controversiality": 0, 252 | "id": "c3sgfl4", 253 | "is_submitter": 0, 254 | "permalink": "https://old.reddit.com/r/askscience/comments/asdf/why_do_birds_fly/?context=10", 255 | "score": -1, 256 | "subreddit": "2qm4e", 257 | "text": "[removed]", 258 | "timestamp": 1329550785, 259 | # manually added this - if it's stored, I must have found a user 260 | "user": "np8mb41h", 261 | "num_awards": 0, 262 | } 263 | 264 | 265 | @pytest.fixture 266 | def stored_removed_comment_placeholder_user() -> CommentRow: 267 | return { 268 | "controversiality": 0, 269 | "id": "c3sgfl4", 270 | "is_submitter": 0, 271 | "permalink": "https://old.reddit.com/r/askscience/comments/asdf/why_do_birds_fly/?context=10", 272 | "score": -1, 273 | "subreddit": "2qm4e", 274 | "text": "[removed]", 275 | "timestamp": 1329550785, 276 | "user": "1234567", 277 | "num_awards": 0, 278 | } 279 | 280 | 281 | @pytest.fixture 282 | def comment_response(comment) -> PagedResponse: 283 | """ 284 | The full response from Reddit with a comment child 285 | """ 286 | return _wrap_response(comment) 287 | 288 | 289 | @pytest.fixture 290 | def self_post(): 291 | """ 292 | A raw (unwrapped) self post object from the Reddit API 293 | """ 294 | return { 295 | "all_awardings": [], 296 | "allow_live_comments": False, 297 | "approved_at_utc": None, 298 | "approved_by": None, 299 | "archived": False, 300 | "author": "xavdid", 301 | "author_flair_background_color": None, 302 | "author_flair_css_class": None, 303 | "author_flair_richtext": [], 304 | "author_flair_template_id": None, 305 | "author_flair_text": None, 306 | "author_flair_text_color": None, 307 | "author_flair_type": "text", 308 | "author_fullname": 
"t2_np8mb41h", 309 | "author_is_blocked": False, 310 | "author_patreon_flair": False, 311 | "author_premium": False, 312 | "awarders": [], 313 | "banned_at_utc": None, 314 | "banned_by": None, 315 | "can_gild": False, 316 | "can_mod_post": False, 317 | "category": None, 318 | "clicked": False, 319 | "content_categories": None, 320 | "contest_mode": False, 321 | "created": 1653623084, 322 | "created_utc": 1653623084, 323 | "discussion_type": None, 324 | "distinguished": None, 325 | "domain": "self.KeybaseProofs", 326 | "downs": 0, 327 | "edited": False, 328 | "gilded": 0, 329 | "gildings": {}, 330 | "hidden": False, 331 | "hide_score": False, 332 | "id": "uypaav", 333 | "is_created_from_ads_ui": False, 334 | "is_crosspostable": False, 335 | "is_meta": False, 336 | "is_original_content": False, 337 | "is_reddit_media_domain": False, 338 | "is_robot_indexable": True, 339 | "is_self": True, 340 | "is_video": False, 341 | "likes": None, 342 | "link_flair_background_color": "", 343 | "link_flair_css_class": None, 344 | "link_flair_richtext": [], 345 | "link_flair_text": None, 346 | "link_flair_text_color": "dark", 347 | "link_flair_type": "text", 348 | "locked": False, 349 | "media": None, 350 | "media_embed": {}, 351 | "media_only": False, 352 | "mod_note": None, 353 | "mod_reason_by": None, 354 | "mod_reason_title": None, 355 | "mod_reports": [], 356 | "name": "t3_uypaav", 357 | "no_follow": True, 358 | "num_comments": 0, 359 | "num_crossposts": 0, 360 | "num_reports": None, 361 | "over_18": False, 362 | "parent_whitelist_status": "all_ads", 363 | "permalink": "/r/KeybaseProofs/comments/uypaav/my_keybase_proof_redditxavdid_keybasexavdid/", 364 | "pinned": False, 365 | "post_hint": "self", 366 | "preview": { 367 | "enabled": False, 368 | "images": [ 369 | { 370 | "id": "-YTScuArtOT7VGFuDeGCZvRtPZZ6N8YNPBBjDIA6KiQ", 371 | "resolutions": [ 372 | { 373 | "height": 108, 374 | "url": "https://external-preview.redd.it/d8t5K0qquzpFUYxW8QDLgM8lFUUyu6zo_KM_cFv2JjY.jpg?width=108&crop=smart&auto=webp&v=enabled&s=3076e81be7310fd25b111faa85f33dcd722e3e07", 375 | "width": 108, 376 | }, 377 | { 378 | "height": 216, 379 | "url": "https://external-preview.redd.it/d8t5K0qquzpFUYxW8QDLgM8lFUUyu6zo_KM_cFv2JjY.jpg?width=216&crop=smart&auto=webp&v=enabled&s=80217a00e40d70bdf57ebd1510d5ff49a1b1b5a4", 380 | "width": 216, 381 | }, 382 | { 383 | "height": 320, 384 | "url": "https://external-preview.redd.it/d8t5K0qquzpFUYxW8QDLgM8lFUUyu6zo_KM_cFv2JjY.jpg?width=320&crop=smart&auto=webp&v=enabled&s=547611bba1890b9b67fc84e2d31badb682bd25bb", 385 | "width": 320, 386 | }, 387 | ], 388 | "source": { 389 | "height": 360, 390 | "url": "https://external-preview.redd.it/d8t5K0qquzpFUYxW8QDLgM8lFUUyu6zo_KM_cFv2JjY.jpg?auto=webp&v=enabled&s=ff41e339b6994c953c13eb917d562e7b0793831e", 391 | "width": 360, 392 | }, 393 | "variants": {}, 394 | } 395 | ], 396 | }, 397 | "pwls": 6, 398 | "quarantine": False, 399 | "removal_reason": None, 400 | "removed_by": None, 401 | "removed_by_category": None, 402 | "report_reasons": None, 403 | "saved": False, 404 | "score": 1, 405 | "secure_media": None, 406 | "secure_media_embed": {}, 407 | "selftext": "### Keybase proof\n...-----END PGP MESSAGE-----\n", 408 | "selftext_html": '

Keybase proof\n-----END PGP MESSAGE-----\n\n
', 409 | "send_replies": True, 410 | "spoiler": False, 411 | "stickied": False, 412 | "subreddit": "KeybaseProofs", 413 | "subreddit_id": "t5_32u6q", 414 | "subreddit_name_prefixed": "r/KeybaseProofs", 415 | "subreddit_subscribers": 7428, 416 | "subreddit_type": "public", 417 | "suggested_sort": None, 418 | "thumbnail": "self", 419 | "thumbnail_height": None, 420 | "thumbnail_width": None, 421 | "title": "My Keybase proof [reddit:xavdid = keybase:xavdid]", 422 | "top_awarded_type": None, 423 | "total_awards_received": 0, 424 | "treatment_tags": [], 425 | "ups": 1, 426 | "upvote_ratio": 1, 427 | "url": "https://www.reddit.com/r/KeybaseProofs/comments/uypaav/my_keybase_proof_redditxavdid_keybasexavdid/", 428 | "user_reports": [], 429 | "view_count": None, 430 | "visited": False, 431 | "whitelist_status": "all_ads", 432 | "wls": 6, 433 | } 434 | 435 | 436 | @pytest.fixture 437 | def stored_self_post() -> PostRow: 438 | return { 439 | "external_url": "", 440 | "id": "uypaav", 441 | "is_removed": 0, 442 | "num_awards": 0, 443 | "permalink": "https://old.reddit.com/r/KeybaseProofs/comments/uypaav/my_keybase_proof_redditxavdid_keybasexavdid/", 444 | "score": 1, 445 | "subreddit": "32u6q", 446 | "text": "### Keybase proof\n...-----END PGP MESSAGE-----\n", 447 | "timestamp": 1653623084, 448 | "num_comments": 0, 449 | "title": "My Keybase proof [reddit:xavdid = keybase:xavdid]", 450 | "upvote_ratio": 1, 451 | "user": "np8mb41h", 452 | } 453 | 454 | 455 | @pytest.fixture 456 | def self_post_response(self_post): 457 | return _wrap_response(self_post) 458 | 459 | 460 | @pytest.fixture 461 | def removed_post(): 462 | """ 463 | A raw (unwrapped) removed post object from the Reddit API 464 | """ 465 | return { 466 | "approved_at_utc": None, 467 | "subreddit": "videos", 468 | "selftext": "[deleted]", 469 | "user_reports": [], 470 | "saved": False, 471 | "mod_reason_title": None, 472 | "gilded": 0, 473 | "clicked": False, 474 | "title": "Tommy Wiseau Wishes YOU A Happy Memorial Day! — Urban Outfitters", 475 | "link_flair_richtext": [], 476 | "subreddit_name_prefixed": "r/videos", 477 | "hidden": False, 478 | "pwls": 6, 479 | "link_flair_css_class": None, 480 | "downs": 0, 481 | "thumbnail_height": 52, 482 | "top_awarded_type": None, 483 | "hide_score": False, 484 | "name": "t3_1f55rr", 485 | "quarantine": False, 486 | "link_flair_text_color": "dark", 487 | "upvote_ratio": 1, 488 | "author_flair_background_color": "", 489 | "subreddit_type": "public", 490 | "ups": 1, 491 | "total_awards_received": 0, 492 | "media_embed": {}, 493 | "thumbnail_width": 70, 494 | "author_flair_template_id": None, 495 | "is_original_content": False, 496 | "secure_media": None, 497 | "is_reddit_media_domain": False, 498 | "is_meta": False, 499 | "category": None, 500 | "secure_media_embed": {}, 501 | "link_flair_text": None, 502 | "can_mod_post": False, 503 | "score": 1, 504 | "approved_by": None, 505 | "is_created_from_ads_ui": False, 506 | "thumbnail": "default", 507 | "edited": False, 508 | "author_flair_css_class": None, 509 | "gildings": {}, 510 | "content_categories": None, 511 | "is_self": False, 512 | "mod_note": None, 513 | "created": 1369671390.0, 514 | "link_flair_type": "text", 515 | "wls": 6, 516 | "removed_by_category": None, 517 | "banned_by": None, 518 | "domain": "", 519 | "allow_live_comments": False, 520 | "selftext_html": '

[deleted]\n
', 521 | "likes": None, 522 | "suggested_sort": None, 523 | "banned_at_utc": None, 524 | "url_overridden_by_dest": "", 525 | "view_count": None, 526 | "archived": False, 527 | "no_follow": True, 528 | "is_crosspostable": False, 529 | "pinned": False, 530 | "over_18": False, 531 | "all_awardings": [], 532 | "awarders": [], 533 | "media_only": False, 534 | "can_gild": False, 535 | "spoiler": False, 536 | "locked": False, 537 | "author_flair_text": None, 538 | "treatment_tags": [], 539 | "visited": False, 540 | "removed_by": None, 541 | "num_reports": None, 542 | "distinguished": None, 543 | "subreddit_id": "t5_2qh1e", 544 | "author_is_blocked": False, 545 | "mod_reason_by": None, 546 | "removal_reason": None, 547 | "link_flair_background_color": "", 548 | "id": "1f55rr", 549 | "is_robot_indexable": False, 550 | "report_reasons": None, 551 | "author": "[deleted]", 552 | "discussion_type": None, 553 | "num_comments": 0, 554 | "send_replies": False, 555 | "whitelist_status": "all_ads", 556 | "contest_mode": False, 557 | "mod_reports": [], 558 | "author_flair_text_color": "dark", 559 | "permalink": "/r/videos/comments/1f55rr/tommy_wiseau_wishes_you_a_happy_memorial_day/", 560 | "parent_whitelist_status": "all_ads", 561 | "stickied": False, 562 | "url": "", 563 | "subreddit_subscribers": 26688085, 564 | "created_utc": 1369671390.0, 565 | "num_crossposts": 0, 566 | "media": None, 567 | "is_video": False, 568 | } 569 | 570 | 571 | @pytest.fixture 572 | def stored_removed_post() -> PostRow: 573 | return { 574 | "external_url": "", 575 | "id": "1f55rr", 576 | "is_removed": 0, 577 | "num_awards": 0, 578 | "num_comments": 0, 579 | "permalink": "https://old.reddit.com/r/videos/comments/1f55rr/tommy_wiseau_wishes_you_a_happy_memorial_day/", 580 | "score": 1, 581 | "subreddit": "2qh1e", 582 | "text": "[deleted]", 583 | "timestamp": 1369671390, 584 | "title": "Tommy Wiseau Wishes YOU A Happy Memorial Day! — Urban Outfitters", 585 | "upvote_ratio": 1, 586 | # manually added this - if it's stored, I must have found a user 587 | "user": "np8mb41h", 588 | } 589 | 590 | 591 | @pytest.fixture 592 | def stored_removed_post_placeholder_user() -> PostRow: 593 | return { 594 | "external_url": "", 595 | "id": "1f55rr", 596 | "is_removed": 0, 597 | "num_awards": 0, 598 | "num_comments": 0, 599 | "permalink": "https://old.reddit.com/r/videos/comments/1f55rr/tommy_wiseau_wishes_you_a_happy_memorial_day/", 600 | "score": 1, 601 | "subreddit": "2qh1e", 602 | "text": "[deleted]", 603 | "timestamp": 1369671390, 604 | "title": "Tommy Wiseau Wishes YOU A Happy Memorial Day! 
— Urban Outfitters", 605 | "upvote_ratio": 1, 606 | "user": "1234567", 607 | } 608 | 609 | 610 | @pytest.fixture 611 | def removed_post_response(removed_post): 612 | return _wrap_response(removed_post) 613 | 614 | 615 | @pytest.fixture 616 | def external_post(self_post: Post) -> Post: 617 | """ 618 | A raw (unwrapped) external post object from the Reddit API 619 | """ 620 | return { 621 | **self_post, 622 | "selftext": "", 623 | "url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ", 624 | "id": "qwer", 625 | } 626 | 627 | 628 | @pytest.fixture 629 | def stored_external_post(stored_self_post: PostRow) -> PostRow: 630 | return { 631 | **stored_self_post, 632 | "text": "", 633 | "external_url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ", 634 | "id": "qwer", 635 | } 636 | 637 | 638 | @pytest.fixture 639 | def all_posts_response(self_post, removed_post, external_post): 640 | return _wrap_response(self_post, removed_post, external_post) 641 | 642 | 643 | @pytest.fixture 644 | def empty_response(): 645 | return _wrap_response() 646 | 647 | 648 | @pytest.fixture() 649 | def mock(): 650 | with responses.RequestsMock() as mock_requests: 651 | yield mock_requests 652 | 653 | 654 | class MockPagedFunc(Protocol): 655 | def __call__( 656 | self, 657 | resource: Literal["comments", "submitted"], 658 | json: Any, 659 | params: Optional[dict[str, Union[str, int]]] = None, 660 | headers: Optional[dict[str, str]] = None, 661 | ) -> BaseResponse: 662 | ... 663 | 664 | 665 | @pytest.fixture 666 | def mock_paged_request(mock: RequestsMock) -> MockPagedFunc: 667 | """ 668 | call this to mock a list of items for a user 669 | """ 670 | 671 | def _mock_request( 672 | resource: Literal["comments", "submitted"], 673 | json: Any, 674 | params: Optional[dict[str, Union[str, int]]] = None, 675 | headers: Optional[dict[str, str]] = None, 676 | ): 677 | params = {"limit": 100, "raw_json": 1, **(params or {})} 678 | 679 | return mock.get( 680 | f"https://www.reddit.com/user/xavdid/{resource}.json", 681 | match=[ 682 | matchers.query_param_matcher(params), 683 | matchers.header_matcher({"user-agent": USER_AGENT}), 684 | ], 685 | json=json, 686 | headers=headers, 687 | ) 688 | 689 | return _mock_request 690 | 691 | 692 | class MockInfoFunc(Protocol): 693 | def __call__( 694 | self, ids: str, json: Any, headers: Optional[dict[str, str]] = None, limit=100 695 | ) -> BaseResponse: 696 | ... 
697 | 698 | 699 | # need to extract this so I can call it manually 700 | # def _build_mock_info_req(mock: RequestsMock) -> MockInfoFunc: 701 | 702 | 703 | @pytest.fixture 704 | def mock_info_request(mock: RequestsMock) -> MockInfoFunc: 705 | """ 706 | call this to mirror loading info about a sequence of fullnames (type-prefixed ids) 707 | """ 708 | 709 | def _mock_request( 710 | ids: str, 711 | json: Any, 712 | headers: Optional[dict[str, str]] = None, 713 | limit=100, 714 | ): 715 | params = {"limit": limit, "raw_json": 1, "id": ids} 716 | 717 | return mock.get( 718 | "https://www.reddit.com/api/info.json", 719 | match=[ 720 | matchers.query_param_matcher(params), 721 | matchers.header_matcher({"user-agent": USER_AGENT}), 722 | ], 723 | json=json, 724 | headers=headers, 725 | ) 726 | 727 | return _mock_request 728 | 729 | 730 | @pytest.fixture 731 | def comment_info_response(modify_comment): 732 | return _wrap_response(*(modify_comment({"id": i}) for i in "ac")) 733 | 734 | 735 | @pytest.fixture 736 | def post_info_response(modify_post): 737 | return _wrap_response(*(modify_post({"id": i}) for i in "df")) 738 | 739 | 740 | @pytest.fixture 741 | def stored_user() -> UserRow: 742 | return {"id": "np8mb41h", "username": "xavdid"} 743 | 744 | 745 | @pytest.fixture 746 | def deleted_user() -> UserRow: 747 | return {"id": "1234567", "username": "__DeletedUser__"} 748 | 749 | 750 | @pytest.fixture 751 | def user_response(): 752 | return { 753 | "kind": "t2", 754 | "data": { 755 | "is_employee": False, 756 | "is_friend": False, 757 | "subreddit": { 758 | "default_set": True, 759 | "user_is_contributor": None, 760 | "banner_img": "", 761 | "allowed_media_in_comments": [], 762 | "user_is_banned": None, 763 | "free_form_reports": True, 764 | "community_icon": None, 765 | "show_media": True, 766 | "icon_color": "#51E9F4", 767 | "user_is_muted": None, 768 | "display_name": "u_xavdid", 769 | "header_img": None, 770 | "title": "", 771 | "previous_names": [], 772 | "over_18": False, 773 | "icon_size": [256, 256], 774 | "primary_color": "", 775 | "icon_img": "https://www.redditstatic.com/avatars/defaults/v2/avatar_default_5.png", 776 | "description": "", 777 | "submit_link_label": "", 778 | "header_size": None, 779 | "restrict_posting": True, 780 | "restrict_commenting": False, 781 | "subscribers": 0, 782 | "submit_text_label": "", 783 | "is_default_icon": True, 784 | "link_flair_position": "", 785 | "display_name_prefixed": "u/xavdid", 786 | "key_color": "", 787 | "name": "t5_6fndvc", 788 | "is_default_banner": True, 789 | "url": "/user/xavdid/", 790 | "quarantine": False, 791 | "banner_size": None, 792 | "user_is_moderator": None, 793 | "accept_followers": True, 794 | "public_description": "", 795 | "link_flair_enabled": False, 796 | "disable_contributor_requests": False, 797 | "subreddit_type": "user", 798 | "user_is_subscriber": None, 799 | }, 800 | "snoovatar_size": None, 801 | "awardee_karma": 0, 802 | "id": "np8mb41h", 803 | "verified": True, 804 | "is_gold": False, 805 | "is_mod": False, 806 | "awarder_karma": 0, 807 | "has_verified_email": True, 808 | "icon_img": "https://www.redditstatic.com/avatars/defaults/v2/avatar_default_5.png", 809 | "hide_from_robots": False, 810 | "link_karma": 1, 811 | "is_blocked": False, 812 | "total_karma": 3, 813 | "pref_show_snoovatar": False, 814 | "name": "xavdid", 815 | "created": 1653622688.0, 816 | "created_utc": 1653622688.0, 817 | "snoovatar_img": "", 818 | "comment_karma": 2, 819 | "accept_followers": True, 820 | "has_subscribed": False, 821 | }, 822 | } 
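Of the large `user_response` payload above, the only field the API tests pin down is the nested `data.id` ("np8mb41h", per `test_get_user_id` in tests/test_reddit_api.py). A hedged sketch of that lookup, as an assumption about what `get_user_id` extracts rather than its actual implementation:

    def _user_id_from_about(about_payload: dict) -> str:
        # assumption: only the nested "data" -> "id" value matters to callers
        return about_payload["data"]["id"]  # "np8mb41h" for the fixture above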
823 | 824 | 825 | @pytest.fixture 826 | def rate_limit_headers() -> ErrorHeaders: 827 | return { 828 | "x-ratelimit-used": "4", 829 | "x-ratelimit-remaining": "6", 830 | "x-ratelimit-reset": "20", 831 | } 832 | 833 | 834 | class MockUserFunc(Protocol): 835 | def __call__(self, username: str, json: Any) -> BaseResponse: 836 | ... 837 | 838 | 839 | @pytest.fixture 840 | def mock_user_request(mock: RequestsMock) -> MockUserFunc: 841 | """ 842 | call this to mirror loading info about a sequence of fullnames (type-prefixed ids) 843 | """ 844 | 845 | def _mock_request(username: str, json: Any): 846 | return mock.get( 847 | f"https://www.reddit.com/user/{username}/about.json", 848 | match=[ 849 | matchers.header_matcher({"user-agent": USER_AGENT}), 850 | ], 851 | json=json, 852 | ) 853 | 854 | return _mock_request 855 | 856 | 857 | @pytest.fixture 858 | def archive_dir(tmp_path: Path): 859 | (archive_dir := tmp_path / "archive").mkdir() 860 | return archive_dir 861 | 862 | 863 | class WriteArchiveFileFunc(Protocol): 864 | def __call__(self, filename: str, lines: list[str]) -> Path: 865 | ... 866 | 867 | 868 | @pytest.fixture 869 | def write_archive_file(archive_dir: Path) -> WriteArchiveFileFunc: 870 | """ 871 | write `lines` into `archive_dir/filename`. 872 | """ 873 | 874 | def _write_file(filename: str, lines: list[str]): 875 | (new_file := archive_dir / filename).write_text("\n".join(lines)) 876 | return new_file 877 | 878 | return _write_file 879 | 880 | 881 | @pytest.fixture 882 | def stats_file(write_archive_file: WriteArchiveFileFunc): 883 | """ 884 | write a basic statistics file into the archive directory 885 | """ 886 | 887 | return write_archive_file( 888 | "statistics.csv", 889 | [ 890 | "statistic,value", 891 | "account name,xavdid", 892 | "export time,2023-05-02 06:57:14 UTC", 893 | "is_deleted,False", 894 | "registration date,2014-05-19 22:02:20 UTC", 895 | "email verified,True", 896 | "email address,whatever@gmail.com", 897 | ], 898 | ) 899 | 900 | 901 | @pytest.fixture 902 | def comments_file(write_archive_file: WriteArchiveFileFunc): 903 | return write_archive_file("comments.csv", ["id", "a", "c"]) 904 | 905 | 906 | @pytest.fixture 907 | def saved_comments_file(write_archive_file: WriteArchiveFileFunc): 908 | return write_archive_file("saved_comments.csv", ["id", "g", "h"]) 909 | 910 | 911 | @pytest.fixture 912 | def posts_file(write_archive_file: WriteArchiveFileFunc): 913 | return write_archive_file("posts.csv", ["id", "d", "f"]) 914 | 915 | 916 | @pytest.fixture 917 | def saved_posts_file(write_archive_file: WriteArchiveFileFunc): 918 | return write_archive_file("saved_posts.csv", ["id", "j", "k"]) 919 | 920 | 921 | @pytest.fixture 922 | def empty_file_at_path(write_archive_file: WriteArchiveFileFunc): 923 | def _empty_file(filename: str): 924 | return write_archive_file(filename, []) 925 | 926 | return _empty_file 927 | 928 | 929 | # --- 930 | 931 | 932 | # https://docs.pytest.org/en/latest/example/simple.html#control-skipping-of-tests-according-to-command-line-option 933 | 934 | 935 | def pytest_addoption(parser): 936 | parser.addoption( 937 | "--include-live", action="store_true", default=False, help="run live API tests" 938 | ) 939 | 940 | 941 | def pytest_configure(config): 942 | config.addinivalue_line("markers", "live: mark test as hitting the live API") 943 | 944 | 945 | def pytest_collection_modifyitems(config, items): 946 | if config.getoption("--include-live"): 947 | # include-live flag given in cli; do not skip slow tests 948 | return 949 | 950 | skip_live = 
pytest.mark.skip(reason="need --include-live flag to run") 951 | for item in items: 952 | if "live" in item.keywords: 953 | item.add_marker(skip_live) 954 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | from traceback import print_tb 2 | 3 | import pytest 4 | from click.testing import CliRunner 5 | from sqlite_utils import Database 6 | 7 | from reddit_user_to_sqlite.cli import cli 8 | from tests.conftest import ( 9 | MockInfoFunc, 10 | MockPagedFunc, 11 | MockUserFunc, 12 | WriteArchiveFileFunc, 13 | ) 14 | 15 | 16 | @pytest.mark.parametrize("username", ["xavdid", "/u/xavdid", "u/xavdid"]) 17 | def test_load_data_for_user( 18 | tmp_db_path: str, 19 | tmp_db: Database, 20 | mock_paged_request: MockPagedFunc, 21 | username, 22 | all_posts_response, 23 | stored_comment, 24 | stored_self_post, 25 | stored_external_post, 26 | stored_user, 27 | all_comments_response, 28 | ): 29 | comment_response = mock_paged_request( 30 | resource="comments", json=all_comments_response 31 | ) 32 | post_response = mock_paged_request(resource="submitted", json=all_posts_response) 33 | 34 | result = CliRunner().invoke(cli, ["user", username, "--db", tmp_db_path]) 35 | assert not result.exception, result.exception 36 | 37 | assert { 38 | "subreddits", 39 | "users", 40 | "comments", 41 | "comments_fts", 42 | "posts", 43 | "posts_fts", 44 | }.issubset(tmp_db.table_names()) 45 | 46 | assert list(tmp_db["subreddits"].rows) == [ 47 | {"id": "2t3ad", "name": "patientgamers", "type": "public"}, 48 | {"id": "2qm4e", "name": "askscience", "type": "public"}, 49 | {"id": "32u6q", "name": "KeybaseProofs", "type": "public"}, 50 | {"id": "2qh1e", "name": "videos", "type": "public"}, 51 | ] 52 | assert list(tmp_db["users"].rows) == [stored_user] 53 | assert list(tmp_db["comments"].rows) == [stored_comment] 54 | assert list(tmp_db["posts"].rows) == [ 55 | stored_self_post, 56 | stored_external_post, 57 | ] 58 | 59 | assert comment_response.call_count == 1 60 | assert post_response.call_count == 1 61 | 62 | 63 | @pytest.mark.live 64 | def test_load_live_data( 65 | tmp_db_path: str, tmp_db: Database, stored_comment, stored_self_post, stored_user 66 | ): 67 | result = CliRunner().invoke(cli, ["user", "xavdid", "--db", tmp_db_path]) 68 | assert not result.exception, result.exception 69 | 70 | assert {"subreddits", "users", "comments", "comments_fts"} <= set( 71 | tmp_db.table_names() 72 | ) 73 | 74 | assert {"id": "2t3ad", "name": "patientgamers", "type": "public"} in list( 75 | tmp_db["subreddits"].rows 76 | ) 77 | assert list(tmp_db["users"].rows) == [stored_user] 78 | 79 | comments = list(tmp_db["comments"].rows) 80 | assert ( 81 | len(comments) <= 1000 82 | ), "this test will start to fail if/when I've made 1k comments on this account" 83 | assert stored_comment in comments 84 | 85 | posts = list(tmp_db["posts"].rows) 86 | assert ( 87 | len(posts) <= 1000 88 | ), "this test will start to fail if/when I've made 1k posts on this account" 89 | assert stored_self_post["id"] in {p["id"] for p in posts} 90 | 91 | 92 | def test_missing_user_errors(tmp_db_path: str, mock_paged_request: MockPagedFunc): 93 | mock_paged_request( 94 | resource="comments", json={"error": 404, "message": "no user by that name"} 95 | ) 96 | result = CliRunner().invoke(cli, ["user", "xavdid", "--db", tmp_db_path]) 97 | 98 | assert result.exception 99 | assert ( 100 | str(result.exception) 101 | == "Received API error from 
Reddit (code 404): no user by that name" 102 | ) 103 | 104 | 105 | def test_no_data(tmp_db_path: str, mock_paged_request: MockPagedFunc, empty_response): 106 | mock_paged_request(resource="comments", json=empty_response) 107 | mock_paged_request(resource="submitted", json=empty_response) 108 | 109 | result = CliRunner().invoke(cli, ["user", "xavdid", "--db", tmp_db_path]) 110 | 111 | assert result.exit_code == 1 112 | assert result.stdout # not sure why it's in "out" not "err" 113 | assert "Error: no data found for username: xavdid" in result.stdout 114 | 115 | 116 | def test_comments_but_no_posts( 117 | tmp_db_path: str, 118 | tmp_db: Database, 119 | mock_paged_request: MockPagedFunc, 120 | empty_response, 121 | comment_response, 122 | stored_comment, 123 | stored_user, 124 | ): 125 | mock_paged_request(resource="comments", json=comment_response) 126 | mock_paged_request(resource="submitted", json=empty_response) 127 | 128 | result = CliRunner().invoke(cli, ["user", "xavdid", "--db", tmp_db_path]) 129 | assert not result.exception, result.exception 130 | 131 | assert list(tmp_db["users"].rows) == [stored_user] 132 | assert list(tmp_db["posts"].rows) == [] 133 | assert list(tmp_db["comments"].rows) == [stored_comment] 134 | 135 | 136 | def test_posts_but_no_comments( 137 | tmp_db_path: str, 138 | tmp_db: Database, 139 | mock_paged_request: MockPagedFunc, 140 | empty_response, 141 | self_post_response, 142 | stored_self_post, 143 | stored_user, 144 | ): 145 | mock_paged_request(resource="comments", json=empty_response) 146 | mock_paged_request(resource="submitted", json=self_post_response) 147 | 148 | result = CliRunner().invoke(cli, ["user", "xavdid", "--db", tmp_db_path]) 149 | assert not result.exception, result.exception 150 | 151 | assert list(tmp_db["users"].rows) == [stored_user] 152 | assert list(tmp_db["comments"].rows) == [] 153 | assert list(tmp_db["posts"].rows) == [stored_self_post] 154 | 155 | 156 | @pytest.mark.usefixtures("comments_file", "posts_file") 157 | def test_cold_load_data_from_archive( 158 | tmp_db_path, 159 | mock_info_request: MockInfoFunc, 160 | archive_dir, 161 | tmp_db: Database, 162 | stored_user, 163 | stored_comment, 164 | stored_self_post, 165 | comment_info_response, 166 | post_info_response, 167 | empty_file_at_path, 168 | ): 169 | empty_file_at_path("saved_comments.csv") 170 | empty_file_at_path("saved_posts.csv") 171 | 172 | mock_info_request("t1_a,t1_c", json=comment_info_response) 173 | mock_info_request("t3_d,t3_f", json=post_info_response) 174 | 175 | result = CliRunner().invoke(cli, ["archive", str(archive_dir), "--db", tmp_db_path]) 176 | assert not result.exception, print(result.exception) 177 | 178 | assert { 179 | "subreddits", 180 | "users", 181 | "comments", 182 | "comments_fts", 183 | "posts", 184 | "posts_fts", 185 | } <= set(tmp_db.table_names()) 186 | 187 | assert list(tmp_db["subreddits"].rows) == [ 188 | {"id": "2t3ad", "name": "patientgamers", "type": "public"}, 189 | {"id": "32u6q", "name": "KeybaseProofs", "type": "public"}, 190 | ] 191 | assert list(tmp_db["users"].rows) == [stored_user] 192 | assert list(tmp_db["comments"].rows) == [{**stored_comment, "id": i} for i in "ac"] 193 | assert list(tmp_db["posts"].rows) == [{**stored_self_post, "id": i} for i in "df"] 194 | 195 | 196 | @pytest.mark.usefixtures("comments_file") 197 | def test_cold_load_comments_only_from_archive( 198 | tmp_db_path, 199 | mock_info_request: MockInfoFunc, 200 | empty_file_at_path, 201 | archive_dir, 202 | tmp_db: Database, 203 | stored_comment, 204 | 
stored_user, 205 | comment_info_response, 206 | ): 207 | mock_info_request("t1_a,t1_c", json=comment_info_response) 208 | empty_file_at_path("posts.csv") 209 | empty_file_at_path("saved_comments.csv") 210 | empty_file_at_path("saved_posts.csv") 211 | 212 | result = CliRunner().invoke(cli, ["archive", str(archive_dir), "--db", tmp_db_path]) 213 | assert not result.exception 214 | 215 | assert {"subreddits", "users", "comments", "comments_fts"} <= set( 216 | tmp_db.table_names() 217 | ) 218 | assert list(tmp_db["subreddits"].rows) == [ 219 | {"id": "2t3ad", "name": "patientgamers", "type": "public"} 220 | ] 221 | assert list(tmp_db["users"].rows) == [stored_user] 222 | assert list(tmp_db["comments"].rows) == [{**stored_comment, "id": i} for i in "ac"] 223 | assert list(tmp_db["posts"].rows) == [] 224 | 225 | 226 | @pytest.mark.usefixtures("posts_file") 227 | def test_cold_load_posts_only_from_archive( 228 | tmp_db_path, 229 | mock_info_request: MockInfoFunc, 230 | empty_file_at_path, 231 | archive_dir, 232 | tmp_db: Database, 233 | stored_self_post, 234 | stored_user, 235 | post_info_response, 236 | ): 237 | empty_file_at_path("comments.csv") 238 | empty_file_at_path("saved_comments.csv") 239 | empty_file_at_path("saved_posts.csv") 240 | 241 | mock_info_request("t3_d,t3_f", json=post_info_response) 242 | 243 | result = CliRunner().invoke(cli, ["archive", str(archive_dir), "--db", tmp_db_path]) 244 | assert not result.exception 245 | 246 | assert {"subreddits", "users", "posts", "posts_fts"} <= set(tmp_db.table_names()) 247 | assert list(tmp_db["subreddits"].rows) == [ 248 | {"id": "32u6q", "name": "KeybaseProofs", "type": "public"} 249 | ] 250 | assert list(tmp_db["users"].rows) == [stored_user] 251 | assert list(tmp_db["comments"].rows) == [] 252 | assert list(tmp_db["posts"].rows) == [{**stored_self_post, "id": i} for i in "df"] 253 | 254 | 255 | def test_loads_data_from_both_sources_api_first( 256 | tmp_db_path, 257 | mock_info_request: MockInfoFunc, 258 | mock_paged_request: MockPagedFunc, 259 | comment_response, 260 | self_post_response, 261 | archive_dir, 262 | tmp_db: Database, 263 | stored_comment, 264 | stored_self_post, 265 | stored_user, 266 | comment_info_response, 267 | post_info_response, 268 | write_archive_file: WriteArchiveFileFunc, 269 | empty_file_at_path, 270 | ): 271 | empty_file_at_path("saved_comments.csv") 272 | empty_file_at_path("saved_posts.csv") 273 | 274 | mock_paged_request("comments", json=comment_response) 275 | mock_paged_request("submitted", json=self_post_response) 276 | 277 | api_result = CliRunner().invoke(cli, ["user", "xavdid", "--db", tmp_db_path]) 278 | assert not api_result.exception 279 | 280 | assert { 281 | "subreddits", 282 | "users", 283 | "comments", 284 | "comments_fts", 285 | "posts", 286 | "posts_fts", 287 | } <= set(tmp_db.table_names()) 288 | assert list(tmp_db["subreddits"].rows) == [ 289 | {"id": "2t3ad", "name": "patientgamers", "type": "public"}, 290 | {"id": "32u6q", "name": "KeybaseProofs", "type": "public"}, 291 | ] 292 | assert list(tmp_db["comments"].rows) == [stored_comment] 293 | assert list(tmp_db["posts"].rows) == [stored_self_post] 294 | 295 | # second pass 296 | mock_info_request("t1_a,t1_c", json=comment_info_response) 297 | mock_info_request("t3_d,t3_f", json=post_info_response) 298 | 299 | write_archive_file("comments.csv", ["id", "a", "c", stored_comment["id"]]) 300 | write_archive_file("posts.csv", ["id", "d", "f", stored_self_post["id"]]) 301 | 302 | archive_result = CliRunner().invoke( 303 | cli, ["archive", 
str(archive_dir), "--db", tmp_db_path] 304 | ) 305 | assert not archive_result.exception, print(archive_result.exception) 306 | 307 | assert list(tmp_db["users"].rows) == [stored_user] 308 | assert list(tmp_db["comments"].rows) == [ 309 | stored_comment, 310 | *({**stored_comment, "id": i} for i in "ac"), 311 | ] 312 | assert list(tmp_db["posts"].rows) == [ 313 | stored_self_post, 314 | *({**stored_self_post, "id": i} for i in "df"), 315 | ] 316 | 317 | 318 | def test_loads_data_from_both_sources_archive_first( 319 | tmp_db_path, 320 | mock_info_request: MockInfoFunc, 321 | mock_paged_request: MockPagedFunc, 322 | comment_response, 323 | self_post_response, 324 | archive_dir, 325 | tmp_db: Database, 326 | stored_comment, 327 | stored_self_post, 328 | stored_user, 329 | comment_info_response, 330 | post_info_response, 331 | write_archive_file: WriteArchiveFileFunc, 332 | empty_file_at_path, 333 | ): 334 | # second pass 335 | mock_info_request("t1_a,t1_c", json=comment_info_response) 336 | mock_info_request("t3_d,t3_f", json=post_info_response) 337 | 338 | write_archive_file("comments.csv", ["id", "a", "c"]) 339 | write_archive_file("posts.csv", ["id", "d", "f"]) 340 | 341 | empty_file_at_path("saved_comments.csv") 342 | empty_file_at_path("saved_posts.csv") 343 | 344 | archive_result = CliRunner().invoke( 345 | cli, ["archive", str(archive_dir), "--db", tmp_db_path] 346 | ) 347 | assert not archive_result.exception, print(archive_result.exception) 348 | 349 | assert { 350 | "subreddits", 351 | "users", 352 | "comments", 353 | "comments_fts", 354 | "posts", 355 | "posts_fts", 356 | } <= set(tmp_db.table_names()) 357 | 358 | assert list(tmp_db["users"].rows) == [stored_user] 359 | assert list(tmp_db["comments"].rows) == [{**stored_comment, "id": i} for i in "ac"] 360 | assert list(tmp_db["posts"].rows) == [{**stored_self_post, "id": i} for i in "df"] 361 | 362 | mock_paged_request("comments", json=comment_response) 363 | mock_paged_request("submitted", json=self_post_response) 364 | 365 | api_result = CliRunner().invoke(cli, ["user", "xavdid", "--db", tmp_db_path]) 366 | assert not api_result.exception, print_tb(api_result.exception.__traceback__) 367 | 368 | assert list(tmp_db["subreddits"].rows) == [ 369 | {"id": "2t3ad", "name": "patientgamers", "type": "public"}, 370 | {"id": "32u6q", "name": "KeybaseProofs", "type": "public"}, 371 | ] 372 | assert list(tmp_db["comments"].rows) == [ 373 | *({**stored_comment, "id": i} for i in "ac"), 374 | stored_comment, 375 | ] 376 | assert list(tmp_db["posts"].rows) == [ 377 | *({**stored_self_post, "id": i} for i in "df"), 378 | stored_self_post, 379 | ] 380 | 381 | 382 | def test_adds_username_to_removed_posts_in_mixed_archive( 383 | archive_dir, 384 | tmp_db_path, 385 | tmp_db: Database, 386 | stored_user, 387 | stored_comment, 388 | stored_removed_comment, 389 | stored_self_post, 390 | stored_removed_post, 391 | mock_info_request: MockInfoFunc, 392 | write_archive_file: WriteArchiveFileFunc, 393 | all_comments_response, 394 | all_posts_response, 395 | stored_external_post, 396 | empty_file_at_path, 397 | ): 398 | mock_info_request("t1_jj0ti6f,t1_c3sgfl4", json=all_comments_response) 399 | mock_info_request("t3_uypaav,t3_1f55rr,t3_qwer", json=all_posts_response) 400 | 401 | write_archive_file("comments.csv", ["id", "jj0ti6f", "c3sgfl4"]) 402 | write_archive_file("posts.csv", ["id", "uypaav", "1f55rr", "qwer"]) 403 | empty_file_at_path("saved_comments.csv") 404 | empty_file_at_path("saved_posts.csv") 405 | 406 | api_result = CliRunner().invoke( 407 | 
cli, ["archive", str(archive_dir), "--db", tmp_db_path] 408 | ) 409 | assert not api_result.exception, print(api_result.exception) 410 | 411 | assert list(tmp_db["subreddits"].rows) == [ 412 | {"id": "2t3ad", "name": "patientgamers", "type": "public"}, 413 | {"id": "2qm4e", "name": "askscience", "type": "public"}, 414 | {"id": "32u6q", "name": "KeybaseProofs", "type": "public"}, 415 | {"id": "2qh1e", "name": "videos", "type": "public"}, 416 | ] 417 | assert list(tmp_db["users"].rows) == [stored_user] 418 | assert list(tmp_db["comments"].rows) == [stored_comment, stored_removed_comment] 419 | assert list(tmp_db["posts"].rows) == [ 420 | stored_self_post, 421 | stored_removed_post, 422 | stored_external_post, 423 | ] 424 | 425 | 426 | @pytest.mark.usefixtures("stats_file") 427 | def test_load_username_from_file( 428 | tmp_db: Database, 429 | tmp_db_path, 430 | user_response, 431 | archive_dir, 432 | removed_post_response, 433 | stored_removed_comment, 434 | stored_removed_post, 435 | stored_user, 436 | mock_info_request: MockInfoFunc, 437 | mock_user_request: MockUserFunc, 438 | write_archive_file: WriteArchiveFileFunc, 439 | removed_comment_response, 440 | empty_file_at_path, 441 | ): 442 | mock_info_request("t1_c3sgfl4", json=removed_comment_response) 443 | mock_info_request("t3_1f55rr", json=removed_post_response) 444 | 445 | mock_user_request("xavdid", json=user_response) 446 | 447 | write_archive_file("comments.csv", ["id", "c3sgfl4"]) 448 | write_archive_file("posts.csv", ["id", "1f55rr"]) 449 | empty_file_at_path("saved_comments.csv") 450 | empty_file_at_path("saved_posts.csv") 451 | 452 | api_result = CliRunner().invoke( 453 | cli, ["archive", str(archive_dir), "--db", tmp_db_path] 454 | ) 455 | assert not api_result.exception, print(api_result.exception) 456 | 457 | assert { 458 | "subreddits", 459 | "users", 460 | "comments", 461 | "comments_fts", 462 | "posts", 463 | "posts_fts", 464 | } <= set(tmp_db.table_names()) 465 | 466 | assert list(tmp_db["subreddits"].rows) == [ 467 | {"id": "2qm4e", "name": "askscience", "type": "public"}, 468 | {"id": "2qh1e", "name": "videos", "type": "public"}, 469 | ] 470 | assert list(tmp_db["users"].rows) == [stored_user] 471 | assert list(tmp_db["comments"].rows) == [stored_removed_comment] 472 | assert list(tmp_db["posts"].rows) == [stored_removed_post] 473 | 474 | 475 | def test_missing_username_entirely( 476 | tmp_db: Database, 477 | tmp_db_path, 478 | archive_dir, 479 | removed_post_response, 480 | empty_file_at_path, 481 | mock_info_request: MockInfoFunc, 482 | write_archive_file: WriteArchiveFileFunc, 483 | removed_comment_response, 484 | ): 485 | mock_info_request("t1_c3sgfl4", json=removed_comment_response) 486 | mock_info_request("t3_1f55rr", json=removed_post_response) 487 | 488 | empty_file_at_path("statistics.csv") 489 | 490 | write_archive_file("comments.csv", ["id", "c3sgfl4"]) 491 | write_archive_file("posts.csv", ["id", "1f55rr"]) 492 | empty_file_at_path("saved_comments.csv") 493 | empty_file_at_path("saved_posts.csv") 494 | 495 | api_result = CliRunner().invoke( 496 | cli, ["archive", str(archive_dir), "--db", tmp_db_path] 497 | ) 498 | assert not api_result.exception, print(api_result.exception) 499 | 500 | assert "Unable to guess username" in api_result.output 501 | assert "some data will not be saved." 
in api_result.output 502 | assert "ignored for now" in api_result.output 503 | 504 | assert tmp_db.table_names() == ["subreddits"] 505 | 506 | assert list(tmp_db["subreddits"].rows) == [ 507 | {"id": "2qm4e", "name": "askscience", "type": "public"}, 508 | {"id": "2qh1e", "name": "videos", "type": "public"}, 509 | ] 510 | assert list(tmp_db["users"].rows) == [] 511 | assert list(tmp_db["comments"].rows) == [] 512 | assert list(tmp_db["posts"].rows) == [] 513 | 514 | 515 | def test_load_saved_data( 516 | tmp_db: Database, 517 | tmp_db_path, 518 | archive_dir, 519 | empty_file_at_path, 520 | mock_info_request: MockInfoFunc, 521 | write_archive_file: WriteArchiveFileFunc, 522 | all_comments_response, 523 | stored_user, 524 | deleted_user, 525 | stored_removed_comment_placeholder_user, 526 | stored_comment, 527 | all_posts_response, 528 | stored_self_post, 529 | stored_removed_post_placeholder_user, 530 | stored_external_post, 531 | ): 532 | empty_file_at_path("comments.csv") 533 | empty_file_at_path("posts.csv") 534 | empty_file_at_path("statistics.csv") 535 | write_archive_file("saved_comments.csv", ["id", "jj0ti6f", "c3sgfl4"]) 536 | write_archive_file("saved_posts.csv", ["id", "uypaav", "1f55rr", "qwer"]) 537 | 538 | mock_info_request("t1_jj0ti6f,t1_c3sgfl4", json=all_comments_response) 539 | mock_info_request("t3_uypaav,t3_1f55rr,t3_qwer", json=all_posts_response) 540 | 541 | result = CliRunner().invoke( 542 | cli, 543 | ["archive", str(archive_dir), "--db", tmp_db_path], 544 | ) 545 | assert not result.exception, result.exception 546 | assert result.stdout # not sure why it's in "out" not "err" 547 | assert "saved 2 new comments" in result.stdout 548 | 549 | assert { 550 | "subreddits", 551 | "users", 552 | "saved_comments", 553 | "saved_comments_fts", 554 | "saved_posts", 555 | "saved_posts_fts", 556 | }.issubset(tmp_db.table_names()) 557 | assert "comments" not in tmp_db.table_names() 558 | assert "posts" not in tmp_db.table_names() 559 | 560 | assert list(tmp_db["subreddits"].rows) == [ 561 | {"id": "2t3ad", "name": "patientgamers", "type": "public"}, 562 | {"id": "2qm4e", "name": "askscience", "type": "public"}, 563 | {"id": "32u6q", "name": "KeybaseProofs", "type": "public"}, 564 | {"id": "2qh1e", "name": "videos", "type": "public"}, 565 | ] 566 | # for some reason, .rows was returning rows inconsistently, so I ordered it 567 | assert list(tmp_db["users"].rows_where(order_by="id")) == [ 568 | deleted_user, 569 | stored_user, 570 | ] 571 | assert list(tmp_db["saved_comments"].rows) == [ 572 | stored_comment, 573 | stored_removed_comment_placeholder_user, 574 | ] 575 | assert list(tmp_db["saved_posts"].rows) == [ 576 | stored_self_post, 577 | stored_removed_post_placeholder_user, 578 | stored_external_post, 579 | ] 580 | 581 | 582 | def test_load_data_skip_saved( 583 | tmp_db: Database, 584 | tmp_db_path, 585 | archive_dir, 586 | empty_file_at_path, 587 | write_archive_file: WriteArchiveFileFunc, 588 | ): 589 | empty_file_at_path("comments.csv") 590 | empty_file_at_path("posts.csv") 591 | empty_file_at_path("statistics.csv") 592 | write_archive_file("saved_comments.csv", ["id", "jj0ti6f", "c3sgfl4"]) 593 | write_archive_file("saved_posts.csv", ["id", "uypaav", "1f55rr", "qwer"]) 594 | 595 | result = CliRunner().invoke( 596 | cli, 597 | ["archive", str(archive_dir), "--db", tmp_db_path, "--skip-saved"], 598 | ) 599 | assert not result.exception, result.exception 600 | assert result.stdout # not sure why it's in "out" not "err" 601 | assert "saved 0 new comments" in result.stdout 602 | 
603 | table_names = tmp_db.table_names() 604 | for s in { 605 | "subreddits", 606 | "users", 607 | "saved_comments", 608 | "saved_comments_fts", 609 | "saved_posts", 610 | "saved_posts_fts", 611 | "comments", 612 | "posts", 613 | }: 614 | assert s not in table_names 615 | 616 | assert list(tmp_db["subreddits"].rows) == [] 617 | assert list(tmp_db["users"].rows) == [] 618 | assert list(tmp_db["saved_comments"].rows) == [] 619 | assert list(tmp_db["saved_posts"].rows) == [] 620 | -------------------------------------------------------------------------------- /tests/test_csv_helpers.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | from sqlite_utils import Database 5 | 6 | from reddit_user_to_sqlite.csv_helpers import ( 7 | build_table_name, 8 | get_username_from_archive, 9 | load_unsaved_ids_from_file, 10 | validate_and_build_path, 11 | ) 12 | 13 | 14 | def test_validate_and_build_path(archive_dir, stats_file): 15 | assert validate_and_build_path(archive_dir, "statistics") == stats_file 16 | 17 | 18 | def test_validate_and_build_fails(archive_dir: Path): 19 | with pytest.raises(ValueError) as err: 20 | validate_and_build_path(archive_dir, "posts") 21 | 22 | err_msg = str(err.value) 23 | 24 | assert str(archive_dir) in err_msg 25 | assert 'posts.csv" not found' in err_msg 26 | 27 | 28 | @pytest.mark.usefixtures("comments_file") 29 | def test_load_comment_ids_from_file_empty_db(tmp_db: Database, archive_dir): 30 | assert load_unsaved_ids_from_file(tmp_db, archive_dir, "comments") == [ 31 | "t1_a", 32 | "t1_c", 33 | ] 34 | 35 | 36 | @pytest.mark.usefixtures("comments_file") 37 | def test_load_comment_ids_from_file_non_db(tmp_db: Database, archive_dir): 38 | tmp_db["comments"].insert({"id": "a"}) # type: ignore 39 | 40 | assert load_unsaved_ids_from_file(tmp_db, archive_dir, "comments") == [ 41 | "t1_c", 42 | ] 43 | 44 | 45 | @pytest.mark.usefixtures("saved_comments_file") 46 | def test_load_saved_comment_ids_from_file_empty_db(tmp_db: Database, archive_dir): 47 | assert load_unsaved_ids_from_file( 48 | tmp_db, archive_dir, "comments", prefix="saved_" 49 | ) == [ 50 | "t1_g", 51 | "t1_h", 52 | ] 53 | 54 | 55 | @pytest.mark.usefixtures("saved_comments_file") 56 | def test_load_saved_comment_ids_from_file_non_empty_db(tmp_db: Database, archive_dir): 57 | tmp_db["saved_comments"].insert({"id": "h"}) # type: ignore 58 | 59 | assert load_unsaved_ids_from_file( 60 | tmp_db, archive_dir, "comments", prefix="saved_" 61 | ) == ["t1_g"] 62 | 63 | 64 | def test_load_comment_ids_missing_files(tmp_db: Database, archive_dir): 65 | with pytest.raises(ValueError) as err: 66 | load_unsaved_ids_from_file(tmp_db, archive_dir, "comments") 67 | 68 | err_msg = str(err) 69 | assert 'comments.csv" not found' in err_msg 70 | # LOAD BEARING TEST, DO NOT REMOVE 71 | assert "unzipped Reddit GDPR archive" in err_msg 72 | 73 | 74 | @pytest.mark.usefixtures("posts_file") 75 | def test_load_post_ids_from_file_empty_db(tmp_db: Database, archive_dir): 76 | assert load_unsaved_ids_from_file(tmp_db, archive_dir, "posts") == [ 77 | "t3_d", 78 | "t3_f", 79 | ] 80 | 81 | 82 | @pytest.mark.usefixtures("posts_file") 83 | def test_load_post_ids_from_file_some_db(tmp_db: Database, archive_dir): 84 | tmp_db["posts"].insert({"id": "d"}) # type: ignore 85 | 86 | assert load_unsaved_ids_from_file(tmp_db, archive_dir, "posts") == [ 87 | "t3_f", 88 | ] 89 | 90 | 91 | @pytest.mark.usefixtures("saved_posts_file") 92 | def 
test_load_saved_post_ids_from_file_empty_db(tmp_db: Database, archive_dir): 93 | assert load_unsaved_ids_from_file( 94 | tmp_db, archive_dir, "posts", prefix="saved_" 95 | ) == [ 96 | "t3_j", 97 | "t3_k", 98 | ] 99 | 100 | 101 | @pytest.mark.usefixtures("saved_posts_file") 102 | def test_load_saved_post_ids_from_file_non_empty_db(tmp_db: Database, archive_dir): 103 | tmp_db["saved_posts"].insert({"id": "j"}) # type: ignore 104 | 105 | assert load_unsaved_ids_from_file( 106 | tmp_db, archive_dir, "posts", prefix="saved_" 107 | ) == ["t3_k"] 108 | 109 | 110 | def test_load_post_ids_missing_files(tmp_db: Database, archive_dir): 111 | with pytest.raises(ValueError) as err: 112 | load_unsaved_ids_from_file(tmp_db, archive_dir, "posts") 113 | 114 | assert 'posts.csv" not found' in str(err.value) 115 | 116 | 117 | @pytest.mark.usefixtures("stats_file") 118 | def test_get_username_from_archive(archive_dir): 119 | assert get_username_from_archive(archive_dir) == "xavdid" 120 | 121 | 122 | def test_get_username_from_archive_no_name(archive_dir: Path): 123 | (archive_dir / "statistics.csv").touch() 124 | assert get_username_from_archive(archive_dir) is None 125 | 126 | 127 | def test_get_username_from_archive_missing_file(archive_dir): 128 | with pytest.raises(ValueError) as err: 129 | get_username_from_archive(archive_dir) 130 | 131 | assert 'statistics.csv" not found' in str(err.value) 132 | 133 | 134 | @pytest.mark.parametrize( 135 | ["table_name", "table_prefix", "expected"], 136 | [ 137 | ("comments", None, "comments"), 138 | ("posts", None, "posts"), 139 | ("comments", "saved_", "saved_comments"), 140 | ("posts", "saved_", "saved_posts"), 141 | ], 142 | ) 143 | def test_build_table_name(table_name, table_prefix, expected): 144 | assert build_table_name(table_name, table_prefix) == expected 145 | -------------------------------------------------------------------------------- /tests/test_helpers.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from reddit_user_to_sqlite.helpers import clean_username, find_user_details_from_items 4 | 5 | 6 | @pytest.mark.parametrize( 7 | "username, expected", 8 | [ 9 | ("/u/xavdid", "xavdid"), 10 | ("u/xavdid", "xavdid"), 11 | ("xavdid", "xavdid"), 12 | ("unbelievable", "unbelievable"), 13 | ], 14 | ) 15 | def test_clean_username(username, expected): 16 | assert clean_username(username) == expected 17 | 18 | 19 | # to verify that fixtures that modify previous fixture results don't mutate them 20 | def test_fixture_modifications(self_post, removed_post): 21 | assert self_post != removed_post 22 | 23 | 24 | def test_unique_fixture_ids(self_post, removed_post, external_post): 25 | # all post types should have unique ids 26 | assert len({p["id"] for p in [self_post, removed_post, external_post]}) == 3 27 | 28 | 29 | def test_find_user_details_from_items(): 30 | assert find_user_details_from_items( 31 | [ 32 | {"asdf": 1}, 33 | {"author_fullname": "t2_abc123", "author": "xavdid"}, 34 | ] 35 | ) == ("xavdid", "t2_abc123") 36 | 37 | 38 | def test_fail_to_find_user_details_from_items(): 39 | assert find_user_details_from_items([{"asdf": 1}, {"author": "xavdid"}]) is None 40 | -------------------------------------------------------------------------------- /tests/test_reddit_api.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import MagicMock, patch 2 | 3 | import pytest 4 | 5 | from reddit_user_to_sqlite.reddit_api import ( 6 | PagedResponse, 7 | 
RedditRateLimitException, 8 | _unwrap_response_and_raise, 9 | add_missing_user_fragment, 10 | get_user_id, 11 | load_comments_for_user, 12 | load_info, 13 | load_posts_for_user, 14 | ) 15 | from tests.conftest import MockInfoFunc, MockPagedFunc, MockUserFunc 16 | 17 | 18 | def test_load_comments(mock_paged_request: MockPagedFunc, comment_response, comment): 19 | response = mock_paged_request(resource="comments", json=comment_response) 20 | 21 | assert load_comments_for_user("xavdid") == [comment] 22 | 23 | assert response.call_count == 1 24 | 25 | 26 | @patch("reddit_user_to_sqlite.reddit_api.PAGE_SIZE", new=1) 27 | def test_load_comments_rate_limited( 28 | mock_paged_request: MockPagedFunc, comment_response, comment, rate_limit_headers 29 | ): 30 | good_response = mock_paged_request( 31 | resource="comments", params={"limit": 1}, json=comment_response 32 | ) 33 | bad_response = mock_paged_request( 34 | resource="comments", 35 | params={"limit": 1}, 36 | json={"error": 429}, 37 | headers=rate_limit_headers, 38 | ) 39 | 40 | # despite getting an error, we still got the first comment 41 | assert load_comments_for_user("xavdid") == [comment] 42 | 43 | assert good_response.call_count == 1 44 | assert bad_response.call_count == 1 45 | 46 | 47 | def test_load_posts(mock_paged_request: MockPagedFunc, self_post_response, self_post): 48 | response = mock_paged_request(resource="submitted", json=self_post_response) 49 | 50 | assert load_posts_for_user("xavdid") == [self_post] 51 | assert response.call_count == 1 52 | 53 | 54 | @patch("reddit_user_to_sqlite.reddit_api.PAGE_SIZE", new=1) 55 | def test_loads_10_pages(mock_paged_request: MockPagedFunc, comment_response, comment): 56 | response = mock_paged_request( 57 | resource="comments", params={"limit": 1}, json=comment_response 58 | ) 59 | 60 | assert load_comments_for_user("xavdid") == [comment] * 10 61 | 62 | assert response.call_count == 10 63 | 64 | 65 | @patch("reddit_user_to_sqlite.reddit_api.PAGE_SIZE", new=1) 66 | def test_loads_multiple_pages( 67 | mock_paged_request: MockPagedFunc, comment_response: PagedResponse, comment 68 | ): 69 | comment_response["data"]["after"] = "abc" 70 | first_request = mock_paged_request( 71 | resource="comments", params={"limit": 1}, json=comment_response 72 | ) 73 | 74 | comment_response["data"]["after"] = "def" 75 | second_request = mock_paged_request( 76 | resource="comments", params={"limit": 1, "after": "abc"}, json=comment_response 77 | ) 78 | 79 | comment_response["data"]["children"] = [] 80 | third_request = mock_paged_request( 81 | resource="comments", params={"limit": 1, "after": "def"}, json=comment_response 82 | ) 83 | 84 | comments = load_comments_for_user("xavdid") 85 | 86 | assert first_request.call_count == 1 87 | assert second_request.call_count == 1 88 | assert third_request.call_count == 1 89 | 90 | assert comments == [comment, comment] 91 | 92 | 93 | def test_error_response(mock_paged_request: MockPagedFunc): 94 | mock_paged_request( 95 | resource="comments", json={"error": 500, "message": "you broke reddit"} 96 | ) 97 | 98 | with pytest.raises(ValueError) as err: 99 | load_comments_for_user("xavdid") 100 | 101 | assert ( 102 | str(err.value) == "Received API error from Reddit (code 500): you broke reddit" 103 | ) 104 | 105 | 106 | def test_load_info(mock_info_request: MockInfoFunc, comment_response, comment): 107 | mock_info_request("a,b,c", json=comment_response) 108 | 109 | assert load_info(["a", "b", "c"]) == [comment] 110 | 111 | 112 | 
@patch("reddit_user_to_sqlite.reddit_api.PAGE_SIZE", new=2) 113 | def test_load_info_pages(mock_info_request: MockInfoFunc, comment_response, comment): 114 | mock_info_request("a,b", json=comment_response, limit=2) 115 | mock_info_request("c,d", json=comment_response, limit=2) 116 | mock_info_request("e", json=comment_response, limit=2) 117 | 118 | assert load_info(["a", "b", "c", "d", "e"]) == [comment] * 3 119 | 120 | 121 | @patch("reddit_user_to_sqlite.reddit_api.PAGE_SIZE", new=2) 122 | def test_load_info_pages_with_rate_limit( 123 | mock_info_request: MockInfoFunc, comment_response, comment, rate_limit_headers 124 | ): 125 | mock_info_request("a,b", json=comment_response, limit=2) 126 | mock_info_request("c,d", json=comment_response, limit=2) 127 | mock_info_request("e", json={"error": 429}, limit=2, headers=rate_limit_headers) 128 | 129 | # call for e fails, but we still got the first ones 130 | assert load_info(["a", "b", "c", "d", "e"]) == [comment] * 2 131 | 132 | 133 | def test_load_info_empty(mock_info_request: MockInfoFunc, empty_response): 134 | mock_info_request("a,b,c,d,e,f,g,h", json=empty_response) 135 | 136 | assert load_info(["a", "b", "c", "d", "e", "f", "g", "h"]) == [] 137 | 138 | 139 | def test_unwrap_and_raise_passes_good_responses_through(): 140 | response = {"neat": True} 141 | assert _unwrap_response_and_raise(MagicMock(json=lambda: response)) == response 142 | 143 | 144 | def test_unwrap_and_raise_raises_unknown_errors(): 145 | with pytest.raises(ValueError) as err: 146 | _unwrap_response_and_raise( 147 | MagicMock(json=lambda: {"error": 123, "message": "cool"}) 148 | ) 149 | assert str(err.value) == "Received API error from Reddit (code 123): cool" 150 | 151 | 152 | def test_unwrap_and_raise_raises_rate_limit_errors(rate_limit_headers): 153 | with pytest.raises(RedditRateLimitException) as err: 154 | _unwrap_response_and_raise( 155 | MagicMock( 156 | json=lambda: {"error": 429, "message": "cool"}, 157 | headers=rate_limit_headers, 158 | ) 159 | ) 160 | 161 | e = err.value 162 | 163 | assert e.used == 4 164 | assert e.remaining == 6 165 | assert e.window_total == 10 166 | assert e.reset_after_seconds == 20 167 | assert e.stats == "Used 4/10 requests (resets in 20 seconds)" 168 | 169 | 170 | def test_get_user_id(mock_user_request: MockUserFunc, user_response): 171 | mock_user_request("xavdid", json=user_response) 172 | 173 | assert get_user_id("xavdid") == "np8mb41h" 174 | 175 | 176 | def test_get_user_id_unknown_user(mock_user_request: MockUserFunc): 177 | mock_user_request("xavdid", json={"message": "Not Found", "error": 404}) 178 | with pytest.raises(ValueError): 179 | get_user_id("xavdid") 180 | 181 | 182 | def test_add_missing_user_fragment(): 183 | items = [{"a": 1}, {"a": 2}, {"a": 3}] 184 | assert add_missing_user_fragment(items, "xavdid", "t2_abc123") == [ # type: ignore 185 | {"a": 1, "author": "xavdid", "author_fullname": "t2_abc123"}, 186 | {"a": 2, "author": "xavdid", "author_fullname": "t2_abc123"}, 187 | {"a": 3, "author": "xavdid", "author_fullname": "t2_abc123"}, 188 | ] 189 | 190 | 191 | def test_add_missing_user_fragment_no_overwrite(): 192 | items = [{"a": 1}, {"author": "david", "author_fullname": "t2_def456"}] 193 | 194 | assert add_missing_user_fragment(items, "xavdid", "t2_abc123") == [ # type: ignore 195 | {"a": 1, "author": "xavdid", "author_fullname": "t2_abc123"}, 196 | {"author": "david", "author_fullname": "t2_def456"}, 197 | ] 198 | -------------------------------------------------------------------------------- 
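A worked sketch of the rate-limit arithmetic that `test_unwrap_and_raise_raises_rate_limit_errors` pins down. The numbers come straight from the `rate_limit_headers` fixture; how `RedditRateLimitException` actually derives them internally is an assumption here:

    headers = {"x-ratelimit-used": "4", "x-ratelimit-remaining": "6", "x-ratelimit-reset": "20"}

    used = int(headers["x-ratelimit-used"])                  # 4
    remaining = int(headers["x-ratelimit-remaining"])        # 6
    window_total = used + remaining                          # 4 + 6 == 10
    reset_after_seconds = int(headers["x-ratelimit-reset"])  # 20

    # matches the asserted `e.stats` string exactly
    assert (
        f"Used {used}/{window_total} requests (resets in {reset_after_seconds} seconds)"
        == "Used 4/10 requests (resets in 20 seconds)"
    )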
/tests/test_sqlite_helpers.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Optional 2 | 3 | import pytest 4 | from pytest import FixtureRequest 5 | from sqlite_utils import Database 6 | from sqlite_utils.db import ForeignKey, NotFoundError 7 | 8 | from reddit_user_to_sqlite.reddit_api import ( 9 | Comment, 10 | Post, 11 | SubredditFragment, 12 | UserFragment, 13 | ) 14 | from reddit_user_to_sqlite.sqlite_helpers import ( 15 | CommentRow, 16 | comment_to_comment_row, 17 | insert_users, 18 | item_to_subreddit_row, 19 | item_to_user_row, 20 | post_to_post_row, 21 | upsert_comments, 22 | upsert_posts, 23 | upsert_subreddits, 24 | ) 25 | 26 | 27 | @pytest.fixture 28 | def make_sr(): 29 | def make_subreddit(name: str, id_=None, type_="public") -> SubredditFragment: 30 | # returns the relevant sub-portions of 31 | return { 32 | "subreddit": name, 33 | "subreddit_id": f"t5_{id_ or name}", 34 | "subreddit_type": type_, 35 | } 36 | 37 | return make_subreddit 38 | 39 | 40 | MakeUserFunc = Callable[[str], UserFragment] 41 | 42 | 43 | @pytest.fixture 44 | def make_user() -> MakeUserFunc: 45 | def _make_user(name: str, id_: Optional[str] = None) -> UserFragment: 46 | return {"author_fullname": f"t2_{id_ or name[::-1]}", "author": name} 47 | 48 | return _make_user 49 | 50 | 51 | def test_insert_subreddits(tmp_db: Database, make_sr): 52 | upsert_subreddits( 53 | tmp_db, 54 | [ 55 | make_sr("Games"), 56 | make_sr("JRPG", type_="private"), 57 | ], 58 | ) 59 | 60 | assert "subreddits" in tmp_db.table_names() 61 | assert list(tmp_db["subreddits"].rows) == [ 62 | {"id": "Games", "name": "Games", "type": "public"}, 63 | {"id": "JRPG", "name": "JRPG", "type": "private"}, 64 | ] 65 | 66 | 67 | @pytest.mark.skip( 68 | "skipped because of a sqlite-utils bug; subreddits to get upserted right now" 69 | ) 70 | def test_repeat_subs_ignored(tmp_db: Database, make_sr): 71 | upsert_subreddits( 72 | tmp_db, 73 | [ 74 | make_sr("Games"), 75 | make_sr("JRPG", type_="private"), 76 | ], 77 | ) 78 | 79 | # updates are ignored 80 | upsert_subreddits( 81 | tmp_db, 82 | [ 83 | make_sr("ames", id_="Games"), 84 | make_sr("RPG", id_="JRPG"), 85 | make_sr("Apple"), 86 | ], 87 | ) 88 | 89 | assert "subreddits" in tmp_db.table_names() 90 | assert list(tmp_db["subreddits"].rows) == [ 91 | {"id": "Games", "name": "Games", "type": "public"}, 92 | {"id": "JRPG", "name": "JRPG", "type": "private"}, 93 | {"id": "Apple", "name": "Apple", "type": "public"}, 94 | ] 95 | 96 | 97 | def test_insert_user(tmp_db: Database, make_user: MakeUserFunc): 98 | insert_users(tmp_db, [make_user("xavdid")]) 99 | 100 | assert "users" in tmp_db.table_names() 101 | assert list(tmp_db["users"].rows) == [ 102 | {"id": "didvax", "username": "xavdid"}, 103 | ] 104 | 105 | 106 | def test_insert_user_missing(tmp_db: Database, make_user: MakeUserFunc): 107 | user = make_user("xavdid") 108 | user.pop("author_fullname") 109 | insert_users(tmp_db, [user]) 110 | 111 | assert "users" not in tmp_db.table_names() 112 | 113 | 114 | def test_insert_comments( 115 | tmp_db: Database, comment: Comment, stored_comment: CommentRow 116 | ): 117 | upsert_subreddits(tmp_db, [comment]) 118 | insert_users(tmp_db, [comment]) 119 | 120 | comment_without_user = comment.copy() 121 | comment.pop("author_fullname") 122 | 123 | upsert_comments(tmp_db, [comment, comment_without_user]) 124 | 125 | assert {"subreddits", "users", "comments"}.issubset(tmp_db.table_names()) 126 | 127 | assert list(tmp_db["comments"].rows) == 
[stored_comment] 128 | 129 | assert tmp_db["comments"].foreign_keys == [ # type: ignore 130 | ForeignKey("comments", "subreddit", "subreddits", "id"), 131 | ForeignKey("comments", "user", "users", "id"), 132 | ] 133 | 134 | failure_reasons = [] 135 | for k in ["user", "subreddit"]: 136 | try: 137 | tmp_db[f"{k}s"].get(stored_comment[k]) # type: ignore 138 | except NotFoundError: 139 | failure_reasons.append(f"broken foreign key relationship for comment.{k}") 140 | 141 | if failure_reasons: 142 | pytest.fail(", ".join(failure_reasons)) 143 | 144 | 145 | def test_update_comments(tmp_db: Database, comment: Comment, stored_comment): 146 | upsert_subreddits(tmp_db, [comment]) 147 | insert_users(tmp_db, [comment]) 148 | upsert_comments(tmp_db, [comment]) 149 | 150 | assert list(tmp_db["comments"].rows) == [stored_comment] 151 | 152 | assert comment["score"] != 10 153 | comment["score"] = 10 154 | upsert_comments(tmp_db, [comment]) 155 | 156 | updated_comment = tmp_db["comments"].get(comment["id"]) # type: ignore 157 | assert updated_comment["score"] == 10 158 | 159 | 160 | # https://engineeringfordatascience.com/posts/pytest_fixtures_with_parameterize/ 161 | @pytest.mark.parametrize( 162 | ["post_type", "stored_post_type"], 163 | [ 164 | ("self_post", "stored_self_post"), 165 | # ("removed_post", "stored_removed_post"), 166 | ("external_post", "stored_external_post"), 167 | ], 168 | ) 169 | def test_insert_posts( 170 | tmp_db: Database, request: FixtureRequest, post_type: str, stored_post_type: str 171 | ): 172 | post: Post = request.getfixturevalue(post_type) 173 | stored_post = request.getfixturevalue(stored_post_type) 174 | 175 | no_user_post = post.copy() 176 | no_user_post.pop("author_fullname") 177 | 178 | upsert_subreddits(tmp_db, [post]) 179 | insert_users(tmp_db, [post]) 180 | 181 | upsert_posts(tmp_db, [post, no_user_post]) 182 | 183 | assert {"subreddits", "users", "posts"}.issubset(tmp_db.table_names()) 184 | 185 | assert list(tmp_db["posts"].rows) == [stored_post] 186 | 187 | assert tmp_db["posts"].foreign_keys == [ # type: ignore 188 | ForeignKey("posts", "subreddit", "subreddits", "id"), 189 | ForeignKey("posts", "user", "users", "id"), 190 | ] 191 | 192 | failure_reasons = [] 193 | for k in ["user", "subreddit"]: 194 | try: 195 | tmp_db[f"{k}s"].get(stored_post[k]) # type: ignore 196 | except NotFoundError: 197 | failure_reasons.append(f"broken foreign key relationship for comment.{k}") 198 | 199 | if failure_reasons: 200 | pytest.fail(", ".join(failure_reasons)) 201 | 202 | 203 | @pytest.mark.parametrize( 204 | ["item", "expected"], 205 | [ 206 | ( 207 | {"author_fullname": "t1_abc123", "author": "xavdid"}, 208 | {"id": "abc123", "username": "xavdid"}, 209 | ), 210 | ({"author": "xavdid"}, None), 211 | ], 212 | ) 213 | def test_item_to_user_row(item, expected): 214 | assert item_to_user_row(item) == expected 215 | 216 | 217 | @pytest.mark.parametrize( 218 | ["item", "expected"], 219 | [ 220 | ( 221 | { 222 | "subreddit_id": "t3_abc123", 223 | "subreddit": "Games", 224 | "subreddit_type": "public", 225 | }, 226 | {"id": "abc123", "name": "Games", "type": "public"}, 227 | ), 228 | # ({}, None), 229 | ], 230 | ) 231 | def test_item_to_subreddit_row(item, expected): 232 | assert item_to_subreddit_row(item) == expected 233 | 234 | 235 | def test_comment_to_comment_row(comment, stored_comment): 236 | assert comment_to_comment_row(comment) == stored_comment 237 | 238 | 239 | def test_comment_to_comment_row_missing_user(comment): 240 | comment.pop("author_fullname") 241 | assert 
comment_to_comment_row(comment) is None 242 | 243 | 244 | def test_post_to_post_row(self_post, stored_self_post): 245 | assert post_to_post_row(self_post) == stored_self_post 246 | 247 | 248 | def test_post_to_post_row_missing_user(self_post): 249 | self_post.pop("author_fullname") 250 | assert post_to_post_row(self_post) is None 251 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | env_list = 3 | py3{9,10,11} 4 | minversion = 4.6.0 5 | isolated_build = True 6 | 7 | [testenv] 8 | description = run the tests with pytest 9 | package = wheel 10 | wheel_build_env = .pkg 11 | 12 | commands = 13 | pip install .[test] 14 | python -m pytest {tty:--color=yes} {posargs} 15 | --------------------------------------------------------------------------------
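For readers unfamiliar with tox: each environment above installs the package with its test extras and then runs pytest, with `{tty:--color=yes}` applied only when a terminal is attached and `{posargs}` forwarding any extra CLI arguments (for example the `--include-live` flag defined in tests/conftest.py). A rough, illustrative Python equivalent of one environment's `commands` block:

    import subprocess
    import sys

    # what `pip install .[test]` followed by `python -m pytest {posargs}` amounts to;
    # "--include-live" stands in for whatever posargs a user passes after `tox --`
    subprocess.run([sys.executable, "-m", "pip", "install", ".[test]"], check=True)
    subprocess.run([sys.executable, "-m", "pytest", "--include-live"], check=True)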