├── tests ├── __init__.py ├── test_utils │ ├── __init__.py │ ├── test_DirInit.py │ ├── test_Tools.py │ ├── test_Global.py │ └── test_Utilities.py ├── test_analytics │ ├── __init__.py │ ├── test_utils │ │ ├── __init__.py │ │ └── test_PrepData.py │ ├── test_Wordcloud.py │ └── test_Frequencies.py ├── test_praw_scrapers │ ├── __init__.py │ ├── test_utils │ │ └── __init__.py │ ├── test_live_scrapers │ │ ├── __init__.py │ │ ├── test_utils │ │ │ ├── __init__.py │ │ │ └── test_StreamGenerator.py │ │ └── test_Livestream.py │ └── test_static_scrapers │ │ ├── __init__.py │ │ ├── test_Basic.py │ │ └── test_Redditor.py ├── test_pushshift_scrapers │ └── __init__.py └── conftest.py ├── urs ├── __init__.py ├── utils │ ├── __init__.py │ ├── DirInit.py │ ├── Global.py │ ├── Tools.py │ ├── Utilities.py │ └── Titles.py ├── analytics │ ├── __init__.py │ ├── utils │ │ └── __init__.py │ ├── Wordcloud.py │ └── Frequencies.py ├── praw_scrapers │ ├── __init__.py │ ├── utils │ │ ├── __init__.py │ │ ├── Objectify.py │ │ └── Validation.py │ ├── live_scrapers │ │ ├── __init__.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── StreamGenerator.py │ │ │ └── DisplayStream.py │ └── static_scrapers │ │ ├── __init__.py │ │ └── Comments.py ├── Version.py └── Urs.py ├── manual ├── .gitignore ├── book.toml └── src │ ├── contributing │ ├── building-on-top-of-urs.md │ ├── making-pull-or-feature-requests.md │ └── before-making-pull-or-feature-requests.md │ ├── utilities │ ├── rate-limit-checking.md │ └── tree.md │ ├── derivative-projects.md │ ├── installation.md │ ├── introduction.md │ ├── additional-information │ ├── 2fa-information.md │ └── error-messages.md │ ├── credentials.md │ ├── scraping-reddit │ ├── scrape-speeds-and-rate-limits.md │ ├── all-attributes-table.md │ ├── submission-comments.md │ ├── subreddit.md │ └── redditor.md │ ├── contributors.md │ ├── README.md │ ├── SUMMARY.md │ ├── livestreaming-reddit │ ├── livestreaming-subreddits-and-redditors.md │ └── general-information.md │ ├── analytical-tools │ ├── frequencies-and-wordclouds.md │ └── general-information.md │ ├── exporting.md │ └── implementation-details │ └── the-forest.md ├── .github ├── FUNDING.yml ├── workflows │ ├── manual.yml │ ├── rust.yml │ └── python.yml ├── ISSUE_TEMPLATE │ ├── FEATURE_REQUEST.md │ └── BUG_REPORT.md ├── CODE_OF_CONDUCT.md ├── STYLE_GUIDE.md └── PULL_REQUEST_TEMPLATE.md ├── poetry.toml ├── .gitignore ├── rustfmt.toml ├── .env ├── Cargo.toml ├── pyproject.toml ├── taisun ├── lib.rs └── comments.rs ├── LICENSE ├── README.md └── supplemental_docs └── The Forest.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /urs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /urs/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /manual/.gitignore: -------------------------------------------------------------------------------- 1 | book 2 | -------------------------------------------------------------------------------- /tests/test_utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /urs/analytics/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_analytics/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /urs/analytics/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /urs/praw_scrapers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_praw_scrapers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /urs/praw_scrapers/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_analytics/test_utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_pushshift_scrapers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /urs/praw_scrapers/live_scrapers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: JosephLai241 2 | -------------------------------------------------------------------------------- /tests/test_praw_scrapers/test_utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /urs/praw_scrapers/static_scrapers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /poetry.toml: -------------------------------------------------------------------------------- 1 | [virtualenvs] 2 | in-project = true 3 | -------------------------------------------------------------------------------- /urs/praw_scrapers/live_scrapers/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_praw_scrapers/test_live_scrapers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_praw_scrapers/test_static_scrapers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_praw_scrapers/test_live_scrapers/test_utils/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | 3 | scrapes/* 4 | target/* 5 | 6 | Cargo.lock 7 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | group_imports = "StdExternalCrate" 2 | imports_granularity = "Crate" 3 | -------------------------------------------------------------------------------- /urs/Version.py: -------------------------------------------------------------------------------- 1 | """ 2 | Version 3 | ======= 4 | Defining the version number in one place. 5 | """ 6 | 7 | __version__ = "3.4.0" 8 | -------------------------------------------------------------------------------- /manual/book.toml: -------------------------------------------------------------------------------- 1 | [book] 2 | authors = ["Joseph Lai"] 3 | language = "en" 4 | multilingual = false 5 | src = "src" 6 | title = "URS User Guide" 7 | -------------------------------------------------------------------------------- /.env: -------------------------------------------------------------------------------- 1 | # PRAW Credentials 2 | 3 | # Personal use script (14 characters) 4 | CLIENT_ID="14_CHAR_HERE" 5 | 6 | # Secret key (27 characters) 7 | CLIENT_SECRET="27_CHAR_HERE" 8 | 9 | # App name 10 | USER_AGENT="APP_NAME_HERE" 11 | 12 | # Reddit username 13 | REDDIT_USERNAME="REDDIT_USERNAME_HERE" 14 | 15 | # Reddit password 16 | REDDIT_PASSWORD="REDDIT_PASSWORD_HERE" 17 | -------------------------------------------------------------------------------- /manual/src/contributing/building-on-top-of-urs.md: -------------------------------------------------------------------------------- 1 | # Building on Top of `URS` 2 | 3 | Although I will not approve requests that deviate from the project scope, feel free to reach out if you have built something on top of `URS` or have made modifications to scrape something specific on Reddit. I will add your project to the [Derivative Projects](../derivative-projects.md) section! 4 | -------------------------------------------------------------------------------- /manual/src/utilities/rate-limit-checking.md: -------------------------------------------------------------------------------- 1 | # Check PRAW Rate Limits 2 | 3 | ![Check PRAW Rate Limits Demo GIF][check praw rate limits demo] 4 | 5 | You can quickly check the rate limits for your account by using this flag. 6 | 7 | ``` 8 | poetry run Urs.py --check 9 | ``` 10 | 11 | [check praw rate limits demo]: https://github.com/JosephLai241/URS/blob/demo-gifs/utilities/check_rate_limit_demo.gif?raw=true 12 | -------------------------------------------------------------------------------- /manual/src/derivative-projects.md: -------------------------------------------------------------------------------- 1 | # Derivative Projects 2 | 3 | This is a showcase for projects that are built on top of URS! 4 | 5 | ## [skiwheelr/URS][skiwheelr project link] 6 | 7 | ![skiwheelr project output screenshot][skiwheelr screenshot] 8 | 9 | Contains a bash script built on URS which counts ticker mentions in Subreddits, subsequently cURLs all the relevant links in parallel, and counts the mentions of those. 
10 | 11 | [skiwheelr project link]: https://github.com/skiwheelr/URS 12 | [skiwheelr screenshot]: https://i.imgur.com/ChHdAZv.png 13 | -------------------------------------------------------------------------------- /tests/test_utils/test_DirInit.py: -------------------------------------------------------------------------------- 1 | """ 2 | Testing `DirInit.py`. 3 | """ 4 | 5 | 6 | import os 7 | 8 | from urs.utils.DirInit import InitializeDirectory 9 | 10 | 11 | class TestInitializeDirectoryCreateDirsMethod: 12 | """ 13 | Testing InitializeDirectory class create_dirs() method. 14 | """ 15 | 16 | def test_create_dirs_method(self): 17 | test_path = "../scrapes/test_dir/another_test_dir/a_final_dir" 18 | 19 | InitializeDirectory.create_dirs(test_path) 20 | 21 | assert True if os.path.isdir(test_path) else False 22 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | authors = ["Joseph Lai "] 3 | description = "The heavy lifter for URS" 4 | edition = "2021" 5 | homepage = "https://github.com/JosephLai241/URS" 6 | license = "MIT" 7 | name = "taisun" 8 | repository = "https://github.com/JosephLai241/URS" 9 | version = "1.0.0" 10 | 11 | [lib] 12 | crate-type = ["cdylib"] 13 | name = "taisun" 14 | path = "taisun/lib.rs" 15 | 16 | [dependencies] 17 | pyo3 = { version = "0.17.3", features = ["extension-module", "serde"] } 18 | serde = { version = "1.0.148", features = ["derive"] } 19 | serde_json = "1.0.89" 20 | -------------------------------------------------------------------------------- /manual/src/contributing/making-pull-or-feature-requests.md: -------------------------------------------------------------------------------- 1 | # Making Pull or Feature Requests 2 | 3 | You can suggest new features or changes by going to the [Issues tab][issues] and fill out the Feature Request template. If there is a good reason for a new feature, I will consider adding it. 4 | 5 | You are also more than welcome to create a pull request -- adding additional features, improving runtime, or refactoring existing code. If it is approved, I will merge the pull request into the master branch and credit you for contributing to this project. 6 | 7 | [issues]: https://github.com/JosephLai241/URS/issues 8 | -------------------------------------------------------------------------------- /manual/src/installation.md: -------------------------------------------------------------------------------- 1 | > **_NOTE:_ Requires Python 3.11+ and [Poetry][poetry installation page] installed on your system.** 2 | 3 | Run the following commands to install `URS`: 4 | 5 | ``` 6 | git clone --depth=1 https://github.com/JosephLai241/URS.git 7 | cd URS 8 | poetry install 9 | poetry shell 10 | maturin develop --release 11 | ``` 12 | 13 | > **_TIP:_** If `poetry shell` does not activate the virtual environment created by `Poetry`, run the following command to activate it: 14 | > 15 | > ``` 16 | > source .venv/bin/activate 17 | > ``` 18 | 19 | [poetry installation page]: https://python-poetry.org/docs/#installation 20 | -------------------------------------------------------------------------------- /urs/utils/DirInit.py: -------------------------------------------------------------------------------- 1 | """ 2 | Initialize directories 3 | ====================== 4 | Initialize directories in which scraped or analytical data is stored. 
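
A minimal usage sketch, where the target path is only an example; `create_dirs()`
creates every missing level in the path::

    from urs.utils.DirInit import InitializeDirectory

    InitializeDirectory.create_dirs("../scrapes/2021-06-22/subreddits")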
5 | """ 6 | 7 | 8 | import os 9 | 10 | 11 | class InitializeDirectory: 12 | """ 13 | Methods for initializing directories for the exported files. 14 | """ 15 | 16 | @staticmethod 17 | def create_dirs(path: str) -> None: 18 | """ 19 | Make directories for scrape files. 20 | 21 | :param str path: The path to the directories in which scrape files are 22 | saved. 23 | """ 24 | 25 | if not os.path.isdir(path): 26 | os.makedirs(path) 27 | -------------------------------------------------------------------------------- /manual/src/introduction.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | This is a comprehensive Reddit scraping tool that integrates multiple features: 4 | 5 | - Scrape Reddit via [`PRAW`][praw] (the official Python Reddit API Wrapper) 6 | - Scrape Subreddits 7 | - Scrape Redditors 8 | - Scrape submission comments 9 | - Livestream Reddit via `PRAW` 10 | - Livestream comments submitted within Subreddits or by Redditors 11 | - Livestream submissions submitted within Subreddits or by Redditors 12 | - Analytical tools for scraped data 13 | - Generate frequencies for words that are found in submission titles, bodies, and/or comments 14 | - Generate a wordcloud from scrape results 15 | 16 | [praw]: https://pypi.org/project/praw/ 17 | -------------------------------------------------------------------------------- /.github/workflows/manual.yml: -------------------------------------------------------------------------------- 1 | name: Deploy Manual 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | 8 | jobs: 9 | deploy: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - uses: actions/checkout@v3 14 | with: 15 | fetch-depth: 0 16 | 17 | - name: Setup mdBook 18 | uses: peaceiris/actions-mdbook@v1 19 | with: 20 | mdbook-version: "latest" 21 | 22 | - name: Build manual 23 | run: mdbook build 24 | working-directory: manual 25 | 26 | - name: Deploy to GitHub Pages 27 | uses: JamesIves/github-pages-deploy-action@v4.2.5 28 | with: 29 | branch: gh-pages 30 | folder: manual/book 31 | -------------------------------------------------------------------------------- /manual/src/contributing/before-making-pull-or-feature-requests.md: -------------------------------------------------------------------------------- 1 | # Before Making Pull or Feature Requests 2 | 3 | Consider the scope of this project before submitting a pull or feature request. `URS` stands for Universal Reddit Scraper. Two important aspects are listed in its name - _universal_ and _scraper_. 4 | 5 | I will not approve feature or pull requests that deviate from its sole purpose. This may include scraping a specific aspect of Reddit or [adding functionality that allows you to post a comment with `URS`][commenting feature request]. Adding either of these requests will no longer allow `URS` to be universal or merely a scraper. However, I am more than happy to approve requests that enhance the current scraping capabilities of `URS`. 6 | 7 | [commenting feature request]: https://github.com/JosephLai241/URS/issues/17 8 | -------------------------------------------------------------------------------- /manual/src/additional-information/2fa-information.md: -------------------------------------------------------------------------------- 1 | # Two-Factor Authentication 2 | 3 | If you choose to use 2FA with your Reddit account, enter your password followed by a colon and then your 2FA token in the `password` field on line 26. 
For example, if your password is `"p4ssw0rd"` and your 2FA token is `"123456"`, you will enter `"p4ssw0rd:123456"` in the `password` field. 4 | 5 | **2FA is NOT recommended for use with this program.** This is because PRAW will raise an OAuthException after one hour, prompting you to refresh your 2FA token and re-enter your credentials. Additionally, this means your 2FA token would be stored alongside your Reddit username and password, which would defeat the purpose of enabling 2FA in the first place. See [here](https://praw.readthedocs.io/en/latest/getting_started/authentication.html#two-factor-authentication) for more information. 6 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "urs" 3 | version = "3.4.0" 4 | description = "URS (Universal Reddit Scraper): A comprehensive Reddit scraping and OSINT command-line tool" 5 | authors = ["Joseph Lai "] 6 | license = "MIT" 7 | readme = "README.md" 8 | 9 | [tool.poetry.dependencies] 10 | python = "^3.11" 11 | colorama = "^0.4.6" 12 | halo = "^0.0.31" 13 | praw = "^7.7.0" 14 | prettytable = "^3.7.0" 15 | python-dotenv = "^1.0.0" 16 | rich = "^13.3.5" 17 | wordcloud = "^1.9.1.1" 18 | 19 | [tool.poetry.group.dev.dependencies] 20 | black = "^23.3.0" 21 | coverage = "^7.2.4" 22 | isort = "^5.12.0" 23 | maturin = "^0.14.17" 24 | pytest = "^7.3.1" 25 | pytest-cov = "^4.0.0" 26 | 27 | [tool.maturin] 28 | features = ["pyo3/extension-module"] 29 | python-source = "urs" 30 | 31 | [build-system] 32 | requires = ["maturin>=0.14,<0.15", "poetry-core"] 33 | build-backend = "maturin" 34 | -------------------------------------------------------------------------------- /.github/workflows/rust.yml: -------------------------------------------------------------------------------- 1 | name: Rust code checks 2 | 3 | on: 4 | pull_request: 5 | branches-ignore: 6 | - "demo-gifs" 7 | - "gh-pages" 8 | - "rust-demo" 9 | - "samples" 10 | 11 | jobs: 12 | rustfmt: 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v3 17 | 18 | - uses: dtolnay/rust-toolchain@stable 19 | 20 | - name: Rustfmt check 21 | uses: mbrobbel/rustfmt-check@master 22 | with: 23 | token: ${{ secrets.GITHUB_TOKEN }} 24 | 25 | clippy: 26 | runs-on: ubuntu-latest 27 | 28 | steps: 29 | - uses: actions/checkout@v3 30 | 31 | - name: Add clippy via rustup 32 | run: rustup component add clippy 33 | 34 | - name: Clippy check 35 | uses: actions-rs/clippy-check@v1 36 | with: 37 | token: ${{ secrets.GITHUB_TOKEN }} 38 | args: --all-features 39 | -------------------------------------------------------------------------------- /taisun/lib.rs: -------------------------------------------------------------------------------- 1 | //! `taisun` - The heavy lifter for `URS`. 2 | 3 | use pyo3::{prelude::*, types::PyDict}; 4 | 5 | use comments::{CommentNode, Forest}; 6 | 7 | mod comments; 8 | 9 | /// This module contains utilities for submission comments scraping. 10 | #[pymodule] 11 | fn comments_utils(_python: Python, module: &PyModule) -> PyResult<()> { 12 | module.add_class::()?; 13 | module.add_class::()?; 14 | 15 | Ok(()) 16 | } 17 | 18 | /// `taisun` - The heavy lifter for `URS`. 
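///
/// A hypothetical usage sketch from the Python side, assuming the extension has
/// already been built into the active virtual environment with
/// `maturin develop --release`:
///
/// ```python
/// from taisun.comments_utils import CommentNode, Forest
/// ```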
19 | #[pymodule] 20 | fn taisun(python: Python, module: &PyModule) -> PyResult<()> { 21 | let comments_utils = pyo3::wrap_pymodule!(comments_utils); 22 | module.add_wrapped(comments_utils)?; 23 | 24 | let sys = PyModule::import(python, "sys")?; 25 | let sys_modules: &PyDict = sys.getattr("modules")?.downcast()?; 26 | sys_modules.set_item("taisun.comments_utils", module.getattr("comments_utils")?)?; 27 | 28 | Ok(()) 29 | } 30 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | """ 2 | Cleanup scripts that are run after tests are done. 3 | """ 4 | 5 | 6 | from pathlib import Path 7 | 8 | import pytest 9 | 10 | 11 | def remove_directories(directory): 12 | """ 13 | Recursively remove directories created by `pytest`. 14 | 15 | Parameters 16 | ---------- 17 | directory: Path 18 | 19 | Returns 20 | ------- 21 | None 22 | """ 23 | 24 | directory = Path(directory) 25 | for item in directory.iterdir(): 26 | remove_directories(item) if item.is_dir() else item.unlink() 27 | 28 | directory.rmdir() 29 | 30 | 31 | @pytest.hookimpl(trylast=True) 32 | def pytest_sessionfinish(): 33 | """ 34 | Clean up after `pytest` is done running tests. 35 | """ 36 | 37 | print("\nCleaning up tests...") 38 | 39 | try: 40 | remove_directories(Path("../scrapes")) 41 | print("Done.") 42 | except Exception as e: 43 | print("An error has occurred: %s" % e) 44 | -------------------------------------------------------------------------------- /tests/test_analytics/test_Wordcloud.py: -------------------------------------------------------------------------------- 1 | """ 2 | Testing `Wordcloud.py`. 3 | """ 4 | 5 | 6 | from urs.analytics import Wordcloud 7 | 8 | 9 | class TestSetUpWordcloudInitializeWordcloudMethod: 10 | """ 11 | Testing SetUpWordcloud class initialize_wordcloud() method. 12 | """ 13 | 14 | def test_initialize_wordcloud_method(self): 15 | pass 16 | 17 | 18 | class TestSetUpWordcloudModifyWordcloudMethod: 19 | """ 20 | Testing SetUpWordcloud class modify_wordcloud() method. 21 | """ 22 | 23 | def test_modify_wordcloud_method(self): 24 | pass 25 | 26 | 27 | class TestFinalizeWordcloudShowWordcloudMethod: 28 | """ 29 | Testing FinalizeWordcloud class show_wordcloud() method. 30 | """ 31 | 32 | def test_show_wordcloud_method(self): 33 | pass 34 | 35 | 36 | class TestFinalizeWordcloudSaveWordcloudMethod: 37 | """ 38 | Testing FinalizeWordcloud class save_wordcloud() method. 39 | """ 40 | 41 | def test_save_wordcloud_method(self): 42 | pass 43 | -------------------------------------------------------------------------------- /manual/src/utilities/tree.md: -------------------------------------------------------------------------------- 1 | # Display Directory Tree 2 | 3 | ![Display Directory Tree Demo GIF][display directory tree demo] 4 | 5 | ## All Flags 6 | 7 | These are all the flags that may be used when displaying the directory tree. 8 | 9 | ``` 10 | [-t []] 11 | ``` 12 | 13 | ## Usage 14 | 15 | If no date is provided, you can quickly view the directory structure for the current date. This is a quick alternative to [`nomad`][nomad] or the `tree` command. 16 | 17 | You can also display a different day's scrapes by providing a date after the `-t` flag. 
18 | 19 | ``` 20 | poetry run Urs.py -t [] 21 | ``` 22 | 23 | The following date formats are supported: 24 | 25 | - `YYYY-MM-DD` 26 | - `YYYY/MM/DD` 27 | 28 | An error is displayed if `URS` was not run on the entered date (if the date directory is not found within the `scrapes/` directory). 29 | 30 | [display directory tree demo]: https://github.com/JosephLai241/URS/blob/demo-gifs/utilities/tree_demo.gif?raw=true 31 | [nomad]: https://github.com/JosephLai241/nomad 32 | -------------------------------------------------------------------------------- /tests/test_utils/test_Tools.py: -------------------------------------------------------------------------------- 1 | """ 2 | Testing `Tools.py`. 3 | """ 4 | 5 | 6 | import argparse 7 | import os 8 | 9 | import praw 10 | from dotenv import load_dotenv 11 | 12 | from urs.utils import Global, Tools 13 | 14 | 15 | class Login: 16 | """ 17 | Create a Reddit object with PRAW API credentials. 18 | """ 19 | 20 | @staticmethod 21 | def create_reddit_object(): 22 | load_dotenv() 23 | 24 | return praw.Reddit( 25 | client_id=os.getenv("CLIENT_ID"), 26 | client_secret=os.getenv("CLIENT_SECRET"), 27 | user_agent=os.getenv("USER_AGENT"), 28 | username=os.getenv("USERNAME"), 29 | password=os.getenv("PASSWORD"), 30 | ) 31 | 32 | 33 | class TestRunInitMethod: 34 | """ 35 | Testing Run class __init__() method. 36 | """ 37 | 38 | def test_init_instance_variables(self): 39 | reddit = Login.create_reddit_object() 40 | 41 | try: 42 | Tools.Run(reddit) 43 | assert False 44 | except SystemExit: 45 | assert True 46 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Joseph Lai 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/FEATURE_REQUEST.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature/Enhancement Request 3 | about: Suggest a new feature or enhancement for URS. 4 | title: "Feature/Enhancement Request | ADD A SHORT SUMMARY OF THE NEW FEATURE HERE" 5 | labels: enhancement 6 | assignees: JosephLai241 7 | --- 8 | 9 | # **DO NOT DELETE THIS TEMPLATE** 10 | 11 | ## Describe the New Feature or Enhancement You Would Like 12 | 13 | Delete this line and write a clear description of what you want to happen. 
14 | 15 | ## Explain Why You Believe This Would Be a Good Feature For URS 16 | 17 | Delete this line and write your motivation for creating this request. 18 | 19 | ## Is Your Request Related To a Problem? 20 | 21 | **Put "N/A" in this block if this is not applicable.** 22 | 23 | Delete this line and link an issue by using the `#` symbol followed by the issue number. Then add some additional information as to how your request relates to the open issue. 24 | 25 | ## Describe Alternatives You Have Considered 26 | 27 | **Put "N/A" in this block if this is not applicable.** 28 | 29 | Delete this line and write a clear and concise description of any alternative solutions or features you have considered. 30 | 31 | ## Additional Context 32 | 33 | **Put "N/A" in this block if this is not applicable.** 34 | 35 | Delete this line and add any other context, codeblocks, screenshots, etc., about the request here. 36 | -------------------------------------------------------------------------------- /manual/src/credentials.md: -------------------------------------------------------------------------------- 1 | # How to Get PRAW Credentials 2 | 3 | Create your own Reddit account and then head over to [Reddit's apps page](https://old.reddit.com/prefs/apps). 4 | 5 | Click `"are you a developer? create an app... "`. 6 | 7 | ![Create an app screenshot][create an app] 8 | 9 | Name your app, choose `"script"` for the type of app, and type `"http://localhost:8080"` in the redirect URI field since this is a personal use app. You can also add a description and an about URL. 10 | 11 | ![Enter Stuff In Boxes screenshot][enter stuff in boxes] 12 | 13 | Click `"create app"`, then `"edit"` to reveal more information. 14 | 15 | ![Click Edit screenshot][click edit] 16 | 17 | You should see a string of 14 characters on the top left corner underneath `"personal use script"`. That is your API ID. Further down you will see `"secret"` and a string of 27 characters; that is your API password. **Save this information as it will be used in the program in order to access the Reddit API**. 18 | 19 | ![All Info screenshot][all info] 20 | 21 | You will also have to provide your app name and Reddit account username and password in the block of credentials found in `.env`. 22 | 23 | 24 | 25 | [create an app]: https://i.imgur.com/Bf0pKGJ.png 26 | [enter stuff in boxes]: https://i.imgur.com/g0xARWA.png 27 | [click edit]: https://i.imgur.com/1NOyMTN.png 28 | [all info]: https://i.imgur.com/VajTKJu.png 29 | -------------------------------------------------------------------------------- /manual/src/scraping-reddit/scrape-speeds-and-rate-limits.md: -------------------------------------------------------------------------------- 1 | # Scrape Speeds 2 | 3 | Your internet connection speed is the primary bottleneck that will establish the scrape duration; however, there are additional bottlenecks such as: 4 | 5 | - The number of results returned for Subreddit or Redditor scraping. 6 | - The submission's popularity (total number of comments) for submission comments scraping. 7 | 8 | # Rate Limits 9 | 10 | Yes, PRAW has rate limits. These limits are proportional to how much karma you have accumulated -- the higher the karma, the higher the rate limit. This has been implemented to mitigate spammers and bots that utilize PRAW. 11 | 12 | Rate limit information for your account is displayed in a small table underneath the successful login message each time you run any of the PRAW scrapers. 
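
PRAW also exposes these numbers directly on the `Reddit` instance if you want to inspect them outside of `URS`. This is a minimal sketch, not part of `URS` itself, and it assumes the PRAW credentials have already been filled out in `.env`:

```python
import os

import praw
from dotenv import load_dotenv

load_dotenv()

reddit = praw.Reddit(
    client_id=os.getenv("CLIENT_ID"),
    client_secret=os.getenv("CLIENT_SECRET"),
    user_agent=os.getenv("USER_AGENT"),
    username=os.getenv("REDDIT_USERNAME"),
    password=os.getenv("REDDIT_PASSWORD"),
)

# Make at least one authenticated request so PRAW has rate limit headers to parse.
reddit.user.me()

# A dictionary containing the remaining requests, used requests, and reset timestamp.
print(reddit.auth.limits)
```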
I have also added a [`--check` flag](../utilities/rate-limit-checking.md) if you want to quickly view this information. 13 | 14 | `URS` will display an error message as well as the rate limit reset date if you have used all your available requests. 15 | 16 | There are a couple ways to circumvent rate limits: 17 | 18 | - Scrape intermittently 19 | - Use an account with high karma to get your PRAW credentials 20 | - Scrape less results per run 21 | 22 | Available requests are refilled if you use the PRAW scrapers intermittently, which might be the best solution. This can be especially helpful if you have automated `URS` and are not looking at the output on each run. 23 | -------------------------------------------------------------------------------- /tests/test_analytics/test_Frequencies.py: -------------------------------------------------------------------------------- 1 | """ 2 | Testing `Frequencies.py`. 3 | """ 4 | 5 | 6 | from urs.analytics import Frequencies 7 | 8 | 9 | class TestSortGetDataMethod: 10 | """ 11 | Testing Sort class get_data() method. 12 | """ 13 | 14 | def test_get_data_method(self): 15 | pass 16 | 17 | 18 | class TestSortNameAndCreateDirMethod: 19 | """ 20 | Testing Sort class name_and_create_dir() method. 21 | """ 22 | 23 | def test_name_and_create_dir_method(self): 24 | pass 25 | 26 | 27 | class TestSortCreateCsvMethod: 28 | """ 29 | Testing Sort class create_csv() method. 30 | """ 31 | 32 | def test_create_csv_method(self): 33 | plt_dict = {"test": 1, "testing": 2} 34 | 35 | assert Frequencies.Sort().create_csv(plt_dict) == { 36 | "words": ["test", "testing"], 37 | "frequencies": [1, 2], 38 | } 39 | 40 | 41 | class TestSortCreateJsonMethod: 42 | """ 43 | Testing Sort class create_json() method. 44 | """ 45 | 46 | def test_create_json_method(self): 47 | scrape_file = ["test", "something"] 48 | plt_dict = {"test": 1, "testing": 2} 49 | 50 | assert Frequencies.Sort().create_json(plt_dict, scrape_file) == { 51 | "raw_file": "test", 52 | "data": {"test": 1, "testing": 2}, 53 | } 54 | 55 | 56 | class TestExportFrequenciesExportMethod: 57 | """ 58 | Testing ExportFrequencies class export() method. 
59 | """ 60 | 61 | def test_export_method(self): 62 | pass 63 | -------------------------------------------------------------------------------- /manual/src/contributors.md: -------------------------------------------------------------------------------- 1 | # Contributors 2 | 3 | | Date | User | Contribution | 4 | | ---------------- | -------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 5 | | March 11, 2020 | [ThereGoesMySanity][theregoesmysanity] | Created a [pull request][theregoesmysanity pull request] adding 2FA information to README | 6 | | October 6, 2020 | [LukeDSchenk][lukedschenk] | Created a [pull request][lukedschenk pull request] fixing `"[Errno 36] File name too long"` issue, making it impossible to save comment scrapes with long titles | 7 | | October 10, 2020 | [IceBerge421][iceberge421] | Created a [pull request][icegerge421 pull request] fixing a cloning error occuring on Windows machines due to illegal file name characters, `"`, found in two scrape samples | 8 | 9 | [iceberge421]: https://github.com/IceBerge421 10 | [icegerge421 pull request]: https://github.com/JosephLai241/URS/pull/20 11 | [lukedschenk]: https://github.com/LukeDSchenk 12 | [lukedschenk pull request]: https://github.com/JosephLai241/URS/pull/19 13 | [theregoesmysanity]: https://github.com/ThereGoesMySanity 14 | [theregoesmysanity pull request]: https://github.com/JosephLai241/URS/pull/9 15 | -------------------------------------------------------------------------------- /urs/praw_scrapers/live_scrapers/utils/StreamGenerator.py: -------------------------------------------------------------------------------- 1 | """ 2 | Stream Generator 3 | ================ 4 | Defining methods for the stream generator which yields new Reddit objects and 5 | converts them to JSON serializable objects when saving to file. 6 | """ 7 | 8 | 9 | from typing import Any, Dict, Generator, Union 10 | 11 | from praw.models.reddit.redditor import RedditorStream 12 | from praw.models.reddit.subreddit import SubredditStream 13 | 14 | from urs.praw_scrapers.utils.Objectify import Objectify 15 | 16 | 17 | class StreamGenerator: 18 | """ 19 | Methods for creating a generator which yields new Reddit objects while 20 | streaming. 21 | """ 22 | 23 | @staticmethod 24 | def stream_submissions( 25 | stream: Union[RedditorStream, SubredditStream] 26 | ) -> Generator[Dict[str, Any], None, None]: 27 | """ 28 | Yield new Reddit submissions. 29 | 30 | :param RedditorStream | SubredditStream stream: The Reddit stream instance. 31 | 32 | :yields: Reddit submission object. 33 | """ 34 | 35 | for submission in stream.submissions(skip_existing=True): 36 | yield Objectify().make_submission(True, submission) 37 | 38 | @staticmethod 39 | def stream_comments( 40 | stream: Union[RedditorStream, SubredditStream] 41 | ) -> Generator[Dict[str, Any], None, None]: 42 | """ 43 | Yield new Reddit comments. 44 | 45 | :param RedditorStream | SubredditStream stream: The Reddit stream instance. 46 | 47 | :yields: Reddit comment object. 
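
        A minimal consumption sketch, assuming `reddit` is an authenticated
        `praw.Reddit` instance and that the serialized comment includes a
        `body` key (the Subreddit name is only an example)::

            stream = reddit.subreddit("askreddit").stream

            for comment in StreamGenerator.stream_comments(stream):
                print(comment["body"])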
48 | """ 49 | 50 | for comment in stream.comments(skip_existing=True): 51 | yield Objectify().make_comment(comment, True) 52 | -------------------------------------------------------------------------------- /manual/src/README.md: -------------------------------------------------------------------------------- 1 | __ __ _ __ ____ 2 | /\ \/\ \/\`'__\/',__\ 3 | \ \ \_\ \ \ \//\__, `\ 4 | \ \____/\ \_\\/\____/ 5 | \/___/ \/_/ \/___/ 6 | 7 | > **U**niversal **R**eddit **S**craper - A comprehensive Reddit scraping command-line tool written in Python. 8 | 9 | ![GitHub Workflow Status (Python)](https://img.shields.io/github/actions/workflow/status/JosephLai241/URS/python.yml?label=Python&logo=python&logoColor=blue) 10 | ![GitHub Workflow Status (Rust)](https://img.shields.io/github/actions/workflow/status/JosephLai241/URS/rust.yml?label=Rust&logo=rust&logoColor=orange) 11 | [![Codecov](https://img.shields.io/codecov/c/gh/JosephLai241/URS?logo=Codecov)][codecov] 12 | [![GitHub release (latest by date)](https://img.shields.io/github/v/release/JosephLai241/URS)][releases] 13 | ![Total lines](https://img.shields.io/tokei/lines/github/JosephLai241/URS) 14 | ![License](https://img.shields.io/github/license/JosephLai241/URS) 15 | 16 | ``` 17 | [-h] 18 | [-e] 19 | [-v] 20 | 21 | [-t []] 22 | [--check] 23 | 24 | [-r <(h|n|c|t|r|s)> []] 25 | [-y] 26 | [--csv] 27 | [--rules] 28 | [-u ] 29 | [-c ] 30 | [--raw] 31 | [-b] 32 | [--csv] 33 | 34 | [-lr ] 35 | [-lu ] 36 | 37 | [--nosave] 38 | [--stream-submissions] 39 | 40 | [-f ] 41 | [--csv] 42 | [-wc []] 43 | [--nosave] 44 | ``` 45 | 46 | [codecov]: https://codecov.io/gh/JosephLai241/URS 47 | [github workflow status]: https://github.com/JosephLai241/URS/actions/workflows/pytest.yml 48 | [praw]: https://pypi.org/project/praw/ 49 | [releases]: https://github.com/JosephLai241/URS/releases 50 | -------------------------------------------------------------------------------- /urs/Urs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | 4 | """ 5 | URS 6 | === 7 | 8 | URS, an acronym for "Universal Reddit Scraper", is a comprehensive Reddit scraping 9 | command-line tool written in Python. 10 | 11 | * Scrape Reddit via PRAW (the official Python Reddit API Wrapper) 12 | + Scrape Subreddits 13 | + Scrape Redditors 14 | + Scrape submission comments 15 | * Livestream Reddit via PRAW 16 | + Livestream comments submitted within Subreddits or by Redditors 17 | + Livestream submissions submitted within Subreddits or by Redditors 18 | * Analytical tools for scraped data 19 | + Generate frequencies for words that are found in submission titles, bodies, and/or comments 20 | + Generate a wordcloud from scrape results 21 | 22 | @author: Joseph Lai 23 | @contact: urs_project@protonmail.com 24 | @github: https://github.com/JosephLai241/URS 25 | """ 26 | 27 | 28 | import os 29 | 30 | import praw 31 | from colorama import init 32 | from dotenv import load_dotenv 33 | 34 | from urs.utils.Logger import LogMain 35 | from urs.utils.Tools import Run 36 | 37 | # Automate sending reset sequences to turn off color changes at the end of 38 | # every print. 39 | init(autoreset=True) 40 | 41 | 42 | class Main: 43 | """ 44 | Run URS. 
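
    `main()` loads the PRAW credentials from `.env`, builds the `praw.Reddit`
    instance, and hands it off to `Run`. A typical invocation from the project
    root looks like the following (the `--check` flag is only an example):

        poetry run Urs.py --check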
45 | """ 46 | 47 | @staticmethod 48 | @LogMain.master_timer 49 | def main() -> None: 50 | load_dotenv() 51 | 52 | reddit = praw.Reddit( 53 | client_id=os.getenv("CLIENT_ID"), 54 | client_secret=os.getenv("CLIENT_SECRET"), 55 | user_agent=os.getenv("USER_AGENT"), 56 | username=os.getenv("REDDIT_USERNAME"), 57 | password=os.getenv("REDDIT_PASSWORD"), 58 | ) 59 | 60 | Run(reddit).run_urs() 61 | 62 | 63 | if __name__ == "__main__": 64 | Main.main() 65 | -------------------------------------------------------------------------------- /.github/workflows/python.yml: -------------------------------------------------------------------------------- 1 | name: Python code checks 2 | 3 | on: 4 | push: 5 | branches: 6 | - "master" 7 | pull_request: 8 | branches-ignore: 9 | - "demo-gifs" 10 | - "gh-pages" 11 | - "rust-demo" 12 | - "samples" 13 | 14 | jobs: 15 | formatting-checks: 16 | runs-on: ubuntu-latest 17 | 18 | steps: 19 | - name: Black formatting check 20 | uses: psf/black@stable 21 | with: 22 | options: "--check --verbose --diff --color" 23 | 24 | - name: isort formatting check 25 | uses: isort/isort-action@master 26 | with: 27 | configuration: "--check-only --verbose --diff --color --profile black" 28 | 29 | pytest: 30 | runs-on: ${{ matrix.os }} 31 | 32 | strategy: 33 | matrix: 34 | os: [ubuntu-latest, macOS-latest] 35 | 36 | if: github.ref == 'refs/heads/master' 37 | steps: 38 | - uses: actions/checkout@v3 39 | 40 | - name: Set up Python 3.11 41 | uses: actions/setup-python@v4 42 | with: 43 | python-version: "3.11" 44 | 45 | - name: Install Poetry 46 | uses: abatilo/actions-poetry@v2 47 | 48 | - name: Install dependencies 49 | run: poetry install 50 | 51 | - name: Build taisun 52 | uses: PyO3/maturin-action@v1 53 | with: 54 | command: develop 55 | args: --release 56 | 57 | - name: Run Pytest 58 | env: 59 | CLIENT_ID: ${{ secrets.CLIENT_ID }} 60 | CLIENT_SECRET: ${{ secrets.CLIENT_SECRET }} 61 | USER_AGENT: ${{ secrets.USER_AGENT }} 62 | REDDIT_USERNAME: ${{ secrets.REDDIT_USERNAME }} 63 | REDDIT_PASSWORD: ${{ secrets.REDDIT_PASSWORD }} 64 | run: | 65 | poetry run pytest --cov=./ 66 | 67 | - name: Send coverage data to Codecov 68 | uses: codecov/codecov-action@v1 69 | if: matrix.os == 'ubuntu-latest' 70 | -------------------------------------------------------------------------------- /manual/src/SUMMARY.md: -------------------------------------------------------------------------------- 1 | # Summary 2 | 3 | [URS](./README.md) 4 | 5 | - [Introduction](./introduction.md) 6 | - [Installation](./installation.md) 7 | - [Exporting](./exporting.md) 8 | - [How to Get Reddit API Credentials for PRAW](./credentials.md) 9 | 10 | # Scraping Reddit 11 | 12 | - [Scrape Speeds and Rate Limits](./scraping-reddit/scrape-speeds-and-rate-limits.md) 13 | - [A Table of All Subreddit, Redditor, and Submission Comments Attributes](./scraping-reddit/all-attributes-table.md) 14 | - [Scraping Subreddits](./scraping-reddit/subreddit.md) 15 | - [Scraping Redditors](./scraping-reddit/redditor.md) 16 | - [Scraping Submission Comments](./scraping-reddit/submission-comments.md) 17 | 18 | # Livestreaming Reddit 19 | 20 | - [General Information](./livestreaming-reddit/general-information.md) 21 | - [Livestreaming Subreddits and Redditors](./livestreaming-reddit/livestreaming-subreddits-and-redditors.md) 22 | 23 | # Analytical Tools 24 | 25 | - [General Information](./analytical-tools/general-information.md) 26 | - [Generating Word Frequencies and Wordclouds](./analytical-tools/frequencies-and-wordclouds.md) 27 | 28 | # 
Utilities 29 | 30 | - [Built-in Tree](./utilities/tree.md) 31 | - [PRAW Rate Limit Check](./utilities/rate-limit-checking.md) 32 | 33 | # Additional Information 34 | 35 | - [2-Factor Authentication](./additional-information/2fa-information.md) 36 | - [Error Messages](./additional-information/error-messages.md) 37 | 38 | # Implementation Details 39 | 40 | - [The Forest](./implementation-details/the-forest.md) 41 | - [Speeding Up Python with Rust](./implementation-details/speeding-up-python-with-rust.md) 42 | 43 | # Contributing 44 | 45 | - [Before Making Pull or Feature Requests](./contributing/before-making-pull-or-feature-requests.md) 46 | - [Building on Top of `URS`](./contributing/building-on-top-of-urs.md) 47 | - [Making Pull or Feature Requests](./contributing/making-pull-or-feature-requests.md) 48 | 49 | --- 50 | 51 | [Contributors](./contributors.md) 52 | [Derivative Projects](./derivative-projects.md) 53 | -------------------------------------------------------------------------------- /tests/test_praw_scrapers/test_live_scrapers/test_utils/test_StreamGenerator.py: -------------------------------------------------------------------------------- 1 | """ 2 | Testing `StreamGenerator.py`. 3 | """ 4 | 5 | 6 | import os 7 | import types 8 | 9 | import praw 10 | from dotenv import load_dotenv 11 | 12 | from urs.praw_scrapers.live_scrapers.utils import StreamGenerator 13 | 14 | 15 | class Login: 16 | """ 17 | Create a Reddit object with PRAW API credentials. 18 | """ 19 | 20 | @staticmethod 21 | def create_reddit_object(): 22 | load_dotenv() 23 | 24 | return praw.Reddit( 25 | client_id=os.getenv("CLIENT_ID"), 26 | client_secret=os.getenv("CLIENT_SECRET"), 27 | user_agent=os.getenv("USER_AGENT"), 28 | username=os.getenv("REDDIT_USERNAME"), 29 | password=os.getenv("REDDIT_PASSWORD"), 30 | ) 31 | 32 | 33 | class TestStreamGeneratorStreamSubmissionsMethod: 34 | """ 35 | Testing StreamGenerator class stream_submissions() method. 36 | """ 37 | 38 | def test_stream_submissions_method(self): 39 | reddit = Login.create_reddit_object() 40 | subreddit = reddit.subreddit("askreddit") 41 | 42 | generator = StreamGenerator.StreamGenerator.stream_submissions(subreddit.stream) 43 | 44 | assert isinstance(generator, types.GeneratorType) 45 | 46 | for obj in generator: 47 | if isinstance(obj, dict): 48 | assert True 49 | break 50 | 51 | 52 | class TestStreamGeneratorStreamCommentsMethod: 53 | """ 54 | Testing StreamGenerator class stream_comments() method. 55 | """ 56 | 57 | def test_stream_comments_method(self): 58 | reddit = Login.create_reddit_object() 59 | subreddit = reddit.subreddit("askreddit") 60 | 61 | generator = StreamGenerator.StreamGenerator.stream_comments(subreddit.stream) 62 | 63 | assert isinstance(generator, types.GeneratorType) 64 | 65 | for obj in generator: 66 | if isinstance(obj, dict): 67 | assert True 68 | break 69 | -------------------------------------------------------------------------------- /manual/src/livestreaming-reddit/livestreaming-subreddits-and-redditors.md: -------------------------------------------------------------------------------- 1 | # Livestreaming Subreddits 2 | 3 | ![Livestream Subreddit Demo GIF][livestream subreddit demo] 4 | 5 | \*_This GIF has been cut for demonstration purposes._ 6 | 7 | ## All Flags 8 | 9 | These are all the flags that may be used when livestreaming Subreddits. 
10 | 11 | ``` 12 | [-lr ] 13 | [--nosave] 14 | [--stream-submissions] 15 | ``` 16 | 17 | ## Usage 18 | 19 | ``` 20 | poetry run Urs.py -lr 21 | ``` 22 | 23 | **Default stream objects:** Comments. To stream submissions instead, include the `--stream-submissions` flag. 24 | 25 | You can livestream comments or submissions that are created within a Subreddit. 26 | 27 | Reddit object information will be displayed in a [PrettyTable][prettytable] as they are submitted. 28 | 29 | > **_NOTE:_** PRAW may not be able to catch all new submissions or comments within a high-volume Subreddit, as mentioned in [these disclaimers located in the "Note" boxes][subreddit stream disclaimer]. 30 | 31 | # Livestreaming Redditors 32 | 33 | _Livestream demo was not recorded for Redditors because its functionality is identical to the Subreddit livestream._ 34 | 35 | ## All Flags 36 | 37 | These are all the flags that may be used when livestreaming Redditors. 38 | 39 | ``` 40 | [-lu ] 41 | [--nosave] 42 | [--stream-submissions] 43 | ``` 44 | 45 | ## Usage 46 | 47 | ``` 48 | poetry run Urs.py -lu 49 | ``` 50 | 51 | **Default stream objects:** Comments. To stream submissions instead, include the `--stream-submissions` flag. 52 | 53 | You can livestream comments or submissions that are created by a Redditor. 54 | 55 | Reddit object information will be displayed in a PrettyTable as they are submitted. 56 | 57 | # Do Not Save Livestream to File 58 | 59 | Include the `--nosave` flag if you do not want to save the livestream to file. 60 | 61 | [livestream subreddit demo]: https://github.com/JosephLai241/URS/blob/demo-gifs/praw_scrapers/live_scrapers/livestream_subreddit_demo.gif?raw=true 62 | [prettytable]: https://pypi.org/project/prettytable/ 63 | [subreddit stream disclaimer]: https://praw.readthedocs.io/en/latest/code_overview/other/subredditstream.html#praw.models.reddit.subreddit.SubredditStream 64 | -------------------------------------------------------------------------------- /tests/test_utils/test_Global.py: -------------------------------------------------------------------------------- 1 | """ 2 | Testing `Global.py`. 3 | """ 4 | 5 | 6 | import datetime as dt 7 | 8 | from urs.utils import Global 9 | 10 | 11 | class TestGlobalVariables: 12 | """ 13 | Testing all global variables in Global.py. 14 | """ 15 | 16 | def test_date_variable(self): 17 | assert Global.date == dt.datetime.now().strftime("%Y-%m-%d") 18 | 19 | def test_subreddit_categories_list(self): 20 | assert Global.categories == [ 21 | "Hot", 22 | "New", 23 | "Controversial", 24 | "Top", 25 | "Rising", 26 | "Search", 27 | ] 28 | 29 | def test_subreddit_short_cat_list(self): 30 | categories = ["Hot", "New", "Controversial", "Top", "Rising", "Search"] 31 | assert Global.short_cat == [cat[0] for cat in categories] 32 | 33 | 34 | class TestConvertTimeFunction: 35 | """ 36 | Testing convert_time() function. 37 | """ 38 | 39 | def test_convert_time(self): 40 | unix_time = 1592291124 41 | converted_time = "2020-06-16 07:05:24" 42 | 43 | assert Global.convert_time(unix_time) == converted_time 44 | 45 | 46 | class TestMakeListDictFunction: 47 | """ 48 | Testing make_list_dict() function. 49 | """ 50 | 51 | def test_make_list_dict(self): 52 | item = [1, 2, 3, 4] 53 | correct_list_dict = {1: [], 2: [], 3: [], 4: []} 54 | 55 | assert Global.make_list_dict(item) == correct_list_dict 56 | 57 | 58 | class TestMakeNoneDictFunction: 59 | """ 60 | Testing make_none_dict() function. 
61 | """ 62 | 63 | def test_make_none_dict(self): 64 | item = [1, 2, 3, 4] 65 | correct_none_dict = {1: None, 2: None, 3: None, 4: None} 66 | 67 | assert Global.make_none_dict(item) == correct_none_dict 68 | 69 | 70 | class TestStatus: 71 | """ 72 | Testing Status class. 73 | """ 74 | 75 | def test_status_init_method(self): 76 | test_status = Global.Status( 77 | "test after message", "test before message", "test color" 78 | ) 79 | 80 | assert test_status._after_message == "test after message" 81 | assert test_status._before_message == "test before message" 82 | assert test_status._color == "test color" 83 | -------------------------------------------------------------------------------- /manual/src/analytical-tools/frequencies-and-wordclouds.md: -------------------------------------------------------------------------------- 1 | # Generating Word Frequencies 2 | 3 | ![Frequencies Demo GIF][frequencies demo] 4 | 5 | ## All Flags 6 | 7 | These are all the flags that may be used when generating word frequencies. 8 | 9 | ``` 10 | [-f ] 11 | [--csv] 12 | ``` 13 | 14 | ## Usage 15 | 16 | ``` 17 | poetry run Urs.py -f 18 | ``` 19 | 20 | **Supports exporting to CSV.** To export to CSV, include the `--csv` flag. 21 | 22 | You can generate a dictionary of word frequencies created from the words within the target fields. These frequencies are sorted from highest to lowest. 23 | 24 | Frequencies export to JSON by default, but this tool also works well in CSV format. 25 | 26 | Exported files will be saved to the `analytics/frequencies` directory. 27 | 28 | # Generating Wordclouds 29 | 30 | ![Wordcloud Demo GIF][wordcloud demo] 31 | 32 | ## All Flags 33 | 34 | ``` 35 | [-wc []] 36 | [--nosave] 37 | ``` 38 | 39 | ## Usage 40 | 41 | ``` 42 | poetry run Urs.py -wc 43 | ``` 44 | 45 | ## Supported Export Formats 46 | 47 | The following are the supported export formats for wordclouds: 48 | 49 | - `eps` 50 | - `jpeg` 51 | - `jpg` 52 | - `pdf` 53 | - `png` (default) 54 | - `ps` 55 | - `rgba` 56 | - `tif` 57 | - `tiff` 58 | 59 | Taking word frequencies to the next level, you can generate wordclouds based on word frequencies. This tool is independent of the frequencies generator -- you do not need to run the frequencies generator before creating a wordcloud. 60 | 61 | PNG is the default format, but you can also export to any of the options listed above by including the format as the second flag argument. 62 | 63 | ``` 64 | poetry run Urs.py -wc [] 65 | ``` 66 | 67 | Exported files will be saved to the `analytics/wordclouds` directory. 68 | 69 | ## Display Wordcloud Instead of Saving 70 | 71 | Wordclouds are saved to file by default. If you do not want to keep a file, include the `--nosave` flag to only display the wordcloud. 72 | 73 | [frequencies demo]: https://github.com/JosephLai241/URS/blob/demo-gifs/analytical_tools/frequencies_generator_demo.gif?raw=true 74 | [wordcloud demo]: https://github.com/JosephLai241/URS/blob/demo-gifs/analytical_tools/wordcloud_generator_demo.gif?raw=true 75 | -------------------------------------------------------------------------------- /manual/src/livestreaming-reddit/general-information.md: -------------------------------------------------------------------------------- 1 | # Livestreaming Reddit via PRAW 2 | 3 | These tools may be used to livestream comments or submissions submitted within Subreddits or by Redditors. 4 | 5 | **Comments are streamed by default**. To stream submissions instead, include the `--stream-submissions` flag. 
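
For example, a hypothetical invocation that livestreams submissions from r/AskReddit (the Subreddit name is arbitrary) would look like this:

```
poetry run Urs.py -lr askreddit --stream-submissions
```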
6 | 7 | **New comments or submissions will continue to display within your terminal until you abort the stream using `Ctrl + C`**. 8 | 9 | ## File Naming Conventions 10 | 11 | The filenames will follow this format: 12 | 13 | ``` 14 | [SUBREDDIT_OR_REDDITOR]-[comments_OR_submissions]-[START_TIME_IN_HOURS_MINUTES_SECONDS]-[DURATION_IN_HOURS_MINUTES_SECONDS].json 15 | ``` 16 | 17 | This file is saved in the main `livestream` directory into the `subreddits` or `redditors` directory depending on which stream was run. 18 | 19 | Reddit objects will be written to this JSON file in real time. After aborting the stream, the filename will be updated with the start time and duration. 20 | 21 | ## Displayed vs. Saved Attributes 22 | 23 | Displayed comment and submission attributes have been stripped down to essential fields to declutter the output. Here is a table of what is shown during the stream: 24 | 25 | | Comment Attributes | Submission Attributes | 26 | | ---------------------------- | --------------------- | 27 | | `author` | `author` | 28 | | `body` | `created_utc` | 29 | | `created_utc` | `is_self` | 30 | | `is_submitter` | `link_flair_text` | 31 | | `submission_author` | `nsfw` | 32 | | `submission_created_utc` | `selftext` | 33 | | `submission_link_flair_text` | `spoiler` | 34 | | `submission_nsfw` | `stickied` | 35 | | `submission_num_comments` | `title` | 36 | | `submission_score` | `url` | 37 | | `submission_title` | | 38 | | `submission_upvote_ratio` | | 39 | | `submission_url` | | 40 | 41 | Comment and submission attributes that are written to file will include the full list of attributes found in the [Table of All Subreddit, Redditor, and Submission Comments Attributes](../scraping-reddit/all-attributes-table.md). 42 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/BUG_REPORT.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug Report 3 | about: Report a bug that you have encountered while using URS. 4 | title: "Bug Report | ADD A SHORT SUMMARY OF THE BUG HERE" 5 | labels: bug 6 | assignees: JosephLai241 7 | --- 8 | 9 | # **DO NOT DELETE THIS TEMPLATE** 10 | 11 | ## Describe The Bug 12 | 13 | Delete this line and write a clear and concise description for the bug. 14 | 15 | ## Expected Behavior 16 | 17 | Delete this line and write a description of what you expected to happen. 18 | 19 | ## Actual Behavior 20 | 21 | Delete this line and write a description of what actually happened. 22 | 23 | ## Steps To Reproduce 24 | 25 | Delete this line and describe how to reproduce this behavior. Create an outline if there is additional relevant information you would like to share. An example outline is shown below: 26 | 27 | * I ran `a command here`. 28 | * Then I ran `a second command here`. 29 | + Providing additional information about this bullet point here. 30 | 31 | If the command is long, wrap it in a code block like so: 32 | 33 | * Ran the command: 34 | 35 | ``` 36 | Paste long command here. 37 | ``` 38 | 39 | ## Traceback, `urs.log`, or Screenshots 40 | 41 | **Put "N/A" in this block if this is not applicable.** 42 | 43 | Add a codeblock of the *entire* traceback here to help explain your problem. 44 | 45 | ``` 46 | Paste the traceback here. Make sure it is formatted correctly. 47 | ``` 48 | 49 | **The traceback is mandatory**, however you can also add the following information below. 
50 | 51 | A log of command history, `urs.log`, is written to the date directory every time you run URS. You can also add the relevant log block within a codeblock to help explain your problem. An example log is shown: 52 | 53 | ``` 54 | [2021-06-22 19:22:32,296] [INFO]: INITIALIZING URS. 55 | [2021-06-22 19:22:32,296] [INFO]: 56 | [2021-06-22 19:22:32,301] [CRITICAL]: RECEIVED INVALID SCRAPE FILE FOR FREQUENCIES. 57 | [2021-06-22 19:22:32,301] [CRITICAL]: ABORTING URS. 58 | ``` 59 | 60 | You can also add a screenshot of your issue like so: 61 | 62 | ![Error Screenshot](PASTE_URL_TO_IMAGE_HERE) 63 | 64 | ## Machine Specs 65 | 66 | * Operating System: Write your operating system here, e.g. Arch Linux x86_64 67 | * Python Version: Write your Python version here, e.g. Python 3.8.2 68 | 69 | ## Additional Context 70 | 71 | **Put "N/A" in this block if this is not applicable.** 72 | 73 | Delete this line and add any other context about the problem here. 74 | -------------------------------------------------------------------------------- /manual/src/analytical-tools/general-information.md: -------------------------------------------------------------------------------- 1 | # Analytical Tools 2 | 3 | This suite of tools can be used _after_ scraping data from Reddit. Both of these tools analyze the frequencies of words found in submission titles and bodies, or comments within JSON scrape data. 4 | 5 | There are a few ways you can quickly get the correct filepath to the scrape file: 6 | 7 | - Drag and drop the file into the terminal. 8 | - Partially type the path and rely on tab completion support to finish the full path for you. 9 | 10 | Running either tool will create the `analytics` directory within the date directory. **This directory is located in the same directory in which the scrape data resides**. For example, if you run the frequencies generator on February 16th for scrape data that was captured on February 14th, `analytics` will be created in the February 14th directory. Command history will still be written in the February 16th `urs.log`. 11 | 12 | The sub-directories `frequencies` or `wordclouds` are created in `analytics` depending on which tool is run. These directories mirror the directories in which the original scrape files reside. For example, if you run the frequencies generator on a Subreddit scrape, the directory structure will look like this: 13 | 14 | ``` 15 | analytics/ 16 | └── frequencies 17 | └── subreddits 18 | └── SUBREDDIT_SCRAPE.json 19 | ``` 20 | 21 | A shortened export path is displayed once `URS` has completed exporting the data, informing you where the file is saved within the `scrapes` directory. You can open `urs.log` to view the full path. 22 | 23 | # Target Fields 24 | 25 | The data varies depending on the scraper, so these tools target different fields for each type of scrape data: 26 | 27 | | Scrape Data | Targets | 28 | | ------------------- | --------------------------------- | 29 | | Subreddit | `selftext`, `title` | 30 | | Redditor | `selftext`, `title`, `body` | 31 | | Submission Comments | `body` | 32 | | Livestream | `selftext` and `title`, or `body` | 33 | 34 | For Subreddit scrapes, data is pulled from the `selftext` and `title` fields for each submission (submission title and body). 35 | 36 | For Redditor scrapes, data is pulled from all three fields because both submission and comment data is returned. The `selftext` and `title` fields are targeted for submissions, and the `body` field is targeted for comments.
37 | 38 | For submission comments scrapes, data is only pulled from the `body` field of each comment. 39 | 40 | For livestream scrapes, comments or submissions may be included depending on user settings. The `selftext` and `title` fields are targeted for submissions, and the `body` field is targeted for comments. 41 | 42 | # File Names 43 | 44 | File names are identical to the original scrape data so that it is easier to distinguish which analytical file corresponds to which scrape. 45 | -------------------------------------------------------------------------------- /manual/src/scraping-reddit/all-attributes-table.md: -------------------------------------------------------------------------------- 1 | # Subreddit, Redditor, and Submission Comments Attributes 2 | 3 | These attributes are included in each scrape. 4 | 5 | | Subreddits (submissions) | Redditors | Submission Comments | 6 | | ------------------------ | -------------------------------- | ------------------- | 7 | | `author` | `comment_karma` | `author` | 8 | | `created_utc` | `created_utc` | `body` | 9 | | `distinguished` | `fullname` | `body_html` | 10 | | `edited` | `has_verified_email` | `created_utc` | 11 | | `id` | `icon_img` | `distinguished` | 12 | | `is_original_content` | `id` | `edited` | 13 | | `is_self` | `is_employee` | `id` | 14 | | `link_flair_text` | `is_friend` | `is_submitter` | 15 | | `locked` | `is_mod` | `link_id` | 16 | | `name` | `is_gold` | `parent_id` | 17 | | `num_comments` | `link_karma` | `score` | 18 | | `nsfw` | `name` | `stickied` | 19 | | `permalink` | `subreddit` | | 20 | | `score` | \*`trophies` | | 21 | | `selftext` | \*`comments` | | 22 | | `spoiler` | \*`controversial` | | 23 | | `stickied` | \*`downvoted` (may be forbidden) | | 24 | | `title` | \*`gilded` | | 25 | | `upvote_ratio` | \*`gildings` (may be forbidden) | | 26 | | `url` | \*`hidden` (may be forbidden) | | 27 | | | \*`hot` | | 28 | | | \*`moderated` | | 29 | | | \*`multireddits` | | 30 | | | \*`new` | | 31 | | | \*`saved` (may be forbidden) | | 32 | | | \*`submissions` | | 33 | | | \*`top` | | 34 | | | \*`upvoted` (may be forbidden) | | 35 | 36 | \*_Includes additional attributes; see the [Scraping Redditors](./redditor.md) section for more information._ 37 | -------------------------------------------------------------------------------- /urs/praw_scrapers/live_scrapers/utils/DisplayStream.py: -------------------------------------------------------------------------------- 1 | """ 2 | Display stream 3 | ============== 4 | Defining methods to format data that will appear in the terminal. 5 | """ 6 | 7 | 8 | from typing import Any, Dict, List 9 | 10 | from prettytable import PrettyTable 11 | 12 | 13 | class DisplayStream: 14 | """ 15 | Methods to format and display Reddit stream objects. 16 | """ 17 | 18 | @staticmethod 19 | def _populate_table( 20 | include_fields: List[str], 21 | obj: Dict[str, Any], 22 | prefix: str, 23 | pretty_stream: PrettyTable, 24 | ) -> None: 25 | """ 26 | Populate the PrettyTable rows with Reddit object metadata. 27 | 28 | :param list[str] include_fields: A `list[str]` containing dictionary keys 29 | that will be added to the `PrettyTable` row. 30 | :param dict[str, Any] obj: A `dict[str, Any]` containing Reddit comment 31 | submission data. 32 | :param str prefix: The prefix to prepend to an attribute. 33 | :param PrettyTable pretty_stream: A `PrettyTable` instance. 
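        Example with purely illustrative values (not taken from a real stream):

            pretty_stream = PrettyTable()
            pretty_stream.field_names = ["Comment Attribute", "Data"]
            DisplayStream._populate_table(
                ["author"], {"author": "spez", "score": 1}, "", pretty_stream
            )
            # Adds the row ["author", "spez"]; "score" is skipped because it is
            # not listed in include_fields.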
34 | """ 35 | 36 | for attribute, data in obj.items(): 37 | if attribute in include_fields: 38 | pretty_stream.add_row([prefix + attribute, data]) 39 | 40 | @staticmethod 41 | def display(obj: Dict[str, Any]) -> None: 42 | """ 43 | Format and print string containing stream information. 44 | 45 | :param dict[str, Any] obj: A `dict[str, Any]` containing Reddit comment 46 | submission data. 47 | """ 48 | 49 | pretty_stream = PrettyTable() 50 | pretty_stream.field_names = [f"{obj['type'].capitalize()} Attribute", "Data"] 51 | 52 | if obj["type"] == "submission": 53 | include_fields = [ 54 | "author", 55 | "created_utc", 56 | "link_flair_text", 57 | "nsfw", 58 | "selftext", 59 | "spoiler", 60 | "title", 61 | "url", 62 | ] 63 | elif obj["type"] == "comment": 64 | include_fields = [ 65 | "author", 66 | "body", 67 | "created_utc", 68 | "is_submitter", 69 | ] 70 | 71 | submission_fields = [ 72 | "author", 73 | "created_utc", 74 | "link_flair_text", 75 | "nsfw", 76 | "num_comments", 77 | "score", 78 | "title", 79 | "upvote_ratio", 80 | "url", 81 | ] 82 | 83 | DisplayStream._populate_table( 84 | submission_fields, obj["submission"], "submission_", pretty_stream 85 | ) 86 | 87 | DisplayStream._populate_table(include_fields, obj, "", pretty_stream) 88 | 89 | pretty_stream.sortby = f"{obj['type'].capitalize()} Attribute" 90 | pretty_stream.align = "l" 91 | pretty_stream.max_width = 120 92 | 93 | print(pretty_stream) 94 | -------------------------------------------------------------------------------- /manual/src/additional-information/error-messages.md: -------------------------------------------------------------------------------- 1 | # Error Messages 2 | 3 | This document will briefly go over all the potential error messages you might run into while using URS. 4 | 5 | # Table of Contents 6 | 7 | - [Global Errors](#global-errors) 8 | - [Invalid Arguments](#invalid-arguments) 9 | - [Export Error](#export-error) 10 | - [PRAW Errors](#praw-errors) 11 | - [Invalid API Credentials or No Internet Connection](#invalid-api-credentials-or-no-internet-connection) 12 | - [No Reddit Objects Left to Scrape](#no-reddit-objects-left-to-scrape) 13 | - [Rate Limit Reached](#rate-limit-reached) 14 | - [Analytical Tool Errors](#analytical-tool-errors) 15 | - [Invalid File](#invalid-file) 16 | 17 | # Global Errors 18 | 19 | ## Invalid Arguments 20 | 21 | __ 22 | /'__`\ 23 | /\ __/ 24 | \ \____\ 25 | \/____/... [ERROR MESSAGE] 26 | 27 | Please recheck args or refer to help for usage examples. 28 | 29 | This message is displayed if you have entered invalid arguments. The specific error will follow `...`. 30 | 31 | You can use the `-h` flag to see the help message or the `-e` flag to display example usage. 32 | 33 | ## Export Error 34 | 35 | __ 36 | /\ \ 37 | \ \ \ 38 | \ \ \ 39 | \ \_\ 40 | \/\_\ 41 | \/_/... An error has occurred while exporting scraped data. 42 | 43 | [ERROR MESSAGE] 44 | 45 | This message is displayed if an error occured while exporting the data. This applies to the scraper tools or word frequencies tool. The specific error will be printed under the art. 46 | 47 | # PRAW Errors 48 | 49 | ## Invalid API Credentials or No Internet Connection 50 | 51 | _____ 52 | /\ '__`\ 53 | \ \ \L\ \ 54 | \ \ ,__/... Please recheck API credentials or your internet connection. 55 | \ \ \/ 56 | \ \_\ 57 | \/_/ 58 | 59 | Prawcore exception: [EXCEPTION] 60 | 61 | This message is displayed if you enter invalid API credentials or if you are not connected to the internet. The exception is printed under the art. 
62 | 63 | Recheck the environment variables in `.env` to make sure your API credentials are correct. 64 | 65 | ## No Reddit Objects Left to Scrape 66 | 67 | ___ 68 | /' _ `\ 69 | /\ \/\ \ 70 | \ \_\ \_\ 71 | \/_/\/_/... No [OBJECTS] to scrape! Exiting. 72 | 73 | This message is displayed if the Reddit objects you have passed in have failed validation (does not exist), are skipped, and there are no longer any objects left for URS to process for that specific scraper. 74 | 75 | ## Rate Limit Reached 76 | 77 | __ 78 | /\ \ 79 | \ \ \ 80 | \ \ \ __ 81 | \ \ \L\ \ 82 | \ \____/ 83 | \/___/... You have reached your rate limit. 84 | 85 | Please try again when your rate limit is reset: [DATE] 86 | 87 | PRAW has rate limits. This message is displayed if you have reached the rate limit set for your account. The reset date will vary depending on when you ran URS. The date I received during testing is usually 24 hours later. 88 | 89 | # Analytical Tool Errors 90 | 91 | ## Invalid File 92 | 93 | __ 94 | /\_\ 95 | \/\ \ 96 | \ \ \ 97 | \ \_\ 98 | \/_/... [ERROR MESSAGE] 99 | 100 | This message is displayed when you have passed in an invalid file to generate word frequencies or a wordcloud for. The specific error will follow `...`. 101 | -------------------------------------------------------------------------------- /tests/test_utils/test_Utilities.py: -------------------------------------------------------------------------------- 1 | """ 2 | Testing `Utilities.py`. 3 | """ 4 | 5 | 6 | import os 7 | 8 | from rich.tree import Tree 9 | 10 | from urs.utils.Utilities import DateTree 11 | 12 | 13 | class TestDateTreeCheckDateFormatMethod: 14 | """ 15 | Testing DateTree class _check_date_format() method. 16 | """ 17 | 18 | def test_check_date_format_dash_format(self): 19 | test_date = "06-28-2021" 20 | test_search_date = DateTree._check_date_format(test_date) 21 | 22 | assert test_search_date == test_date 23 | 24 | def test_check_date_format_slash_format(self): 25 | test_date = "06/28/2021" 26 | test_search_date = DateTree._check_date_format(test_date) 27 | 28 | assert test_search_date == "06-28-2021" 29 | 30 | def test_check_date_wrong_format(self): 31 | test_date = "06.28.2021" 32 | 33 | try: 34 | _ = DateTree._check_date_format(test_date) 35 | assert False 36 | except TypeError: 37 | assert True 38 | 39 | def test_check_date_short_date_wrong_format(self): 40 | test_date = "06-28-21" 41 | 42 | try: 43 | _ = DateTree._check_date_format(test_date) 44 | assert False 45 | except TypeError: 46 | assert True 47 | 48 | 49 | class TestDateTreeFindDateDirectoryMethod: 50 | """ 51 | Testing DateTree class _find_date_directory() method. 52 | """ 53 | 54 | def test_find_date_directory_directory_exists(self): 55 | os.mkdir("../scrapes/06-28-2021") 56 | dir_exists = DateTree._find_date_directory("06-28-2021") 57 | 58 | assert dir_exists == True 59 | 60 | def test_find_date_directory_directory_does_not_exist(self): 61 | os.rmdir("../scrapes/06-28-2021") 62 | dir_exists = DateTree._find_date_directory("06-28-2021") 63 | 64 | assert dir_exists == False 65 | 66 | 67 | class TestDateTreeCreateDirectoryTreeMethod: 68 | """ 69 | Testing DateTree class _create_directory_tree() method. 
70 | """ 71 | 72 | def test_create_directory_tree(self): 73 | os.makedirs("../scrapes/06-28-2021/testing/nested/directories/tree") 74 | 75 | test_tree = Tree("test") 76 | 77 | try: 78 | DateTree._create_directory_tree("../scrapes/06-28-2021", test_tree) 79 | assert True 80 | except Exception as e: 81 | print( 82 | f"An exception was thrown when testing DateTree._create_directory_tree(): {e}" 83 | ) 84 | assert False 85 | 86 | 87 | class TestDateTreeDisplayTreeMethod: 88 | """ 89 | Testing DateTree class display_tree() method. 90 | """ 91 | 92 | def test_display_tree_method_valid_search_date(self): 93 | try: 94 | DateTree.display_tree("06-28-2021") 95 | assert True 96 | except Exception as e: 97 | print(f"An exception was thrown when testing DateTree.display_tree(): {e}") 98 | assert False 99 | 100 | def test_display_tree_method_search_date_not_found(self): 101 | try: 102 | DateTree.display_tree("00-00-0000") 103 | assert False 104 | except SystemExit: 105 | assert True 106 | 107 | def test_display_tree_method_invalid_search_date(self): 108 | try: 109 | DateTree.display_tree("00.00.0000") 110 | assert False 111 | except SystemExit: 112 | assert True 113 | -------------------------------------------------------------------------------- /manual/src/scraping-reddit/submission-comments.md: -------------------------------------------------------------------------------- 1 | # Table of Contents 2 | 3 | - [Submission Comments](#submission-comments) 4 | - [All Flags](#all-flags) 5 | - [Usage](#usage) 6 | - [File Naming Conventions](#file-naming-conventions) 7 | - [Number of Comments Returned](#number-of-comments-returned) 8 | - [Structured Comments](#structured-comments) 9 | - [Raw Comments](#raw-comments) 10 | 11 | # Submission Comments 12 | 13 | ![Submission Comments Demo GIF][submission comments demo] 14 | 15 | \*_This GIF has been cut for demonstration purposes._ 16 | 17 | ## All Flags 18 | 19 | These are all the flags that may be used when scraping submission comments. 20 | 21 | ``` 22 | [-c ] 23 | [--raw] 24 | ``` 25 | 26 | ## Usage 27 | 28 | ``` 29 | poetry run Urs.py -c 30 | ``` 31 | 32 | Submission metadata will be included in the `submission_metadata` field and includes the following attributes: 33 | 34 | - `author` 35 | - `created_utc` 36 | - `distinguished` 37 | - `edited` 38 | - `is_original_content` 39 | - `is_self` 40 | - `link_flair_text` 41 | - `locked` 42 | - `nsfw` 43 | - `num_comments` 44 | - `permalink` 45 | - `score` 46 | - `selftext` 47 | - `spoiler` 48 | - `stickied` 49 | - `subreddit` 50 | - `title` 51 | - `upvote_ratio` 52 | 53 | If the submission contains a gallery, the attributes `gallery_data` and `media_metadata` will be included. 54 | 55 | Comments are written to the `comments` field. They are sorted by "Best", which is the default sorting option when you visit a submission. 56 | 57 | PRAW returns submission comments in level order, which means scrape speeds are proportional to the submission's popularity. 58 | 59 | ## File Naming Conventions 60 | 61 | The file names will generally follow this format: 62 | 63 | ``` 64 | [POST_TITLE]-[N_RESULTS]-result(s).json 65 | ``` 66 | 67 | Scrape data is exported to the `comments` directory. 68 | 69 | ## Number of Comments Returned 70 | 71 | You can scrape all comments from a submission by passing in `0` for ``. Subsequently, `[N_RESULTS]-result(s)` in the file name will be replaced with `all`. 72 | 73 | Otherwise, specify the number of results you want returned. 
If you passed in a specific number of results, the structured export will return up to `` top level comments and include all of its replies. 74 | 75 | ## Structured Comments 76 | 77 | **This is the default export style.** Structured scrapes resemble comment threads on Reddit. This style takes just a little longer to export compared to the raw format because `URS` uses [depth-first search][depth-first search] to create the comment `Forest` after retrieving all comments from a submission. 78 | 79 | If you want to learn more about how it works, refer to [The Forest](../implementation-details/the-forest.md), where I describe how I implemented the `Forest`, and [Speeding up Python With Rust](../implementation-details/speeding-up-python-with-rust.md) to learn about how I drastically improved the performance of the `Forest` by rewriting it in Rust. 80 | 81 | ## Raw Comments 82 | 83 | Raw scrapes do not resemble comment threads, but returns all comments on a submission in level order: all top-level comments are listed first, followed by all second-level comments, then third, etc. 84 | 85 | You can export to raw format by including the `--raw` flag. `-raw` will also be appended to the end of the file name. 86 | 87 | [depth-first search]: https://www.interviewcake.com/concept/java/dfs 88 | [submission comments demo]: https://github.com/JosephLai241/URS/blob/demo-gifs/praw_scrapers/static_scrapers/submission_comments_demo.gif?raw=true 89 | -------------------------------------------------------------------------------- /.github/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 
39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at urs_project@protonmail.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /manual/src/exporting.md: -------------------------------------------------------------------------------- 1 | # Exporting 2 | 3 | ## Table of Contents 4 | 5 | - [Export File Format](#export-file-format) 6 | - [Exporting to CSV](#exporting-to-csv) 7 | - [Export Directory Structure](#export-directory-structure) 8 | - [PRAW Scrapers](#praw-scrapers) 9 | - [PRAW Livestream Scrapers](#praw-livestream-scrapers) 10 | - [Analytical Tools](#analytical-tools) 11 | - [Example Directory Structure](#example-directory-structure) 12 | 13 | ## Export File Format 14 | 15 | **All files except for those generated by the wordcloud tool are exported to JSON by default**. Wordcloud files are exported to PNG by default. 16 | 17 | `URS` supports exporting to CSV as well, but JSON is the more versatile option. 18 | 19 | ### Exporting to CSV 20 | 21 | You will have to include the `--csv` flag to export to CSV. 22 | 23 | You can only export to CSV when using: 24 | 25 | - The Subreddit scrapers. 26 | - The word frequencies generator. 27 | 28 | These tools are also suitable for CSV format and are optimized to do so if you want to use that format instead. 29 | 30 | The `--csv` flag is ignored if it is present while using any of the other scrapers. 31 | 32 | ## Export Directory Structure 33 | 34 | All exported files are saved within the `scrapes` directory and stored in a sub-directory labeled with the date. Many more sub-directories may be created in the date directory. 
Sub-directories are only created when its respective tool is run. For example, if you only use the Subreddit scraper, only the `subreddits` directory is created. 35 | 36 | ### PRAW Scrapers 37 | 38 | The `subreddits`, `redditors`, or `comments` directories may be created. 39 | 40 | ### PRAW Livestream Scrapers 41 | 42 | The `livestream` directory is created when you run any of the livestream scrapers. Within it, the `subreddits` or `redditors` directories may be created. 43 | 44 | ### Analytical Tools 45 | 46 | The `analytics` directory is created when you run any of the analytical tools. Within it, the `frequencies` or `wordclouds` directories may be created. See the [Analytical Tools](./analytical-tools/general-information.md) section for more information. 47 | 48 | ### Example Directory Structure 49 | 50 | This is the [samples][samples] directory structure generated by [`nomad`][nomad], a modern `tree` alternative I wrote in [Rust][rust]. 51 | 52 | ``` 53 | scrapes/ 54 | └── 06-02-2021 55 | ├── analytics 56 | │   ├── frequencies 57 | │   │   ├── comments 58 | │   │   │   └── What’s something from the 90s you miss_-all.json 59 | │   │   ├── livestream 60 | │   │   │   └── subreddits 61 | │   │   │   └── askreddit-comments-20_44_11-00_01_10.json 62 | │   │   └── subreddits 63 | │   │   └── cscareerquestions-search-'job'-past-year-rules.json 64 | │   └── wordcloud 65 | │   ├── comments 66 | │   │   └── What’s something from the 90s you miss_-all.png 67 | │   ├── livestream 68 | │   │   └── subreddits 69 | │   │   └── askreddit-comments-20_44_11-00_01_10.png 70 | │   └── subreddits 71 | │   └── cscareerquestions-search-'job'-past-year-rules.png 72 | ├── comments 73 | │   └── What’s something from the 90s you miss_-all.json 74 | ├── livestream 75 | │   └── subreddits 76 | │   ├── askreddit-comments-20_44_11-00_01_10.json 77 | │   └── askreddit-submissions-20_46_12-00_01_52.json 78 | ├── redditors 79 | │   └── spez-5-results.json 80 | ├── subreddits 81 | │   ├── askreddit-hot-10-results.json 82 | │   └── cscareerquestions-search-'job'-past-year-rules.json 83 | └── urs.log 84 | ``` 85 | 86 | [nomad]: https://github.com/JosephLai241/nomad 87 | [samples]: https://github.com/JosephLai241/URS/tree/samples 88 | [rust]: https://www.rust-lang.org/ 89 | -------------------------------------------------------------------------------- /urs/utils/Global.py: -------------------------------------------------------------------------------- 1 | """ 2 | Global variables 3 | ================ 4 | Variables, functions, and classes that are used throughout this program. 5 | """ 6 | 7 | 8 | import datetime as dt 9 | from typing import Any, Dict, List, Union 10 | 11 | from halo import Halo 12 | 13 | # Get current date. 14 | date = dt.datetime.now().strftime("%Y-%m-%d") 15 | 16 | # Subreddit categories. 17 | categories = ["Hot", "New", "Controversial", "Top", "Rising", "Search"] 18 | short_cat = [cat[0] for cat in categories] 19 | 20 | 21 | def convert_time(raw_timestamp: float) -> str: 22 | """ 23 | Convert UNIX time to readable format. 24 | 25 | :param float raw_timestamp: A UNIX timestamp. 26 | 27 | :returns: The timestamp converted into a readable format. 28 | :rtype: `str` 29 | """ 30 | 31 | return dt.datetime.fromtimestamp(raw_timestamp).strftime("%Y-%m-%d %H:%M:%S") 32 | 33 | 34 | def confirm_settings() -> Union[str, None]: 35 | """ 36 | Confirm scraping options. 37 | 38 | :raises ValueError: Raised if the confirmation input is invalid. 
39 | 40 | :returns: A `str` denoting whether to confirm settings and continue scraping, 41 | or `None` if the operation is cancelled. 42 | :rtype: `str | None` 43 | """ 44 | 45 | options = ["y", "n"] 46 | 47 | while True: 48 | try: 49 | confirm = input("\nConfirm options? [Y/N] ").strip().lower() 50 | 51 | if confirm == options[0]: 52 | return confirm 53 | elif confirm == options[1]: 54 | break 55 | elif confirm not in options: 56 | raise ValueError 57 | except ValueError: 58 | print("Not an option! Try again.") 59 | 60 | 61 | def make_list_dict(keys: List[str]) -> Dict[str, List[Any]]: 62 | """ 63 | Initialize a dictionary of keys with empty lists as values. 64 | 65 | :param list[str] keys: A `list[str]` of keys used to initialize a dictionary. 66 | 67 | :returns: A `dict[str, list[any]]` initialized with the keys in the `keys` 68 | `list[str]` and empty arrays as its values. 69 | """ 70 | 71 | return dict((key, []) for key in keys) 72 | 73 | 74 | def make_none_dict(keys: List[str]) -> Dict[str, None]: 75 | """ 76 | Initialize a dictionary of keys with `None` as values. 77 | 78 | :param list[str] keys: A `list[str]` of keys used to initialize a dictionary. 79 | 80 | :returns: A `dict[str, list[any]]` initialized with the keys in the `keys` 81 | `list[str]` and `None` as its values. 82 | """ 83 | 84 | return dict((key, None) for key in keys) 85 | 86 | 87 | class Status: 88 | """ 89 | Methods for defining status spinners. 90 | """ 91 | 92 | def __init__(self, after_message: str, before_message: str, color: str) -> None: 93 | """ 94 | Initialize variables used in later methods: 95 | 96 | self._after_message: success message 97 | self._before_message: status message 98 | self._color: the color of the spinner 99 | 100 | self._spinner: Halo instance 101 | 102 | :param str after_message: The success message to display. 103 | :param str before_message: The status message to display. 104 | :param str color: The spinner's color. 105 | """ 106 | 107 | self._after_message = after_message 108 | self._before_message = before_message 109 | self._color = color 110 | 111 | self.spinner = Halo(color=self._color, text=self._before_message) 112 | 113 | def start(self) -> None: 114 | """ 115 | Start the spinner. 116 | """ 117 | 118 | self.spinner.start() 119 | 120 | def succeed(self) -> None: 121 | """ 122 | Display the success spinner message. 123 | """ 124 | 125 | self.spinner.succeed(self._after_message) 126 | -------------------------------------------------------------------------------- /manual/src/scraping-reddit/subreddit.md: -------------------------------------------------------------------------------- 1 | # Table of Contents 2 | 3 | - [Subreddits](#subreddits) 4 | - [All Flags](#all-flags) 5 | - [Basic Usage](#basic-usage) 6 | - [Filename Naming Conventions](#filename-naming-conventions) 7 | - [Time Filters](#time-filters) 8 | - [Filename Naming Conventions](#filename-naming-conventions-1) 9 | - [Subreddit Rules and Post Requirements](#subreddit-rules-and-post-requirements) 10 | - [Bypassing the Final Settings Check](#bypassing-the-final-settings-check) 11 | 12 | # Subreddits 13 | 14 | ![Subreddit Demo GIF][subreddit demo] 15 | 16 | ## All Flags 17 | 18 | These are all the flags that may be used when scraping Subreddits. 19 | 20 | ``` 21 | [-r <(h|n|c|t|r|s)> []] 22 | [-y] 23 | [--csv] 24 | [--rules] 25 | ``` 26 | 27 | ## Basic Usage 28 | 29 | ``` 30 | poetry run Urs.py -r <(h|n|c|t|r|s)> 31 | ``` 32 | 33 | **Supports exporting to CSV.** To export to CSV, include the `--csv` flag. 
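For example, a hypothetical run that grabs the 10 hottest submissions from r/askreddit and exports them to CSV would look something like this:

```
poetry run Urs.py -r askreddit h 10 --csv
```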
34 | 35 | Specify Subreddits, the submission category, and how many results are returned from each scrape. I have also added a search option where you can search for keywords within a Subreddit. 36 | 37 | These are the submission categories: 38 | 39 | - Hot 40 | - New 41 | - Controversial 42 | - Top 43 | - Rising 44 | - Search 45 | 46 | ## Filename Naming Conventions 47 | 48 | The file names for all categories except for Search will follow this format: 49 | 50 | ``` 51 | [SUBREDDIT]-[POST_CATEGORY]-[N_RESULTS]-result(s).[FILE_FORMAT] 52 | ``` 53 | 54 | If you searched for keywords, file names will follow this format: 55 | 56 | ``` 57 | [SUBREDDIT]-Search-'[KEYWORDS]'.[FILE_FORMAT] 58 | ``` 59 | 60 | Scrape data is exported to the `subreddits` directory. 61 | 62 | > **_NOTE:_** Up to 100 results are returned if you search for keywords within a Subreddit. You will not be able to specify how many results to keep. 63 | 64 | # Time Filters 65 | 66 | Time filters may be applied to some categories. Here is a table of the categories on which you can apply a time filter as well as the valid time filters. 67 | 68 | | Categories | Time Filters | 69 | | ------------- | ------------- | 70 | | Controversial | All (default) | 71 | | Search | Day | 72 | | Top | Hour | 73 | | | Month | 74 | | | Week | 75 | | | Year | 76 | 77 | Specify the time filter after the number of results returned or keywords you want to search for: 78 | 79 | ``` 80 | poetry run Urs.py -r <(c|t|s)> [] 81 | ``` 82 | 83 | If no time filter is specified, the default time filter `all` is applied. The Subreddit settings table will display `None` for categories that do not offer the additional time filter option. 84 | 85 | ## Filename Naming Conventions 86 | 87 | If you specified a time filter, `-past-[TIME_FILTER]` will be appended to the file name before the file format like so: 88 | 89 | ``` 90 | [SUBREDDIT]-[POST_CATEGORY]-[N_RESULTS]-result(s)-past-[TIME_FILTER].[FILE_FORMAT] 91 | ``` 92 | 93 | Or if you searched for keywords: 94 | 95 | ``` 96 | [SUBREDDIT]-Search-'[KEYWORDS]'-past-[TIME_FILTER].[FILE_FORMAT] 97 | ``` 98 | 99 | # Subreddit Rules and Post Requirements 100 | 101 | You can also include the Subreddit's rules and post requirements in your scrape data by including the `--rules` flag. **This is only compatible with JSON**. This data will be included in the `subreddit_rules` field. 102 | 103 | If rules are included in your file, `-rules` will be appended to the end of the file name. 104 | 105 | # Bypassing the Final Settings Check 106 | 107 | After submitting the arguments and Reddit validation, `URS` will display a table of Subreddit scraping settings as a final check before executing. You can include the `-y` flag to bypass this and immediately scrape. 108 | 109 | [subreddit demo]: https://github.com/JosephLai241/URS/blob/demo-gifs/praw_scrapers/static_scrapers/Subreddit_demo.gif?raw=true 110 | -------------------------------------------------------------------------------- /urs/utils/Tools.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tools 3 | ===== 4 | Running all tools that URS has to offer. 
5 | """ 6 | 7 | 8 | import logging 9 | from argparse import ArgumentParser, Namespace 10 | from typing import Tuple 11 | 12 | from praw import Reddit 13 | 14 | from urs.analytics.Frequencies import GenerateFrequencies 15 | from urs.analytics.Wordcloud import GenerateWordcloud 16 | from urs.praw_scrapers.live_scrapers.Livestream import Livestream 17 | from urs.praw_scrapers.static_scrapers.Basic import RunBasic 18 | from urs.praw_scrapers.static_scrapers.Comments import RunComments 19 | from urs.praw_scrapers.static_scrapers.Redditor import RunRedditor 20 | from urs.praw_scrapers.static_scrapers.Subreddit import RunSubreddit 21 | from urs.praw_scrapers.utils.Validation import Validation 22 | from urs.utils.Cli import CheckCli, Parser 23 | from urs.utils.Titles import MainTitle 24 | from urs.utils.Utilities import DateTree 25 | 26 | 27 | class Run: 28 | """ 29 | Methods to call CLI and all tools. 30 | """ 31 | 32 | def __init__(self, reddit: Reddit) -> None: 33 | """ 34 | Initialize variables used in instance methods: 35 | 36 | self._reddit: Reddit instance 37 | self._args: argparse Namespace object 38 | self._parser: argparse ArgumentParser object 39 | 40 | :param Reddit reddit: PRAW `Reddit` object. 41 | """ 42 | 43 | self._reddit = reddit 44 | self._args, self._parser = self._introduce_then_args() 45 | 46 | def _introduce_then_args(self) -> Tuple[Namespace, ArgumentParser]: 47 | """ 48 | Print title, then run checks for CLI args and PRAW credentials. 49 | 50 | :returns: The `Namespace` and `ArgumentParser` objects. 51 | :rtype: `(Namespace, ArgumentParser)` 52 | """ 53 | 54 | MainTitle.title() 55 | 56 | args, parser = Parser().parse_args() 57 | CheckCli().check_args(args) 58 | 59 | return args, parser 60 | 61 | def run_urs(self) -> None: 62 | """ 63 | Switch for running all URS tools. 64 | """ 65 | 66 | if self._args.check: 67 | """ 68 | Run rate limit check. 69 | """ 70 | 71 | logging.info("RUNNING API CREDENTIALS CHECK.") 72 | logging.info("") 73 | 74 | Validation.validate_user(self._parser, self._reddit) 75 | 76 | elif self._args.tree: 77 | """ 78 | Display visual directory tree for a date (default is the current date). 79 | """ 80 | 81 | DateTree.display_tree(self._args.tree) 82 | 83 | elif ( 84 | self._args.subreddit 85 | or self._args.redditor 86 | or self._args.comments 87 | or self._args.basic 88 | ): 89 | """ 90 | Run PRAW scrapers. 91 | """ 92 | 93 | Validation.validate_user(self._parser, self._reddit) 94 | 95 | if self._args.subreddit: 96 | RunSubreddit.run(self._args, self._reddit) 97 | if self._args.redditor: 98 | RunRedditor.run(self._args, self._reddit) 99 | if self._args.comments: 100 | RunComments.run(self._args, self._reddit) 101 | elif self._args.basic: 102 | RunBasic.run(self._args, self._parser, self._reddit) 103 | 104 | elif self._args.live_subreddit or self._args.live_redditor: 105 | """ 106 | Run PRAW livestream scrapers. 107 | """ 108 | 109 | Validation.validate_user(self._parser, self._reddit) 110 | Livestream.stream(self._args, self._reddit) 111 | 112 | elif self._args.frequencies or self._args.wordcloud: 113 | """ 114 | Run analytical tools. 
115 | """ 116 | 117 | if self._args.frequencies: 118 | GenerateFrequencies.generate(self._args) 119 | if self._args.wordcloud: 120 | GenerateWordcloud.generate(self._args) 121 | -------------------------------------------------------------------------------- /.github/STYLE_GUIDE.md: -------------------------------------------------------------------------------- 1 | # URS Style Guide 2 | 3 | ## Table of Contents 4 | 5 | - [Code Formatting](#code-formatting) 6 | - [`Black` Formatting](#black-formatting) 7 | - [`isort` Formatting](#isort-formatting) 8 | - [Docstring and Type Hint Etiquette](#docstring-and-type-hint-etiquette) 9 | - [Unit Testing Code](#unit-testing-code) 10 | 11 | ## Code Formatting 12 | 13 | The rules for code formatting are very simple -- **all formatting rules are delegated to [`Black`][black] and [`isort`][isort]**. 14 | 15 | ### `Black` Formatting 16 | 17 | Use the standard formatting rules when formatting code with `Black`. Formatting code manually is a very simple command: 18 | 19 | ``` 20 | black urs/ 21 | ``` 22 | 23 | ### `isort` Formatting 24 | 25 | When formatting imports with `isort`, you will have to specify the `profile` setting when running the command to allow for interoperability between code styles. In this case, you will have to tell `isort` to use the `black` profile since we are formatting everything else with `Black`. The command looks something like this: 26 | 27 | ``` 28 | isort urs/ --profile black 29 | ``` 30 | 31 | ## Docstring and Type Hint Etiquette 32 | 33 | **Every single function needs a docstring describing what the function does, its parameters, and what it returns (if applicable)**, even if the function name is self-explanatory. The docstring format used by `URS` is the reStructuredText (RST) format. See the [Real Python reStructuredText example][real python restructuredtext example] for an idea as to how it looks. 34 | 35 | Docstrings have a max character count of 80 characters. If the function description docstring exceeds 80 characters, continue typing on a new line. If parameter, exception, or return docstrings exceed 80 characters, create a new line, tab in (4 spaces), and continue typing. 36 | 37 | Parameters, exceptions, and return statements should be grouped together/separated by a newline. Refer to the Python codeblock below for an example. 38 | 39 | **Every single function also requires type hints for its parameters and return type**, even if the parameter name is self-explanatory. See this [Real Python type hint tutorial][real python type hint tutorial] if you are unfamiliar with type hints. 40 | 41 | Below is an example of a properly documented function: 42 | 43 | ```python 44 | def add_two_numbers(first_number: int, second_number: int) -> int: 45 | """ 46 | Returns the sum of two numbers. 47 | 48 | :param int first_number: The first number to add. 49 | :param int second_number: The second number to add. 50 | 51 | :raises ValueError: Raised if either `first_number` or `second_number` is not 52 | an `int`. 53 | 54 | :returns: The sum of two numbers 55 | :rtype: `int` 56 | """ 57 | 58 | if not isinstance(first_number, int) or not isinstance(second_number, int): 59 | raise ValueError("Can only add two integers together!") 60 | 61 | return first_number + second_number 62 | ``` 63 | 64 | ## Unit Testing Code 65 | 66 | Every method in URS has to be wrapped in a class for unit testing. This makes it easier to add and group tests if features are added to a method in the future. 
67 | 68 | Showing an example would be the best way to describe how unit tests should be named and structured: 69 | 70 | `_list_switch()` is a method found in `Cli.py` within the `GetScrapeSettings` class: 71 | 72 | ```python 73 | class GetScrapeSettings(): 74 | """ 75 | Methods for creating data structures to store scrape settings. 76 | """ 77 | 78 | def _list_switch(self, args, index): 79 | ... 80 | ``` 81 | 82 | The unit test for this function is located in the `tests/` directory in the file `test_Cli.py` and looks like this: 83 | 84 | ```python 85 | class TestGetScrapeSettingsListSwitchMethod(): 86 | """ 87 | Testing GetScrapeSettings class _list_switch() method. 88 | """ 89 | 90 | def test_list_switch_method_first_switch(self): 91 | ... 92 | 93 | def test_list_switch_method_second_switch(self): 94 | ... 95 | 96 | def test_list_switch_method_third_switch(self): 97 | ... 98 | ``` 99 | 100 | The unit test class will use the following naming convention: 101 | 102 | ```python 103 | class Test[CamelCaseClassName][CamelCaseMethodName]Method(): 104 | ... 105 | ``` 106 | 107 | Include a block comment under the unit test class using the following convention: 108 | 109 | ``` 110 | Testing [ClassName] class [method_name()] method. 111 | ``` 112 | 113 | The unit test method will use the following naming convention: 114 | 115 | ```python 116 | def test_[underscored_method_name]_[underscored_test_case](self): 117 | ... 118 | ``` 119 | 120 | 121 | 122 | [black]: https://black.readthedocs.io/en/stable/ 123 | [isort]: https://pycqa.github.io/isort/ 124 | [real python type hint tutorial]: https://realpython.com/lessons/type-hinting/ 125 | [real python restructuredtext example]: https://realpython.com/documenting-python-code/#restructuredtext-example 126 | -------------------------------------------------------------------------------- /tests/test_praw_scrapers/test_static_scrapers/test_Basic.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | from io import StringIO 4 | 5 | from urs.praw_scrapers.static_scrapers import Basic 6 | 7 | ### Function names are pretty self-explanatory, so I will not be adding comments 8 | ### above the functions. 9 | 10 | ### Includes a total of 30 tests. 11 | 12 | 13 | class MakeArgs: 14 | """ 15 | Making dummy args to test Basic.py functions. 16 | """ 17 | 18 | @staticmethod 19 | def parser_for_testing_basic(): 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument("--basic", action="store_true") 22 | return parser 23 | 24 | 25 | class TestPrintSubsFindSubsMethod: 26 | """ 27 | Testing PrintSubs class _find_subs() method. 28 | """ 29 | 30 | def test_find_subs_only_returning_found_subreddits(self): 31 | pass 32 | 33 | def test_find_subs_only_returning_not_found_subreddits(self): 34 | pass 35 | 36 | def test_find_subs_returning_both_found_and_not_found_subreddits(self): 37 | pass 38 | 39 | 40 | class TestPrintSubsPrintSubredditsMethod: 41 | """ 42 | Testing PrintSubs class print_subreddits() method. 43 | """ 44 | 45 | def test_print_subreddits_only_printing_found_subreddits(self): 46 | pass 47 | 48 | def test_print_subreddits_only_printing_not_found_subreddits(self): 49 | pass 50 | 51 | def test_print_subreddits_printing_both_found_and_not_found_subreddits(self): 52 | pass 53 | 54 | 55 | class TestGetInputGetSubredditsMethod: 56 | """ 57 | Testing GetInput class get_subreddits() method. 
58 | """ 59 | 60 | def test_get_input_get_subreddits_no_input_from_user(self): 61 | pass 62 | 63 | def test_get_input_get_subreddits_valid_input(self): 64 | pass 65 | 66 | 67 | class TestGetInputUpdateMasterMethod: 68 | """ 69 | Testing GetInput class _update_master() method. 70 | """ 71 | 72 | def test_update_master_not_search_category(self): 73 | cat_i = 0 74 | test_master = {"test_subreddit": []} 75 | search_for = 10 76 | sub = "test_subreddit" 77 | 78 | Basic.GetInput()._update_master(cat_i, test_master, search_for, sub) 79 | 80 | assert test_master == {"test_subreddit": [["h", 10, None]]} 81 | 82 | def test_update_master_search_category(self): 83 | cat_i = 5 84 | test_master = {"test_subreddit": []} 85 | search_for = "test string" 86 | sub = "test_subreddit" 87 | 88 | Basic.GetInput()._update_master(cat_i, test_master, search_for, sub) 89 | 90 | assert test_master == {"test_subreddit": [["s", "test string", "all"]]} 91 | 92 | 93 | class TestGetInputGetSearchMethod: 94 | """ 95 | Testing GetInput class _get_search() method. 96 | """ 97 | 98 | def test_get_input_search_for_is_a_number(self): 99 | pass 100 | 101 | def test_get_input_search_for_is_a_string(self): 102 | pass 103 | 104 | def test_get_input_search_for_no_input(self): 105 | pass 106 | 107 | 108 | class TestGetInputGetNResultsMethod: 109 | """ 110 | Testing GetInput class _get_n_results() method. 111 | """ 112 | 113 | def test_get_n_results_normal_input(self): 114 | pass 115 | 116 | def test_get_n_results_invalid_input(self): 117 | pass 118 | 119 | def test_get_n_results_no_input(self): 120 | pass 121 | 122 | 123 | class TestGetInputGetSettingsMethod: 124 | """ 125 | Testing GetInput class get_settings() method. 126 | """ 127 | 128 | def test_get_settings_selected_search_option(self): 129 | pass 130 | 131 | def test_get_settings_selected_other_category_option(self): 132 | pass 133 | 134 | def test_get_settings_invalid_option_out_of_range(self): 135 | pass 136 | 137 | def test_get_settings_invalid_option_is_not_a_number(self): 138 | pass 139 | 140 | 141 | class TestConfirmInputConfirmSubredditsMethod: 142 | """ 143 | Testing ConfirmInput class confirm_subreddits() method. 144 | """ 145 | 146 | def test_confirm_subreddits_selected_yes(self): 147 | pass 148 | 149 | def test_confirm_subreddits_selected_no(self): 150 | pass 151 | 152 | def test_confirm_subreddits_invalid_option(self): 153 | pass 154 | 155 | 156 | class TestConfirmInputAnotherMethod: 157 | """ 158 | Testing ConfirmInput class another() method. 159 | """ 160 | 161 | def test_another_selected_yes(self): 162 | pass 163 | 164 | def test_another_selected_no(self): 165 | pass 166 | 167 | def test_another_invalid_option(self): 168 | pass 169 | 170 | 171 | class TestRunBasicCreateSettingsMethod: 172 | """ 173 | Testing RunBasic class _create_settings() method. 174 | """ 175 | 176 | def test_create_settings(self): 177 | pass 178 | 179 | 180 | class TestRunBasicPrintConfirmMethod: 181 | """ 182 | Testing RunBasic class _print_confirm() method. 183 | """ 184 | 185 | def test_print_confirm(self): 186 | pass 187 | -------------------------------------------------------------------------------- /urs/analytics/Wordcloud.py: -------------------------------------------------------------------------------- 1 | """ 2 | Wordcloud Generator 3 | =================== 4 | Generate a wordcloud based on word frequencies extracted from scraped data. 
5 | """ 6 | 7 | 8 | from argparse import Namespace 9 | from pathlib import Path 10 | from typing import List 11 | 12 | import matplotlib.pyplot as plt 13 | from colorama import Fore, Style 14 | from halo import Halo 15 | from wordcloud import WordCloud 16 | 17 | from urs.analytics.utils.PrepData import GetPath, PrepData 18 | from urs.utils.Global import Status 19 | from urs.utils.Logger import LogAnalytics 20 | from urs.utils.Titles import AnalyticsTitles 21 | 22 | 23 | class SetUpWordcloud: 24 | """ 25 | Methods for setting up the wordcloud. 26 | """ 27 | 28 | @staticmethod 29 | def initialize_wordcloud(file: List[str], scrape_type: str) -> WordCloud: 30 | """ 31 | Initialize wordcloud by setting dimensions, max font size, and generating 32 | it from word frequencies. 33 | 34 | :param list[str] file: A `list[str]` containing scrape files and file 35 | formats to generate wordclouds with. 36 | :param str scrape_type: The scrape type. 37 | 38 | :returns: A `WordCloud` instance. 39 | :rtype: `WordCloud` 40 | """ 41 | 42 | frequencies = PrepData.prep(file[0], scrape_type) 43 | 44 | initialize_status = Status( 45 | "Generated wordcloud.", "Generating wordcloud.", "white" 46 | ) 47 | 48 | initialize_status.start() 49 | wordcloud = WordCloud( 50 | height=1200, max_font_size=400, width=1600 51 | ).generate_from_frequencies(frequencies) 52 | initialize_status.succeed() 53 | 54 | return wordcloud 55 | 56 | @staticmethod 57 | def modify_wordcloud(wc: WordCloud): 58 | """ 59 | Further modify wordcloud preferences. 60 | 61 | :param WordCloud wc: The `WordCloud` instance. 62 | 63 | :returns: A `matplotlib.pyplot` instance. 64 | :rtype: `matplotlib.pyplot` 65 | """ 66 | 67 | plt.imshow(wc, interpolation="bilinear") 68 | plt.axis("off") 69 | 70 | return plt 71 | 72 | 73 | class FinalizeWordcloud: 74 | """ 75 | Methods for either saving or displaying the wordcloud. 76 | """ 77 | 78 | @LogAnalytics.log_show("wordcloud") 79 | def show_wordcloud(self, plt) -> None: 80 | """ 81 | Display wordcloud. 82 | 83 | :param matplotlib.pyplot plt: A `matplotlib.pyplot` instance. 84 | """ 85 | 86 | Halo().info(Style.BRIGHT + Fore.GREEN + "Displaying wordcloud.") 87 | print() 88 | 89 | plt.show() 90 | 91 | @LogAnalytics.log_save("wordcloud") 92 | def save_wordcloud( 93 | self, analytics_dir: str, scrape_file: List[str], wc: WordCloud 94 | ) -> str: 95 | """ 96 | Save wordcloud to file. 97 | 98 | :param str analytics_dir: the path to the directory in which the analytical 99 | data will be written. 100 | :param list[str] scrape_file: A `list[str]` containing scrape files and 101 | file formats to generate wordclouds with. 102 | :param WordCloud wc: The `WordCloud` instance. 103 | 104 | :returns: The filename for the exported wordcloud. 105 | :rtype: `str` 106 | """ 107 | 108 | filename = GetPath.name_file(analytics_dir, scrape_file[0]) 109 | 110 | split_path = list(Path(filename).parts) 111 | 112 | split_filename = split_path[-1].split(".") 113 | split_filename[-1] = scrape_file[-1] 114 | 115 | split_path[-1] = ".".join(split_filename) 116 | new_filename = "/".join(split_path) 117 | 118 | export_status = Status( 119 | Style.BRIGHT + Fore.GREEN + f"Wordcloud exported to {new_filename}.", 120 | "Exporting wordcloud.", 121 | "white", 122 | ) 123 | 124 | export_status.start() 125 | wc.to_file(new_filename) 126 | export_status.succeed() 127 | print() 128 | 129 | return new_filename 130 | 131 | 132 | class GenerateWordcloud: 133 | """ 134 | Methods for generating a wordcloud. 
135 | """ 136 | 137 | @staticmethod 138 | @LogAnalytics.generator_timer("wordcloud") 139 | def generate(args: Namespace) -> None: 140 | """ 141 | Generate wordcloud. 142 | 143 | :param Namespace args: A `Namespace` object containing all arguments used 144 | in the CLI. 145 | """ 146 | 147 | AnalyticsTitles.wc_title() 148 | 149 | for scrape_file in args.wordcloud: 150 | analytics_dir, scrape_type = GetPath.get_scrape_type( 151 | scrape_file[0], "wordcloud" 152 | ) 153 | wc = SetUpWordcloud.initialize_wordcloud(scrape_file, scrape_type) 154 | plt = SetUpWordcloud.modify_wordcloud(wc) 155 | 156 | FinalizeWordcloud().show_wordcloud( 157 | plt 158 | ) if args.nosave else FinalizeWordcloud().save_wordcloud( 159 | analytics_dir, scrape_file, wc 160 | ) 161 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | # Summary 2 | 3 | Delete this line and write your summary here. Section your summary by relevance if it is lengthy. 4 | 5 | # Motivation/Context 6 | 7 | Delete this line and write your motivation/context here. Section your motivation/context by relevance if it is lengthy. 8 | 9 | # New Dependencies 10 | 11 | ``` 12 | Delete this line and paste your new dependencies here. Put "None" here if there are no new dependencies. 13 | ``` 14 | 15 | # Issue Fix or Enhancement Request 16 | 17 | **Put "N/A" in this block if this is not applicable.** 18 | 19 | If it fixes an open issue, link the issue and write a summary for the bug and fix like so: 20 | 21 | - Fixes #issue_number_here. 22 | - **Bug:** Write a brief description of the bug. 23 | - **Fix:** Write a brief description of the fix. 24 | - If applicable, add additional information for the fix. 25 | 26 | Alternatively, if it resolves an open feature/enhancement request, link the request in this pull request like so: 27 | 28 | - Resolves #issue_number_here. 29 | - **Enhancement/Feature Request:** Write a brief description of the enhancement/feature request. 30 | - **Enhancement or Feature:** Write a brief description of what is new in this pull request. 31 | - If applicable, add additional information for the enhancement or feature. 32 | 33 | If neither of the above apply, use the templates described above and replace the issue number with a summary of the new changes you have made. 34 | 35 | # Type of Change 36 | 37 | **Please delete options that are not relevant.** 38 | 39 | - [x] Bug Fix (non-breaking change which fixes an issue) 40 | - [x] Bug Fix - Breaking Change (breaking change causes existing functionality to not work as expected) 41 | - [x] Code Refactor 42 | - [x] New Feature (non-breaking change which adds functionality) 43 | - [x] New Feature - Breaking Change (breaking change causes existing functionality to not work as expected) 44 | - [x] This change requires a documentation update 45 | 46 | # Breaking Change 47 | 48 | **Put "N/A" in this block if this is not applicable.** 49 | 50 | Delete this line and describe how URS breaks. Then provide a code block or screenshots of the **_entire_** traceback underneath your description. Section your description by relevance if it is lengthy. 51 | 52 | ``` 53 | Paste entire traceback here. Make sure the traceback is formatted correctly. 
54 | ``` 55 | 56 | # List the Most Significant Changes That Have Been Made 57 | 58 | **Please delete sections and/or fields that are not relevant.** 59 | 60 | ## Added 61 | 62 | - Summary of some new feature 63 | - Description of the addition 64 | - Summary of some new feature 65 | - Description of the addition 66 | - Summary of some new feature 67 | - Description of the addition 68 | - Summary of some new feature 69 | - Description of the addition 70 | 71 | ## Changed 72 | 73 | - Summary of something that changed 74 | - Description of the change 75 | - Summary of something that changed 76 | - Description of the change 77 | - Summary of something that changed 78 | - Description of the change 79 | - Summary of something that changed 80 | - Description of the change 81 | 82 | ## Deprecated 83 | 84 | - Summary of something that has been deprecated 85 | - Summary of what has been deprecated 86 | - Summary of something that has been deprecated 87 | - Summary of what has been deprecated 88 | - Summary of something that has been deprecated 89 | - Summary of what has been deprecated 90 | - Summary of something that has been deprecated 91 | - Summary of what has been deprecated 92 | 93 | # How Has This Been Tested? 94 | 95 | **Put "N/A" in this block if this is not applicable.** 96 | 97 | Please describe the tests that you ran to verify your changes. Provide instructions so I can reproduce. Please also list any relevant details for your test configuration. Section your tests by relevance if it is lengthy. An example outline is shown below: 98 | 99 | - Summary of a test here 100 | - Details here with relevant test commands underneath. 101 | - Ran `test command here`. 102 | - If applicable, more details about the command underneath. 103 | - Then ran `another test command here`. 104 | 105 | ## Test Configuration 106 | 107 | **Put "N/A" in this block if this is not applicable.** 108 | 109 | - Python version: 3.x.x 110 | 111 | If applicable, describe more configuration settings. An example outline is shown below: 112 | 113 | - Summary goes here. 114 | - Configuration 1. 115 | - Configuration 2. 116 | - If applicable, provide extra details underneath a configuration. 117 | - Configuration 3. 118 | 119 | # `pyproject.toml` 120 | 121 | ```toml 122 | Paste your new `pyproject.toml` here. Put "N/A" in this block if this is not applicable. 123 | ``` 124 | 125 | # Checklist 126 | 127 | Tip: You can check off items by writing an "x" in the brackets, e.g. `[x]`. 128 | 129 | - [ ] My code follows the [style guidelines][style guide] of this project. 130 | - [ ] I have performed a self-review of my own code, including testing to ensure my fix is effective or that my feature works. 131 | - [ ] My changes generate no new warnings. 132 | - [ ] I have commented my code, providing a summary of the functionality of each method, particularly in areas that may be hard to understand. 133 | - [ ] I have made corresponding changes to the documentation. 134 | - [ ] I have performed a self-review of this Pull Request template, ensuring the Markdown file renders correctly. 135 | 136 | 137 | 138 | [style guide]: STYLE_GUIDE.md 139 | -------------------------------------------------------------------------------- /urs/analytics/Frequencies.py: -------------------------------------------------------------------------------- 1 | """ 2 | Frequencies generator 3 | ===================== 4 | Get frequencies for words that are found in submission titles, bodies, and/or 5 | comments within scraped data. 
6 | """
7 |
8 |
9 | from argparse import Namespace
10 | from typing import Any, Dict, List, Literal, Tuple, Union
11 |
12 | from colorama import Fore, Style
13 | from halo import Halo
14 |
15 | from urs.analytics.utils.PrepData import GetPath, PrepData
16 | from urs.utils.Export import Export
17 | from urs.utils.Global import Status
18 | from urs.utils.Logger import LogAnalytics
19 | from urs.utils.Titles import AnalyticsTitles
20 |
21 |
22 | class Sort:
23 |     """
24 |     Methods for sorting the frequencies data.
25 |     """
26 |
27 |     def get_data(self, scrape_file: List[str]) -> Tuple[str, Dict[str, int]]:
28 |         """
29 |         Get data from the scrape file.
30 |
31 |         :param list[str] scrape_file: A `list[str]` containing the path to the
32 |             scrape file to generate frequencies for.
33 |
34 |         :returns: The path to the directory in which the analytical data will be
35 |             written, and a `dict[str, int]` containing extracted scrape data.
36 |         :rtype: `(str, dict[str, int])`
37 |         """
38 |
39 |         analytics_dir, scrape_type = GetPath.get_scrape_type(
40 |             scrape_file[0], "frequencies"
41 |         )
42 |
43 |         return analytics_dir, PrepData.prep(scrape_file[0], scrape_type)
44 |
45 |     def name_and_create_dir(
46 |         self, analytics_dir: str, args: Namespace, scrape_file: List[str]
47 |     ) -> Tuple[Literal["csv", "json"], str]:
48 |         """
49 |         Name the new file and create the analytics directory.
50 |
51 |         :param str analytics_dir: The path to the directory in which the analytical
52 |             data will be written.
53 |         :param Namespace args: A `Namespace` object containing all arguments used
54 |             in the CLI.
55 |         :param list[str] scrape_file: A `list[str]` containing scrape files and
56 |             file formats to generate frequencies with.
57 |
58 |         :returns: The file format and the filename.
59 |         :rtype: `(str, str)`
60 |         """
61 |
62 |         f_type = "csv" if args.csv else "json"
63 |
64 |         filename = GetPath.name_file(analytics_dir, scrape_file[0])
65 |
66 |         return f_type, filename
67 |
68 |     def create_csv(self, plt_dict: Dict[str, int]) -> Dict[str, List[Union[str, int]]]:
69 |         """
70 |         Create CSV structure for exporting.
71 |
72 |         :param dict[str, int] plt_dict: A `dict[str, int]` containing word frequency
73 |             data.
74 |
75 |         :returns: A `dict[str, list[str | int]]` containing word frequency data.
76 |         :rtype: `Dict[str, List[Union[str, int]]]`
77 |         """
78 |
79 |         overview = {"words": [], "frequencies": []}
80 |
81 |         for word, frequency in plt_dict.items():
82 |             overview["words"].append(word)
83 |             overview["frequencies"].append(frequency)
84 |
85 |         return overview
86 |
87 |     def create_json(
88 |         self, plt_dict: Dict[str, int], scrape_file: List[str]
89 |     ) -> Dict[str, Any]:
90 |         """
91 |         Create JSON structure for exporting.
92 |
93 |         :param dict[str, int] plt_dict: A `dict[str, int]` containing word frequency
94 |             data.
95 |         :param list[str] scrape_file: A `list[str]` containing files and file
96 |             formats to generate frequencies with.
97 |
98 |         :returns: A `dict[str, Any]` containing the path to the raw scrape file and
99 |             the word frequency data.
100 |         :rtype: `Dict[str, Any]`
101 |         """
102 |
103 |         return {"raw_file": scrape_file[0], "data": plt_dict}
104 |
105 |
106 | class ExportFrequencies:
107 |     """
108 |     Methods for exporting the frequencies data.
109 |     """
110 |
111 |     @staticmethod
112 |     @LogAnalytics.log_export
113 |     def export(data: Dict[str, Any], f_type: str, filename: str) -> None:
114 |         """
115 |         Write data dictionary to JSON or CSV.
116 |
117 |         :param dict[str, Any] data: A dictionary containing word frequency data.
118 | :param str filename: The file name. 119 | """ 120 | 121 | Export.write_json(data, filename) if f_type == "json" else Export.write_csv( 122 | data, filename 123 | ) 124 | 125 | 126 | class GenerateFrequencies: 127 | """ 128 | Methods for generating word frequencies. 129 | """ 130 | 131 | @staticmethod 132 | @LogAnalytics.generator_timer("frequencies") 133 | def generate(args: Namespace) -> None: 134 | """ 135 | Generate frequencies. 136 | 137 | :param Namespace args: A `Namespace` object containing all arguments used 138 | in the CLI. 139 | """ 140 | 141 | AnalyticsTitles.f_title() 142 | 143 | for scrape_file in args.frequencies: 144 | analytics_dir, plt_dict = Sort().get_data(scrape_file) 145 | f_type, filename = Sort().name_and_create_dir( 146 | analytics_dir, args, scrape_file 147 | ) 148 | 149 | Halo().info("Generating frequencies.") 150 | print() 151 | data = ( 152 | Sort().create_csv(plt_dict) 153 | if args.csv 154 | else Sort().create_json(plt_dict, scrape_file) 155 | ) 156 | 157 | export_status = Status( 158 | Style.BRIGHT 159 | + Fore.GREEN 160 | + f"Frequencies exported to {'/'.join(filename.split('/')[filename.split('/').index('scrapes'):])}.", 161 | "Exporting frequencies.", 162 | "white", 163 | ) 164 | 165 | export_status.start() 166 | ExportFrequencies.export(data, f_type, filename) 167 | export_status.succeed() 168 | print() 169 | -------------------------------------------------------------------------------- /tests/test_analytics/test_utils/test_PrepData.py: -------------------------------------------------------------------------------- 1 | """ 2 | Testing `PrepData.py`. 3 | """ 4 | 5 | 6 | from urs.analytics.utils import PrepData 7 | 8 | 9 | class TestGetPathGetScrapeTypeMethod: 10 | """ 11 | Testing GetPath class get_scrape_type() method. 12 | """ 13 | 14 | def test_get_scrape_type_method_valid_filepath(self): 15 | test_path = "../scrapes/some_date/test/some_other_dir/some_file.json" 16 | 17 | analytics_dir, scrape_dir = PrepData.GetPath.get_scrape_type( 18 | test_path, "frequencies" 19 | ) 20 | 21 | assert ( 22 | analytics_dir 23 | == "../scrapes/some_date/analytics/frequencies/test/some_other_dir" 24 | ) 25 | assert scrape_dir == "test" 26 | 27 | def test_get_scrape_type_method_invalid_directory(self): 28 | test_path = "../scrapes/some_date/test/some_other_dir/some_file.txt" 29 | 30 | try: 31 | PrepData.GetPath.get_scrape_type(test_path, "frequencies") 32 | assert False 33 | except SystemExit: 34 | assert True 35 | 36 | def test_get_scrape_type_method_invalid_file_type(self): 37 | test_path = "../scrapes/some_date/analytics/some_other_dir/some_file.json" 38 | 39 | try: 40 | PrepData.GetPath.get_scrape_type(test_path, "wordcloud") 41 | assert False 42 | except SystemExit: 43 | assert True 44 | 45 | 46 | class TestGetPathNameFileMethod: 47 | """ 48 | Testing GetPath class name_file() method. 49 | """ 50 | 51 | def test_name_file_method(self): 52 | test_analytics = ( 53 | "../scrapes/some_date/analytics/frequencies/test/some_other_dir" 54 | ) 55 | test_path = "../something/another_thing/a_third_thing/test.json" 56 | 57 | filename = PrepData.GetPath.name_file(test_analytics, test_path) 58 | 59 | assert ( 60 | filename 61 | == "..\\scrapes\\some_date\\analytics\\frequencies\\test\\some_other_dir/test.json" 62 | if "\\" in filename 63 | else "../scrapes/some_date/analytics/frequencies/test/some_other_dir/test.json" 64 | ) 65 | 66 | 67 | class TestExtractExtractMethod: 68 | """ 69 | Testing Extract class extract() method. 
70 | """ 71 | 72 | def test_extract_method(self): 73 | pass 74 | 75 | 76 | class TestCleanDataRemoveExtrasMethod: 77 | """ 78 | Testing CleanData class _remove_extras() method. 79 | """ 80 | 81 | def test_remove_extras_method(self): 82 | test = "[t(e)s,t:i;n.}g{at`r]ing" 83 | 84 | assert PrepData.CleanData._remove_extras(test) == "t e s t i n g a s t r ing" 85 | 86 | 87 | class TestCleanDataCountWordsMethod: 88 | """ 89 | Testing CleanData class count_words() method. 90 | """ 91 | 92 | def test_count_words_method(self): 93 | plt_dict = dict() 94 | obj = { 95 | "first": "Some text here in the first field [(,", 96 | "second": "Another line of words here", 97 | } 98 | 99 | PrepData.CleanData.count_words("second", obj, plt_dict) 100 | 101 | assert plt_dict["Another"] == 1 102 | 103 | 104 | class TestPrepSubredditPrepSubredditMethod: 105 | """ 106 | Testing PrepSubreddit class prep_subreddit() method. 107 | """ 108 | 109 | def test_prep_subreddit_method(self): 110 | data = [ 111 | {"selftext": "This is a test selftext", "title": "This is a test title"}, 112 | {"selftext": "This is a test selftext", "title": "This is a test title"}, 113 | ] 114 | 115 | word_count = PrepData.PrepSubreddit.prep_subreddit(data) 116 | 117 | assert word_count["This"] == 4 118 | 119 | 120 | class TestPrepRedditorPrepRedditorMethod: 121 | """ 122 | Testing PrepRedditor class prep_redditor() method. 123 | """ 124 | 125 | def test_prep_redditor_method(self): 126 | data = { 127 | "interactions": { 128 | "comments": [ 129 | { 130 | "type": "comment", 131 | "body": "This is a test body", 132 | } 133 | ], 134 | "submissions": [ 135 | { 136 | "type": "submission", 137 | "selftext": "This is a test selftext", 138 | "title": "This is a test title", 139 | } 140 | ], 141 | "hidden": ["FORBIDDEN"], 142 | } 143 | } 144 | 145 | word_count = PrepData.PrepRedditor.prep_redditor(data) 146 | 147 | assert word_count["This"] == 3 148 | assert word_count["selftext"] == 1 149 | assert word_count["body"] == 1 150 | assert "FORBIDDEN" not in word_count.keys() 151 | 152 | 153 | class TestPrepCommentsPrepCommentsMethod: 154 | """ 155 | Testing PrepComments class prep_comments() method. 
156 |     """
157 |
158 |     def test_prep_comments_method_prep_raw_comments(self):
159 |         data = {
160 |             "scrape_settings": {"style": "raw"},
161 |             "data": {
162 |                 "comments": [
163 |                     {"body": "This is a test body"},
164 |                     {"body": "This is a test body"},
165 |                 ]
166 |             },
167 |         }
168 |
169 |         word_count = PrepData.PrepComments.prep_comments(data)
170 |
171 |         assert word_count["This"] == 2
172 |
173 |     def test_prep_comments_method_prep_structured_comments(self):
174 |         data = {
175 |             "scrape_settings": {"style": "structured"},
176 |             "data": {
177 |                 "comments": [
178 |                     {
179 |                         "body": "This is a test body",
180 |                         "replies": [{"body": "This is a test body", "replies": []}],
181 |                     }
182 |                 ]
183 |             },
184 |         }
185 |
186 |         word_count = PrepData.PrepComments.prep_comments(data)
187 |
188 |         assert word_count["test"] == 2
189 |
--------------------------------------------------------------------------------
/manual/src/scraping-reddit/redditor.md:
--------------------------------------------------------------------------------
1 | # Table of Contents
2 |
3 | - [Redditors](#redditors)
4 |   - [All Flags](#all-flags)
5 |   - [Usage](#usage)
6 |   - [Redditor Interaction Attributes](#redditor-interaction-attributes)
7 |   - [Reddit Object Attributes](#reddit-object-attributes)
8 |   - [File Naming Conventions](#file-naming-conventions)
9 |
10 | # Redditors
11 |
12 | ![Redditor Demo GIF][redditor demo]
13 |
14 | \*_This GIF has been cut for demonstration purposes._
15 |
16 | > **_NOTE:_** If you are not allowed to access a Redditor's lists, PRAW will raise a 403 HTTP Forbidden exception and the program will append `"FORBIDDEN"` underneath that section in the exported file.
17 |
18 | ## All Flags
19 |
20 | These are all the flags that may be used when scraping Redditors.
21 |
22 | ```
23 | [-u <redditor> <n_results>]
24 | ```
25 |
26 | > **_NOTE:_** The number of results returned is applied to all attributes. I have not implemented code that allows users to specify a different number of results for individual attributes.
27 |
28 | ## Usage
29 |
30 | ```
31 | poetry run Urs.py -u <redditor> <n_results>
32 | ```
33 |
34 | Redditor information will be included in the `information` field and includes the following attributes:
35 |
36 | - `comment_karma`
37 | - `created_utc`
38 | - `fullname`
39 | - `has_verified_email`
40 | - `icon_img`
41 | - `id`
42 | - `is_employee`
43 | - `is_friend`
44 | - `is_mod`
45 | - `is_gold`
46 | - `link_karma`
47 | - `name`
48 | - `subreddit`
49 | - `trophies`
50 |
51 | ## Redditor Interaction Attributes
52 |
53 | Redditor interactions will be included in the `interactions` field. Here is a table of all Redditor interaction attributes that are also included, how they are sorted, and what type of Reddit objects are included in each.
54 | 55 | | Attribute Name | Sorted By/Time Filter | Reddit Objects | 56 | | -------------- | ------------------------------------------- | ------------------------ | 57 | | Comments | Sorted By: New | Comments | 58 | | Controversial | Time Filter: All | Comments and submissions | 59 | | Downvoted | Sorted By: New | Comments and submissions | 60 | | Gilded | Sorted By: New | Comments and submissions | 61 | | Gildings | Sorted By: New | Comments and submissions | 62 | | Hidden | Sorted By: New | Comments and submissions | 63 | | Hot | Determined by other Redditors' interactions | Comments and submissions | 64 | | Moderated | N/A | Subreddits | 65 | | Multireddits | N/A | Multireddits | 66 | | New | Sorted By: New | Comments and submissions | 67 | | Saved | Sorted By: New | Comments and submissions | 68 | | Submissions | Sorted By: New | Submissions | 69 | | Top | Time Filter: All | Comments and submissions | 70 | | Upvoted | Sorted By: New | Comments and submissions | 71 | 72 | These attributes contain comments or submissions. Subreddit attributes are also included within both. 73 | 74 | ## Reddit Object Attributes 75 | 76 | This is a table of all attributes that are included for each Reddit object: 77 | 78 | | Subreddits | Comments | Submissions | Multireddits | Trophies | 79 | | ----------------------- | --------------- | --------------------- | ------------------ | ------------- | 80 | | `can_assign_link_flair` | `body` | `author` | `can_edit` | `award_id` | 81 | | `can_assign_user_flair` | `body_html` | `created_utc` | `copied_from` | `description` | 82 | | `created_utc` | `created_utc` | `distinguished` | `created_utc` | `icon_40` | 83 | | `description` | `distinguished` | `edited` | `description_html` | `icon_70` | 84 | | `description_html` | `edited` | `id` | `description_md` | `name` | 85 | | `display_name` | `id` | `is_original_content` | `display_name` | `url` | 86 | | `id` | `is_submitter` | `is_self` | `name` | | 87 | | `name` | `link_id` | `link_flair_text` | `nsfw` | | 88 | | `nsfw` | `parent_id` | `locked` | `subreddits` | | 89 | | `public_description` | `score` | `name` | `visibility` | | 90 | | `spoilers_enabled` | `stickied` | `num_comments` | | | 91 | | `subscribers` | \*`submission` | `nsfw` | | | 92 | | `user_is_banned` | `subreddit_id` | `permalink` | | | 93 | | `user_is_moderator` | | `score` | | | 94 | | `user_is_subscriber` | | `selftext` | | | 95 | | | | `spoiler` | | | 96 | | | | `stickied` | | | 97 | | | | \*`subreddit` | | | 98 | | | | `title` | | | 99 | | | | `upvote_ratio` | | | 100 | | | | `url` | | | 101 | 102 | \* Contains additional metadata. 103 | 104 | ## File Naming Conventions 105 | 106 | The file names will follow this format: 107 | 108 | ``` 109 | [USERNAME]-[N_RESULTS]-result(s).json 110 | ``` 111 | 112 | Scrape data is exported to the `redditors` directory. 113 | 114 | [redditor demo]: https://github.com/JosephLai241/URS/blob/demo-gifs/praw_scrapers/static_scrapers/Redditor_demo.gif?raw=true 115 | -------------------------------------------------------------------------------- /urs/utils/Utilities.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities 3 | ========= 4 | Miscellaneous utilities for URS. 
5 | """ 6 | 7 | 8 | import logging 9 | from pathlib import Path, PurePath 10 | from typing import List, Tuple 11 | 12 | import rich 13 | from colorama import Fore, Style 14 | from halo import Halo 15 | from rich.filesize import decimal 16 | from rich.tree import Tree 17 | 18 | from urs.utils.Global import Status 19 | from urs.utils.Titles import Errors 20 | 21 | 22 | class DateTree: 23 | """ 24 | Methods for creating a visual representation of a target date directory located 25 | within the `scrapes` directory. 26 | """ 27 | 28 | @staticmethod 29 | def _check_date_format(date: str) -> str: 30 | """ 31 | Check if the date format is valid. Revise date separation character if 32 | '/' was used instead of '-'. 33 | 34 | :param str date: The date of the scrapes directory. 35 | 36 | :raises TypeError: Raised if an invalid date format is entered. 37 | 38 | :returns: The date to search for. 39 | :rtype: `str` 40 | """ 41 | 42 | split_date = [char for char in date] 43 | 44 | if not any(char in split_date for char in ["-", "/"]) or len(split_date) < 10: 45 | raise TypeError 46 | 47 | if "/" in split_date: 48 | for i in range(len(split_date)): 49 | if split_date[i] == "/": 50 | split_date[i] = "-" 51 | 52 | return "".join(split_date) 53 | 54 | @staticmethod 55 | def _find_date_directory(date: str) -> bool: 56 | """ 57 | Traverse the `scrapes/` directory to find the corresponding date directory. 58 | 59 | :param str date: The date of the scrapes directory. 60 | 61 | :returns: Whether the date directory exists within the `scrapes/` directory. 62 | :rtype: `bool` 63 | """ 64 | 65 | dir_exists = False 66 | 67 | scrapes_dir = f"{Path(Path.cwd()).parents[0]}/scrapes" 68 | for path in Path(scrapes_dir).iterdir(): 69 | if path.is_dir() and PurePath(path).name == date: 70 | dir_exists = True 71 | 72 | return dir_exists 73 | 74 | @staticmethod 75 | def _create_stack(directory: str, tree: Tree) -> List[Tuple[Path, Tree]]: 76 | """ 77 | Create a stack containing paths within a directory. 78 | 79 | :param str directory: The path to the directory. 80 | :param Tree tree: The `Tree` instance. 81 | 82 | :returns: A `list[(Path, Tree)]` of paths and sub-`Tree`s. 83 | :rtype: `list[(Path, Tree)]` 84 | """ 85 | 86 | return [ 87 | (path, tree) 88 | for path in sorted( 89 | Path(directory).iterdir(), 90 | key=lambda path: (path.is_file(), path.name.lower()), 91 | ) 92 | ] 93 | 94 | @staticmethod 95 | def _create_directory_tree(date_dir: str, tree: Tree) -> None: 96 | """ 97 | Create the directory Tree based on the date_dir Path using iterative 98 | depth-first search. 99 | 100 | :param str date_dir: The path to the directory. 101 | :param Tree tree: The `Tree` instance. 
102 | """ 103 | 104 | build_tree_status = Status( 105 | "Displaying directory tree.", 106 | f"Building directory tree for {date_dir}.", 107 | "cyan", 108 | ) 109 | 110 | stack = DateTree._create_stack(date_dir, tree) 111 | 112 | visited = set() 113 | visited.add(Path(date_dir)) 114 | 115 | build_tree_status.start() 116 | while stack: 117 | current = stack.pop(0) 118 | current_path, current_tree = current[0], current[1] 119 | 120 | if current_path in visited: 121 | continue 122 | elif current_path.is_dir(): 123 | sub_tree = current_tree.add(f"[bold blue]{current_path.name}") 124 | sub_paths = DateTree._create_stack(current_path, sub_tree) 125 | 126 | stack = sub_paths + stack 127 | elif current_path.is_file(): 128 | file_size = current_path.stat().st_size 129 | current_tree.add(f"[bold]{current_path.name} [{decimal(file_size)}]") 130 | 131 | visited.add(current_path) 132 | 133 | build_tree_status.succeed() 134 | print() 135 | 136 | @staticmethod 137 | def display_tree(search_date: str) -> None: 138 | """ 139 | Display the scrapes directory for a specific date. 140 | 141 | :param str search_date: The date within the `scrapes/` directory to search 142 | for. 143 | """ 144 | 145 | logging.info(f"Running tree command...") 146 | logging.info("") 147 | 148 | try: 149 | search_date = DateTree._check_date_format(search_date) 150 | 151 | find_dir_halo = Halo( 152 | color="white", 153 | text=f"Searching for {search_date} directory within `scrapes`.", 154 | ) 155 | 156 | find_dir_halo.start() 157 | 158 | dir_exists = DateTree._find_date_directory(search_date) 159 | if dir_exists: 160 | find_dir_halo.succeed(text=f"URS was run on {search_date}.") 161 | 162 | date_dir = f"{Path(Path.cwd()).parents[0]}/scrapes/{search_date}" 163 | 164 | tree = Tree(f"[bold blue]scrapes/") 165 | dir_tree = tree.add(f"[bold blue]{search_date}") 166 | 167 | DateTree._create_directory_tree(date_dir, dir_tree) 168 | 169 | rich.print(tree) 170 | logging.info( 171 | f"Displayed directory tree for scrapes run on {search_date}." 172 | ) 173 | logging.info("") 174 | print() 175 | else: 176 | error_messsage = f"URS was not run on {search_date}." 177 | find_dir_halo.fail(Fore.RED + Style.BRIGHT + error_messsage) 178 | print() 179 | 180 | logging.critical(error_messsage) 181 | logging.critical("ABORTING URS.\n") 182 | 183 | quit() 184 | except TypeError: 185 | logging.critical("INVALID DATE FORMAT.") 186 | logging.critical("ABORTING URS.\n") 187 | 188 | Errors.e_title( 189 | "INVALID DATE FORMAT. ACCEPTED FORMATS: MM-DD-YYYY or MM/DD/YYYY." 190 | ) 191 | quit() 192 | -------------------------------------------------------------------------------- /urs/praw_scrapers/utils/Objectify.py: -------------------------------------------------------------------------------- 1 | """ 2 | Create Reddit objects 3 | ===================== 4 | Defining methods to create JSON serializable objects from Reddit metadata. 5 | """ 6 | 7 | 8 | from typing import Any, Dict 9 | 10 | from praw.models import Comment, Multireddit, Submission, Subreddit 11 | 12 | from urs.utils.Global import convert_time 13 | 14 | 15 | class Objectify: 16 | """ 17 | Methods for creating JSON serializable objects from Reddit metadata. 18 | """ 19 | 20 | def make_comment(self, comment: Comment, include_all: bool) -> Dict[str, Any]: 21 | """ 22 | Make a comment item. 23 | 24 | :param Comment comment: PRAW Comment object. 25 | :param bool include_all: Whether the `"type"` field should be included. 26 | 27 | :returns: A `dict[str, Any]` containing comment metadata. 
28 | :rtype: `dict[str, Any]` 29 | """ 30 | 31 | comment_object = { 32 | "author": "u/" + comment.author.name 33 | if hasattr(comment.author, "name") 34 | else "[deleted]", 35 | "body": comment.body, 36 | "body_html": comment.body_html, 37 | "created_utc": convert_time(comment.created_utc), 38 | "distinguished": comment.distinguished, 39 | "edited": comment.edited 40 | if comment.edited == False 41 | else convert_time(comment.edited), 42 | "id": comment.id, 43 | "is_submitter": comment.is_submitter, 44 | "link_id": comment.link_id, 45 | "parent_id": comment.parent_id, 46 | "score": comment.score, 47 | "stickied": comment.stickied, 48 | } 49 | 50 | if include_all: 51 | comment_object["submission"] = self.make_submission( 52 | include_all, comment.submission 53 | ) 54 | comment_object["subreddit_id"] = comment.subreddit_id 55 | comment_object["type"] = "comment" 56 | 57 | comment_object = dict(sorted(comment_object.items())) 58 | 59 | return comment_object 60 | 61 | def make_multireddit(self, multireddit: Multireddit) -> Dict[str, Any]: 62 | """ 63 | Make a multireddit item. 64 | 65 | :param Multireddit multireddit: PRAW Multireddit object. 66 | 67 | :returns: A `dict[str, Any]` containing Multireddit data. 68 | :rtype: `Dict[str, Any]` 69 | """ 70 | 71 | multireddit_object = { 72 | "can_edit": multireddit.can_edit, 73 | "copied_from": multireddit.copied_from, 74 | "created_utc": convert_time(multireddit.created_utc), 75 | "description_html": multireddit.description_html, 76 | "description_md": multireddit.description_md, 77 | "display_name": multireddit.display_name, 78 | "name": multireddit.name, 79 | "nsfw": multireddit.over_18, 80 | "subreddits": [], 81 | "visibility": multireddit.visibility, 82 | } 83 | 84 | if multireddit.subreddits: 85 | for subreddit in multireddit.subreddits: 86 | subreddit = self.make_subreddit(subreddit) 87 | multireddit_object["subreddits"].append(subreddit) 88 | 89 | return multireddit_object 90 | 91 | def make_submission( 92 | self, include_all: bool, submission: Submission 93 | ) -> Dict[str, Any]: 94 | """ 95 | Make a submission object. 96 | 97 | :param bool include_all: Whether the `"type"` field should be included. 98 | :param Submission submission: PRAW Submission object. 99 | 100 | :returns: A `dict[str, Any]` containing Submission data. 
101 | :rtype: `Dict[str, Any]` 102 | """ 103 | 104 | submission_object = { 105 | "author": "u/" + submission.author.name 106 | if hasattr(submission.author, "name") 107 | else "[deleted]", 108 | "created_utc": convert_time(submission.created_utc), 109 | "distinguished": submission.distinguished, 110 | "edited": submission.edited 111 | if submission.edited == False 112 | else convert_time(submission.edited), 113 | "id": submission.id, 114 | "is_original_content": submission.is_original_content, 115 | "is_self": submission.is_self, 116 | "link_flair_text": submission.link_flair_text, 117 | "locked": submission.locked, 118 | "name": submission.name, 119 | "nsfw": submission.over_18, 120 | "num_comments": submission.num_comments, 121 | "permalink": submission.permalink, 122 | "score": submission.score, 123 | "selftext": submission.selftext, 124 | "spoiler": submission.spoiler, 125 | "stickied": submission.stickied, 126 | "title": submission.title, 127 | "upvote_ratio": submission.upvote_ratio, 128 | "url": submission.url, 129 | } 130 | 131 | if include_all: 132 | submission_object["subreddit"] = self.make_subreddit(submission.subreddit) 133 | submission_object["type"] = "submission" 134 | 135 | submission_object = dict(sorted(submission_object.items())) 136 | 137 | return submission_object 138 | 139 | def make_subreddit(self, subreddit: Subreddit) -> Dict[str, Any]: 140 | """ 141 | Make a Subreddit object. 142 | 143 | :param Subreddit subreddit: PRAW Subreddit object. 144 | 145 | :returns: A `dict[str, Any]` containing Subreddit data. 146 | :rtype: `Dict[str, Any]` 147 | """ 148 | 149 | return { 150 | "can_assign_link_flair": subreddit.can_assign_link_flair, 151 | "can_assign_user_flair": subreddit.can_assign_user_flair, 152 | "created_utc": convert_time(subreddit.created_utc), 153 | "description": subreddit.description, 154 | "description_html": subreddit.description_html, 155 | "display_name": subreddit.display_name, 156 | "id": subreddit.id, 157 | "name": subreddit.name, 158 | "nsfw": subreddit.over18, 159 | "public_description": subreddit.public_description, 160 | "spoilers_enabled": subreddit.spoilers_enabled, 161 | "subscribers": subreddit.subscribers, 162 | "user_is_banned": subreddit.user_is_banned, 163 | "user_is_moderator": subreddit.user_is_moderator, 164 | "user_is_subscriber": subreddit.user_is_subscriber, 165 | } 166 | -------------------------------------------------------------------------------- /urs/utils/Titles.py: -------------------------------------------------------------------------------- 1 | """ 2 | Titles 3 | ====== 4 | Display ASCII art that is used throughout this program. 5 | """ 6 | 7 | 8 | from colorama import Fore, Style 9 | from prawcore import PrawcoreException 10 | 11 | 12 | class MainTitle: 13 | """ 14 | Method for printing the main URS title. 15 | """ 16 | 17 | @staticmethod 18 | def title() -> None: 19 | """ 20 | Print URS title. 21 | """ 22 | 23 | print( 24 | Fore.WHITE 25 | + Style.BRIGHT 26 | + r""" 27 | __ __ _ __ ____ 28 | /\ \/\ \/\`'__\/',__\ 29 | \ \ \_\ \ \ \//\__, `\ 30 | \ \____/\ \_\\/\____/ 31 | \/___/ \/_/ \/___/ 32 | """ 33 | ) 34 | 35 | 36 | class PRAWTitles: 37 | """ 38 | Methods for printing PRAW scraper titles. 39 | """ 40 | 41 | @staticmethod 42 | def r_title() -> None: 43 | """ 44 | Print Subreddit scraper title. 
45 | """ 46 | 47 | print( 48 | Fore.WHITE 49 | + Style.BRIGHT 50 | + r""" 51 | _ __ 52 | /\`'__\ 53 | \ \ \/ 54 | \ \_\ 55 | \/_/ 56 | """ 57 | ) 58 | 59 | @staticmethod 60 | def u_title() -> None: 61 | """ 62 | Print Redditor scraper title. 63 | """ 64 | 65 | print( 66 | Fore.WHITE 67 | + Style.BRIGHT 68 | + r""" 69 | __ __ 70 | /\ \/\ \ 71 | \ \ \_\ \ 72 | \ \____/ 73 | \/___/ 74 | """ 75 | ) 76 | 77 | @staticmethod 78 | def c_title() -> None: 79 | """ 80 | Print comments scraper title. 81 | """ 82 | 83 | print( 84 | Fore.WHITE 85 | + Style.BRIGHT 86 | + r""" 87 | ___ 88 | /'___\ 89 | /\ \__/ 90 | \ \____\ 91 | \/____/ 92 | """ 93 | ) 94 | 95 | @staticmethod 96 | def b_title() -> None: 97 | """ 98 | Print basic scraper title. 99 | """ 100 | 101 | print( 102 | Fore.WHITE 103 | + Style.BRIGHT 104 | + r""" 105 | __ 106 | /\ \ 107 | \ \ \____ 108 | \ \ '__`\ 109 | \ \ \L\ \ 110 | \ \_,__/ 111 | \/___/... Only scrapes Subreddits. 112 | """ 113 | ) 114 | 115 | @staticmethod 116 | def lr_title() -> None: 117 | """ 118 | Print Subreddit livestream title. 119 | """ 120 | 121 | print( 122 | Fore.WHITE 123 | + Style.BRIGHT 124 | + r""" 125 | ___ 126 | /\_ \ 127 | \//\ \ _ __ ⏺️ 128 | \ \ \ /\`'__\ 129 | \_\ \_\ \ \/ 130 | /\____\\ \_\ 131 | \/____/ \/_/ 132 | """ 133 | ) 134 | 135 | @staticmethod 136 | def lu_title() -> None: 137 | """ 138 | Print Redditor livestream title. 139 | """ 140 | 141 | print( 142 | Fore.WHITE 143 | + Style.BRIGHT 144 | + r""" 145 | ___ 146 | /\_ \ 147 | \//\ \ __ __⏺️ 148 | \ \ \ /\ \/\ \ 149 | \_\ \_\ \ \_\ \ 150 | /\____\\ \____/ 151 | \/____/ \/___/ 152 | """ 153 | ) 154 | 155 | 156 | class AnalyticsTitles: 157 | """ 158 | Methods for printing analytical tool titles. 159 | """ 160 | 161 | @staticmethod 162 | def f_title() -> None: 163 | """ 164 | Print frequencies title. 165 | """ 166 | 167 | print( 168 | Fore.WHITE 169 | + Style.BRIGHT 170 | + r""" 171 | ___ 172 | /'___\ 📈 173 | /\ \__/ 174 | \ \ ,__\ 175 | \ \ \_/ 176 | \ \_\ 177 | \/_/ 178 | """ 179 | ) 180 | 181 | @staticmethod 182 | def wc_title() -> None: 183 | """ 184 | Print wordcloud title. 185 | """ 186 | 187 | print( 188 | Fore.WHITE 189 | + Style.BRIGHT 190 | + r""" 191 | __ __ __ ___ 🖌️ 192 | /\ \/\ \/\ \ /'___\ 193 | \ \ \_/ \_/ \/\ \__/ 194 | \ \___x___/'\ \____\ 195 | \/__//__/ \/____/ 196 | """ 197 | ) 198 | 199 | 200 | class Errors: 201 | """ 202 | Methods for printing error titles. 203 | """ 204 | 205 | @staticmethod 206 | def e_title(invalid_message: str) -> None: 207 | """ 208 | Print error title. 209 | 210 | :param str invalid_message: The specific error message in arguments. 211 | """ 212 | 213 | print( 214 | Fore.RED 215 | + Style.BRIGHT 216 | + rf""" 217 | __ 218 | /'__`\ 219 | /\ __/ 220 | \ \____\ 221 | \/____/... {invalid_message} 222 | 223 | Please recheck args or refer to help or usage examples. 224 | """ 225 | ) 226 | 227 | @staticmethod 228 | def n_title(reddit_object: str) -> None: 229 | """ 230 | Print exiting title when there are no Reddit objects left to scrape. 231 | 232 | :param str reddit_object: The Reddit object type. 233 | """ 234 | 235 | print( 236 | Fore.RED 237 | + Style.BRIGHT 238 | + rf""" 239 | ___ 240 | /' _`\ 241 | /\ \/\ \ 242 | \ \_\ \_\ 243 | \/_/\/_/... No {reddit_object} to scrape! Aborting URS. 244 | """ 245 | ) 246 | 247 | @staticmethod 248 | def i_title(error: str) -> None: 249 | """ 250 | Print invalid file title. 251 | 252 | :param str error: The specific error associated with invalid files. 
253 | """ 254 | 255 | print( 256 | Fore.RED 257 | + Style.BRIGHT 258 | + rf""" 259 | __ 260 | /\_\ 261 | \/\ \ 262 | \ \ \ 263 | \ \_\ 264 | \/_/... {error} 265 | """ 266 | ) 267 | 268 | @staticmethod 269 | def p_title(error: PrawcoreException) -> None: 270 | """ 271 | Print PRAW error title. 272 | 273 | :param PrawcoreException error: The `PrawcoreException` raised when API 274 | validation fails. 275 | """ 276 | 277 | print( 278 | Fore.RED 279 | + Style.BRIGHT 280 | + rf""" 281 | _____ 282 | /\ '__`\ 283 | \ \ \L\ \ 284 | \ \ ,__/... Please recheck API credentials or your internet connection. 285 | \ \ \/ 286 | \ \_\ 287 | \/_/ 288 | 289 | Prawcore exception: {error} 290 | """ 291 | ) 292 | 293 | @staticmethod 294 | def l_title(reset_timestamp: str) -> None: 295 | """ 296 | Print rate limit error title. 297 | 298 | :param str reset_timestamp: The reset timestamp provided by PRAW. 299 | """ 300 | 301 | print( 302 | Fore.RED 303 | + Style.BRIGHT 304 | + rf""" 305 | __ 306 | /\ \ 307 | \ \ \ 308 | \ \ \ __ 309 | \ \ \L\ \ 310 | \ \____/ 311 | \/___/... You have reached your rate limit. 312 | 313 | Please try again when your rate limit is reset: {reset_timestamp} 314 | """ 315 | ) 316 | 317 | @staticmethod 318 | def ex_title(error: Exception) -> None: 319 | """ 320 | Print export error title. 321 | 322 | :param Exception error: The `Exception` raised while exporting scrape data. 323 | """ 324 | 325 | print( 326 | Fore.RED 327 | + Style.BRIGHT 328 | + rf""" 329 | __ 330 | /\ \ 331 | \ \ \ 332 | \ \ \ 333 | \ \_\ 334 | \/\_\ 335 | \/_/... An error has occurred while exporting scraped data. 336 | 337 | {error} 338 | """ 339 | ) 340 | -------------------------------------------------------------------------------- /tests/test_praw_scrapers/test_static_scrapers/test_Redditor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Testing `Redditor.py`. 3 | """ 4 | 5 | 6 | import os 7 | 8 | import praw 9 | from dotenv import load_dotenv 10 | 11 | from urs.praw_scrapers.static_scrapers import Redditor 12 | from urs.utils import Global 13 | 14 | 15 | class Login: 16 | """ 17 | Create a Reddit object with PRAW API credentials. 18 | """ 19 | 20 | @staticmethod 21 | def create_reddit_object(): 22 | load_dotenv() 23 | 24 | return praw.Reddit( 25 | client_id=os.getenv("CLIENT_ID"), 26 | client_secret=os.getenv("CLIENT_SECRET"), 27 | user_agent=os.getenv("USER_AGENT"), 28 | username=os.getenv("REDDIT_USERNAME"), 29 | password=os.getenv("REDDIT_PASSWORD"), 30 | ) 31 | 32 | 33 | class TestGetInteractionsMakeJsonSkeletonMethod: 34 | """ 35 | Testing GetInteractions class _make_json_skeleton() method. 36 | """ 37 | 38 | def test_make_json_skeleton(self): 39 | reddit = Login.create_reddit_object() 40 | spez = reddit.redditor("spez") 41 | 42 | test_skeleton = { 43 | "scrape_settings": {"redditor": "spez", "n_results": 1}, 44 | "data": {"information": None, "interactions": {}}, 45 | } 46 | 47 | redditor, skeleton = Redditor.GetInteractions._make_json_skeleton( 48 | 1, reddit, "spez" 49 | ) 50 | 51 | assert redditor == spez 52 | assert skeleton == test_skeleton 53 | 54 | 55 | class TestGetInteractionsGetTrophiesMethod: 56 | """ 57 | Testing GetInteractions class _get_trophies() method. 
58 | """ 59 | 60 | def test_get_trophies(self): 61 | reddit = Login.create_reddit_object() 62 | spez = reddit.redditor("spez") 63 | 64 | trophies = Redditor.GetInteractions._get_trophies(spez) 65 | 66 | assert isinstance(trophies, list) == True 67 | assert len(trophies) > 0 68 | 69 | 70 | class TestGetUserSubredditMethod: 71 | """ 72 | Testing GetInteractions class _get_user_subreddit() method. 73 | """ 74 | 75 | def test_get_user_subreddit(self): 76 | reddit = Login.create_reddit_object() 77 | spez = reddit.redditor("spez") 78 | 79 | redditor_subreddit = Redditor.GetInteractions._get_user_subreddit(spez) 80 | 81 | dict_fields = [ 82 | "can_assign_link_flair", 83 | "can_assign_user_flair", 84 | "created_utc", 85 | "description", 86 | "description_html", 87 | "display_name", 88 | "id", 89 | "name", 90 | "nsfw", 91 | "public_description", 92 | "spoilers_enabled", 93 | "subscribers", 94 | "user_is_banned", 95 | "user_is_moderator", 96 | "user_is_subscriber", 97 | ] 98 | 99 | assert isinstance(redditor_subreddit, dict) == True 100 | 101 | for key in redditor_subreddit.keys(): 102 | assert key in dict_fields 103 | 104 | 105 | class TestGetInteractionsGetUserInfoMethod: 106 | """ 107 | Testing GetInteractions class _get_user_info() method. 108 | """ 109 | 110 | def test_get_user_info(self): 111 | reddit = Login.create_reddit_object() 112 | spez = reddit.redditor("spez") 113 | 114 | skeleton = { 115 | "scrape_settings": {"redditor": "spez", "n_results": 1}, 116 | "data": {"information": None, "interactions": {}}, 117 | } 118 | 119 | Redditor.GetInteractions._get_user_info(spez, skeleton) 120 | 121 | assert skeleton["data"]["information"] != None 122 | 123 | information_fields = [ 124 | "comment_karma", 125 | "created_utc", 126 | "fullname", 127 | "has_verified_email", 128 | "icon_img", 129 | "id", 130 | "is_employee", 131 | "is_friend", 132 | "is_mod", 133 | "is_gold", 134 | "link_karma", 135 | "name", 136 | "subreddit", 137 | "trophies", 138 | ] 139 | for field in skeleton["data"]["information"].keys(): 140 | assert True if field in information_fields else False 141 | 142 | 143 | class TestGetInteractionsMakeInteractionsListsMethod: 144 | """ 145 | Testing GetInteractions class _make_interactions_lists() method. 146 | """ 147 | 148 | def test_make_interactions_lists(self): 149 | skeleton = {"data": {"interactions": {}}} 150 | 151 | Redditor.GetInteractions._make_interactions_lists(skeleton) 152 | 153 | interaction_titles = [ 154 | "comments", 155 | "controversial", 156 | "downvoted", 157 | "gilded", 158 | "gildings", 159 | "hidden", 160 | "hot", 161 | "moderated", 162 | "multireddits", 163 | "new", 164 | "saved", 165 | "submissions", 166 | "top", 167 | "upvoted", 168 | ] 169 | for field in skeleton["data"]["interactions"].keys(): 170 | assert True if field in interaction_titles else False 171 | 172 | assert skeleton["data"]["interactions"][field] == [] 173 | 174 | 175 | class TestGetInteractionsGetUserInteractionsMethod: 176 | """ 177 | Testing GetInteractions class _get_user_interactions() method. 
178 | """ 179 | 180 | def test_get_user_interactions(self): 181 | reddit = Login.create_reddit_object() 182 | spez = reddit.redditor("spez") 183 | 184 | skeleton = { 185 | "scrape_settings": {"redditor": "spez", "n_results": 1}, 186 | "data": {"information": None, "interactions": {}}, 187 | } 188 | 189 | Redditor.GetInteractions._get_user_interactions(1, spez, skeleton) 190 | 191 | assert skeleton["data"]["information"] == None 192 | assert skeleton["data"]["interactions"] 193 | 194 | 195 | class TestGetInteractionsGetMethod: 196 | """ 197 | Testing GetInteractions class get() method. 198 | """ 199 | 200 | def test_get(self): 201 | reddit = Login.create_reddit_object() 202 | spez = reddit.redditor("spez") 203 | 204 | skeleton = Redditor.GetInteractions.get(1, reddit, spez) 205 | 206 | assert skeleton["scrape_settings"]["redditor"] == "spez" 207 | assert skeleton["scrape_settings"]["n_results"] == 1 208 | 209 | assert skeleton["data"]["information"] != None 210 | 211 | assert len(skeleton["data"]["interactions"]["comments"]) 212 | assert len(skeleton["data"]["interactions"]["controversial"]) 213 | assert len(skeleton["data"]["interactions"]["gilded"]) 214 | assert skeleton["data"]["interactions"]["gildings"][0] == "FORBIDDEN" 215 | assert skeleton["data"]["interactions"]["hidden"][0] == "FORBIDDEN" 216 | assert not skeleton["data"]["interactions"]["hot"] 217 | assert len(skeleton["data"]["interactions"]["moderated"]) 218 | assert "multireddits" in skeleton["data"]["interactions"].keys() 219 | assert not skeleton["data"]["interactions"]["new"] 220 | assert skeleton["data"]["interactions"]["saved"][0] == "FORBIDDEN" 221 | assert not skeleton["data"]["interactions"]["submissions"] 222 | assert len(skeleton["data"]["interactions"]["top"]) 223 | assert skeleton["data"]["interactions"]["upvoted"][0] == "FORBIDDEN" 224 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ``` 2 | __ __ _ __ ____ 3 | /\ \/\ \/\`'__\/',__\ 4 | \ \ \_\ \ \ \//\__, `\ 5 | \ \____/\ \_\\/\____/ 6 | \/___/ \/_/ \/___/ 7 | ``` 8 | 9 | > **U**niversal **R**eddit **S**craper - A comprehensive Reddit scraping command-line tool written in Python. 10 | 11 | ![GitHub Workflow Status (Python)](https://img.shields.io/github/actions/workflow/status/JosephLai241/URS/python.yml?label=Python&logo=python&logoColor=blue) 12 | ![GitHub Workflow Status (Rust)](https://img.shields.io/github/actions/workflow/status/JosephLai241/URS/rust.yml?label=Rust&logo=rust&logoColor=orange) 13 | [![Codecov](https://img.shields.io/codecov/c/gh/JosephLai241/URS?logo=Codecov)][codecov] 14 | [![GitHub release (latest by date)](https://img.shields.io/github/v/release/JosephLai241/URS)][releases] 15 | ![Total lines](https://img.shields.io/tokei/lines/github/JosephLai241/URS) 16 | ![License](https://img.shields.io/github/license/JosephLai241/URS) 17 | 18 | # Sponsors 19 | 20 |

21 |
22 | Thordata's tools are particularly useful in scenarios that require large-scale web scraping
23 | through their Web Scraper API, API-based data extraction, or reliable Proxy infrastructure.
24 | If you plan to use Thordata's tools, you can support the project via this affiliate link.
25 |
124 | 125 | ## [Subreddit Scraping][subreddit scraping manual link] 126 | 127 | ![subreddit demo] 128 | 129 | ## [Redditor Scraping][redditor scraping manual link] 130 | 131 | ![redditor demo] 132 | 133 | ## [Submission Comments Scraping][submission comments scraping manual link] 134 | 135 | ![submission comments demo] 136 | 137 | ## [Livestreaming Reddit][livestream scraping manual link] 138 | 139 | ![livestream subreddit demo] 140 | 141 | ## [Generating Word Frequencies][frequencies scraping manual link] 142 | 143 | ![frequencies demo] 144 | 145 | ## [Generating Wordclouds][wordcloud scraping manual link] 146 | 147 | ![wordcloud demo] 148 | 149 | ## [Checking PRAW Rate Limits][check praw rate limits manual link] 150 | 151 | ![check praw rate limits demo] 152 | 153 | ## [Displaying Directory Tree][display directory tree manual link] 154 | 155 | ![display directory tree demo] 156 | 157 | [check praw rate limits demo]: https://github.com/JosephLai241/URS/blob/demo-gifs/utilities/check_rate_limit_demo.gif 158 | [check praw rate limits manual link]: https://josephlai241.github.io/URS/utilities/rate-limit-checking.html 159 | [codecov]: https://codecov.io/gh/JosephLai241/URS 160 | [contributing manual link]: https://josephlai241.github.io/URS/contributing/before-making-pull-or-feature-requests.html 161 | [display directory tree demo]: https://github.com/JosephLai241/URS/blob/demo-gifs/utilities/tree_demo.gif 162 | [display directory tree manual link]: https://josephlai241.github.io/URS/utilities/tree.html 163 | [frequencies demo]: https://github.com/JosephLai241/URS/blob/demo-gifs/analytical_tools/frequencies_generator_demo.gif 164 | [frequencies scraping manual link]: https://josephlai241.github.io/URS/analytical-tools/frequencies-and-wordclouds.html#generating-word-frequencies 165 | [livestream scraping manual link]: https://josephlai241.github.io/URS/livestreaming-reddit/general-information.html 166 | [livestream subreddit demo]: https://github.com/JosephLai241/URS/blob/demo-gifs/praw_scrapers/live_scrapers/livestream_subreddit_demo.gif 167 | [lolfilmworks]: https://github.com/lolfilmworks 168 | [mdbook]: https://github.com/rust-lang/mdBook 169 | [nomad]: https://github.com/JosephLai241/nomad 170 | [praw]: https://praw.readthedocs.io/en/stable/ 171 | [redditor demo]: https://github.com/JosephLai241/URS/blob/demo-gifs/praw_scrapers/static_scrapers/Redditor_demo.gif 172 | [redditor scraping manual link]: https://josephlai241.github.io/URS/scraping-reddit/redditor.html 173 | [releases]: https://github.com/JosephLai241/URS/releases 174 | [submission comments demo]: https://github.com/JosephLai241/URS/blob/demo-gifs/praw_scrapers/static_scrapers/submission_comments_demo.gif 175 | [submission comments scraping manual link]: https://josephlai241.github.io/URS/scraping-reddit/submission-comments.html 176 | [subreddit demo]: https://github.com/JosephLai241/URS/blob/demo-gifs/praw_scrapers/static_scrapers/Subreddit_demo.gif 177 | [subreddit scraping manual link]: https://josephlai241.github.io/URS/scraping-reddit/subreddit.html 178 | [urs manual]: https://josephlai241.github.io/URS 179 | [urs project email]: mailto:urs_project@protonmail.com 180 | [wordcloud demo]: https://github.com/JosephLai241/URS/blob/demo-gifs/analytical_tools/wordcloud_generator_demo.gif 181 | [wordcloud scraping manual link]: https://josephlai241.github.io/URS/analytical-tools/frequencies-and-wordclouds.html#generating-wordclouds 182 | -------------------------------------------------------------------------------- 
/urs/praw_scrapers/utils/Validation.py: -------------------------------------------------------------------------------- 1 | """ 2 | PRAW validation 3 | =============== 4 | Validation methods for PRAW credentials and scrapers. 5 | """ 6 | 7 | 8 | import logging 9 | from argparse import ArgumentParser 10 | from typing import Dict, List, Tuple, Union 11 | 12 | from colorama import Fore, Style 13 | from halo import Halo 14 | from praw import Reddit, models 15 | from prawcore import NotFound, PrawcoreException 16 | from prettytable import PrettyTable 17 | 18 | from urs.utils.Global import Status 19 | from urs.utils.Logger import LogError 20 | from urs.utils.Titles import Errors 21 | 22 | 23 | class Validation: 24 | """ 25 | Methods for validating PRAW credentials and Subreddits, Redditors, and URLs. 26 | """ 27 | 28 | @staticmethod 29 | @LogError.log_rate_limit 30 | def get_rate_info(reddit: Reddit) -> Dict[str, Union[str, int, None]]: 31 | """ 32 | Get user rate limit information. Quits the program if the user does not 33 | have any requests left in the current rate limit window. 34 | 35 | :param Reddit reddit: Reddit instance. 36 | 37 | :returns: PRAW rate limits. 38 | :rtype: `dict[str, str | int | None]` 39 | """ 40 | 41 | return models.Auth(_data=dict(), reddit=reddit).limits 42 | 43 | @staticmethod 44 | def print_rate_limit(reddit: Reddit) -> None: 45 | """ 46 | Print user rate limit information. This includes the number of requests 47 | remaining, a timestamp for when the rate limit counters will be reset, and 48 | the number of requests that have been made in the current rate limit window. 49 | 50 | :param Reddit reddit: Reddit instance. 51 | """ 52 | 53 | user_limits = Validation.get_rate_info(reddit) 54 | 55 | pretty_limits = PrettyTable() 56 | pretty_limits.field_names = ["Remaining Requests", "Used Requests"] 57 | pretty_limits.add_row([int(user_limits["remaining"]), int(user_limits["used"])]) 58 | 59 | pretty_limits.align = "c" 60 | 61 | print(pretty_limits) 62 | 63 | @staticmethod 64 | def validate_user(parser: ArgumentParser, reddit: Reddit) -> None: 65 | """ 66 | Check if PRAW credentials are valid, then print rate limit PrettyTable. 67 | 68 | :param ArgumentParser parser: The `ArgumentParser` object. 69 | :param Reddit reddit: Reddit instance. 70 | """ 71 | 72 | login_spinner = Halo(color="white", text="Logging in.") 73 | login_spinner.start() 74 | 75 | try: 76 | redditor = reddit.user.me() 77 | 78 | login_spinner.succeed( 79 | Style.BRIGHT + Fore.GREEN + f"Successfully logged in as u/{redditor}." 80 | ) 81 | print() 82 | 83 | Validation.print_rate_limit(reddit) 84 | 85 | logging.info(f"Successfully logged in as u/{redditor}.") 86 | logging.info("") 87 | except PrawcoreException as error: 88 | login_spinner.fail(Style.BRIGHT + Fore.RED + "Failed to log in.") 89 | 90 | Errors.p_title(error) 91 | logging.critical("LOGIN FAILED.") 92 | logging.critical(f"PRAWCORE EXCEPTION: {error}.") 93 | logging.critical("ABORTING URS.\n") 94 | parser.exit() 95 | 96 | @staticmethod 97 | def _check_subreddits( 98 | invalid: List[str], object_list: List[str], reddit: Reddit, valid: List[str] 99 | ) -> None: 100 | """ 101 | Check if Subreddits are valid. 102 | 103 | :param list[str] invalid: An empty `list[str]` to store invalid Subreddits. 104 | :param list[str] object_list: A list of Subreddits to validate. 105 | :param Reddit reddit: Reddit instance. 106 | :param list[str] valid: An empty `list[str]` to store valid Subreddits. 
107 | """ 108 | 109 | for sub in object_list: 110 | try: 111 | reddit.subreddits.search_by_name(sub, exact=True) 112 | valid.append(sub) 113 | except NotFound: 114 | invalid.append(sub) 115 | 116 | @staticmethod 117 | def _check_redditors( 118 | invalid: List[str], object_list: List[str], reddit: Reddit, valid: List[str] 119 | ) -> None: 120 | """ 121 | Check if Redditors are valid. 122 | 123 | :param list[str] invalid: An empty `list[str]` to store invalid Redditors. 124 | :param list[str] object_list: A list of Redditors to validate. 125 | :param Reddit reddit: Reddit instance. 126 | :param list[str] valid: An empty `list[str]` to store valid Redditors. 127 | """ 128 | 129 | for user in object_list: 130 | try: 131 | reddit.redditor(user).id 132 | valid.append(user) 133 | except NotFound: 134 | invalid.append(user) 135 | 136 | @staticmethod 137 | def _check_submissions( 138 | invalid: List[str], object_list: List[str], reddit: Reddit, valid: List[str] 139 | ) -> None: 140 | """ 141 | Check if submission URLs are valid. 142 | 143 | :param list[str] invalid: An empty `list[str]` to store invalid submissions. 144 | :param list[str] object_list: A list of submissions to validate. 145 | :param Reddit reddit: Reddit instance. 146 | :param list[str] valid: An empty `list[str]` to store valid submissions. 147 | """ 148 | 149 | for post in object_list: 150 | try: 151 | reddit.submission(url=post).title 152 | valid.append(post) 153 | except Exception: 154 | invalid.append(post) 155 | 156 | @staticmethod 157 | def check_existence( 158 | object_list: List[str], reddit: Reddit, scraper_type: str 159 | ) -> Tuple[List[str], List[str]]: 160 | """ 161 | Check whether Reddit objects are valid. 162 | 163 | :param list[str] object_list: A `list[str]` of Reddit objects to check. 164 | :param Reddit reddit: Reddit instance. 165 | :param str scraper_type: The scraper type. 166 | 167 | :raises NotFound: Raised if invalid Subreddits or Redditors were provided. 168 | :raises Exception: Raised if invalid submission URLs were provided 169 | 170 | :returns: A `list[str]` of invalid and valid Reddit objects 171 | :rtype: `(list[str], list[str])` 172 | """ 173 | 174 | invalid = [] 175 | valid = [] 176 | 177 | if scraper_type == "subreddit": 178 | Validation._check_subreddits(invalid, object_list, reddit, valid) 179 | elif scraper_type == "redditor": 180 | Validation._check_redditors(invalid, object_list, reddit, valid) 181 | elif scraper_type == "comments": 182 | Validation._check_submissions(invalid, object_list, reddit, valid) 183 | 184 | return invalid, valid 185 | 186 | @staticmethod 187 | def validate( 188 | object_list: List[str], reddit: Reddit, scraper_type: str 189 | ) -> Tuple[List[str], List[str]]: 190 | """ 191 | Check if Subreddit(s), Redditor(s), or submission(s) exist and catch PRAW 192 | exceptions. Log invalid Reddit objects to `urs.log` if applicable. 193 | 194 | :param list[str] object_list: A `list[str]` of Reddit objects to check. 195 | :param Reddit reddit: Reddit instance. 196 | :param str scrape_type: The scraper type. 197 | 198 | :returns: A `list[str]` of invalid and valid Reddit objects. 
199 | :rtype: `(list[str], list[str])` 200 | """ 201 | 202 | object_type = ( 203 | "submission" if scraper_type == "comments" else scraper_type.capitalize() 204 | ) 205 | 206 | check_status = Status( 207 | f"Finished {object_type} validation.", 208 | f"Validating {object_type}(s)", 209 | "white", 210 | ) 211 | 212 | check_status.start() 213 | 214 | logging.info(f"Validating {object_type}(s)...") 215 | logging.info("") 216 | 217 | invalid, valid = Validation.check_existence(object_list, reddit, scraper_type) 218 | 219 | check_status.succeed() 220 | print() 221 | 222 | if invalid: 223 | warning_message = ( 224 | f"The following {object_type}s were not found and will be skipped:" 225 | ) 226 | 227 | print(Fore.YELLOW + Style.BRIGHT + warning_message) 228 | print(Fore.YELLOW + Style.BRIGHT + "-" * len(warning_message)) 229 | print(*invalid, sep="\n") 230 | 231 | logging.warning(f"Failed to validate the following {object_type}s:") 232 | logging.warning(f"{invalid}") 233 | logging.warning("Skipping.") 234 | logging.info("") 235 | 236 | if not valid: 237 | logging.critical(f"ALL {object_type.upper()}S FAILED VALIDATION.") 238 | Errors.n_title(object_type + "s") 239 | logging.critical(f"NO {object_type.upper()}S LEFT TO SCRAPE.") 240 | logging.critical("ABORTING URS.\n") 241 | 242 | quit() 243 | 244 | return invalid, valid 245 | -------------------------------------------------------------------------------- /supplemental_docs/The Forest.md: -------------------------------------------------------------------------------- 1 | # The Forest 2 | 3 | ## Table of Contents 4 | 5 | * [Introduction](#introduction) 6 | + [Motivation](#motivation) 7 | + [Inspiration](#inspiration) 8 | * [How the Forest Works](#how-the-forest-works) 9 | + [The `CommentNode`](#the-commentnode) 10 | + [The `Forest`](#the-forest-1) 11 | + [Serializing the `Forest`](#serializing-the-forest) 12 | 13 | ## Introduction 14 | 15 | ### Motivation 16 | 17 | I am a self-taught software developer who just recently graduated from college and am currently looking for my first full-time job. I do not have a computer science degree, so I have had to teach myself a ton of concepts that I would have learned if I got the degree. A class I wish I was able to take in college is data structures and algorithms because that seems to be all the buzz when it comes to the technical interview, which I unfortunately struggle with greatly due to my lack of experience and practice. 18 | 19 | Recently (March 2021) I have been teaching myself DSA. Implementing simple examples of each topic within DSA was not so bad (I am currently working on a study guide/reference repository containing these implementations in both Python and Rust that I will make public soon), but practicing Leetcode problems was and still is a difficult process for me. I will continue to power through the struggle because my livelihood and future career depends on it, though. 20 | 21 | While it has not been a smooth journey, I have come to realize how useful DSA is and am implementing what I have learned in a real-world use case. I do not think I would have been able to figure out a solution to the structured comments scraper's prior shortcomings if I had not studied this area within computer science. I recently implemented my first [trie][trie] and was fascinated by how abstract data structures worked. 
I immediately realized I needed to use a tree data structure for the structured comments scraper in order to take it to the next level, which is the purpose of [this pull request][Pull Request]. 22 | 23 | ### Inspiration 24 | 25 | The `Forest` is named after PRAW's [`CommentForest`][CommentForest]. The `CommentForest` does not return comments in structured format, so I wrote my own implementation of it. 26 | 27 | The trie was a huge inspiration for the `Forest`. I will quickly explain my implementation of the trie node. 28 | 29 | ```python 30 | class TrieNode(): 31 | def __init__(self, char, is_word): 32 | self.char = char 33 | self.is_word = is_word 34 | self.children = dict() 35 | ``` 36 | 37 | Each node of the trie contains a character, a boolean flag indicating whether the node denotes the end of a word, and holds a dictionary filled with child nodes as values and their respective characters as keys. I could have used an array and the indices within it to emulate a dictionary, but I figured I could save some access time at the cost of extra space. 38 | 39 | Anyways, the trie implementation is very similar to how the `Forest` works. 40 | 41 | ## How the Forest Works 42 | 43 | I will strip docstring comments from the source code to keep it relatively short. 44 | 45 | ### The `CommentNode` 46 | 47 | I created a class `CommentNode` to store each comment's metadata and replies: 48 | 49 | ```python 50 | class CommentNode(): 51 | def __init__(self, metadata): 52 | for key, value in metadata.items(): 53 | self.__setattr__(key, value) 54 | 55 | self.replies = [] 56 | ``` 57 | 58 | I used `__setattr__()` because the root node defers from the standard comment node schema. By using `__setattr__()`, `CommentNode` attributes will be dynamically set based on the `metadata` dictionary that has been passed in. `self.replies` holds additional `CommentNode`s. 59 | 60 | ### The `Forest` 61 | 62 | Next, I created a class `Forest` which holds the root node and includes methods for insertion. 63 | 64 | **The Root Node** 65 | 66 | First, let's go over the root node. 67 | 68 | ```python 69 | class Forest(): 70 | def __init__(self): 71 | self.root = CommentNode({ "id": "abc123" }) 72 | ``` 73 | 74 | The only key in the dictionary passed into `CommentNode` is `id`, therefore the root `CommentNode` will only contain the attributes `self.id` and `self.replies`. A mock submission ID is shown. The actual source code will pull the submission's ID based on the URL that was passed into the `-c` flag and set the `id` value accordingly. 75 | 76 | Before I get to the insertion methods, I will explain how comments and their replies are linked. 77 | 78 | **How PRAW Comments Are Linked** 79 | 80 | PRAW returns all submission comments by level order. This means all top levels are returned first, followed by all second-level replies, then third, so on and so forth. 81 | 82 | I will create some mock comment objects to demonstrate. Here is a top level comment corresponding to the mock submisssion ID. Note the `parent_id` contains the submission's `id`, which is stored in `self.root.id`: 83 | 84 | ```json 85 | { 86 | "author": "u/asdfasdfasdfasdf", 87 | "body": "A top level comment here.", 88 | "created_utc": "06-06-2006 06:06:06", 89 | "distinguished": null, 90 | "edited": false, 91 | "id": "qwerty1", 92 | "is_submitter": false, 93 | "link_id": "t3_asdfgh", 94 | "parent_id": "t3_abc123", 95 | "score": 666, 96 | "stickied": false 97 | } 98 | ``` 99 | 100 | Here is a second-level reply to the top comment. 
Note the `parent_id` contains the top comment's `id`:
101 | 
102 | ```json
103 | {
104 |     "author": "u/hjklhjklhjklhjkl",
105 |     "body": "A reply here.",
106 |     "created_utc": "06-06-2006 18:06:06",
107 |     "distinguished": null,
108 |     "edited": false,
109 |     "id": "hjkl234",
110 |     "is_submitter": true,
111 |     "link_id": "t3_1a2b3c",
112 |     "parent_id": "t1_qwerty1",
113 |     "score": 6,
114 |     "stickied": false
115 | }
116 | ```
117 | 
118 | This pattern continues all the way down to the last level of comments. It is now very easy to link the correct comments together. I do this by calling `split("_", 1)` on the `parent_id` and then getting the second item in the split list to compare values. I also specify the `maxsplit` parameter to force one split.
119 | 
120 | **The Insertion Methods**
121 | 
122 | I then defined the methods for `CommentNode` insertion.
123 | 
124 | ```python
125 | def _dfs_insert(self, new_comment):
126 |     stack = []
127 |     stack.append(self.root)
128 | 
129 |     visited = set()
130 |     visited.add(self.root)
131 | 
132 |     found = False
133 |     while not found:
134 |         current_comment = stack.pop(0)
135 | 
136 |         for reply in current_comment.replies:
137 |             if new_comment.parent_id.split("_", 1)[1] == reply.id:
138 |                 reply.replies.append(new_comment)
139 |                 found = True
140 |             else:
141 |                 if reply not in visited:
142 |                     stack.insert(0, reply)
143 |                     visited.add(reply)
144 | 
145 | def seed(self, new_comment):
146 |     parent_id = new_comment.parent_id.split("_", 1)[1]
147 | 
148 |     self.root.replies.append(new_comment) \
149 |         if parent_id == getattr(self.root, "id") \
150 |         else self._dfs_insert(new_comment)
151 | ```
152 | 
153 | I implemented the [depth-first search][Depth-First Search] algorithm to find a comment's parent node and insert it into the parent node's `replies` array. I defined a separate `visited` set to keep track of visited `CommentNode`s to avoid an infinite loop of inserting `CommentNode`s that were already visited into the `stack`. At first I wrote a recursive version of depth-first search, but then opted for an iterative version because recursion would not scale well for submissions that included large amounts of comments, i.e., it could trigger a stack overflow.
154 | 
155 | Within the `seed` method, I first check if the `CommentNode` is a top level comment by comparing its parent ID to the submission ID. Depth-first search is triggered if the `CommentNode` is not a top level comment.
156 | 
157 | ### Serializing the `Forest`
158 | 
159 | Since Python's built-in JSON module can only handle primitive types that have a direct JSON equivalent, a custom encoder is necessary to convert the `Forest` into JSON format. I defined this in `Export.py`.
160 | 
161 | ```python
162 | from json import JSONEncoder
163 | 
164 | class EncodeNode(JSONEncoder):
165 |     def default(self, object):
166 |         return object.__dict__
167 | ```
168 | 
169 | The `default()` method overrides `JSONEncoder`'s `default()` method and serializes the `CommentNode` by converting it into a dictionary, which is a primitive type that has a direct JSON equivalent:
170 | 
171 | ```python
172 | EncodeNode().encode(CommentNode)
173 | ```
174 | 
175 | This ensures the node is correctly encoded before I call the `seed()` method to insert a new `CommentNode` into the `replies` arrays of its respective parent `CommentNode`.
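To make the encoding step concrete, here is a minimal, self-contained sketch that reuses the `CommentNode`, `Forest`, and `EncodeNode` classes shown above. The mock IDs come from the earlier examples, and the abbreviated metadata is illustrative only; this snippet is not part of the URS source:

```python
# Mock nodes reusing the IDs from the earlier examples (metadata trimmed for brevity).
top_level = CommentNode({"id": "qwerty1", "parent_id": "t3_abc123", "body": "A top level comment here."})
reply = CommentNode({"id": "hjkl234", "parent_id": "t1_qwerty1", "body": "A reply here."})

forest = Forest()       # The root node's id is "abc123".
forest.seed(top_level)  # Parent is the submission, so it is appended to root.replies.
forest.seed(reply)      # Parent is "qwerty1", so depth-first search finds the top level comment.

# default() is applied recursively, so nested replies are converted to dictionaries as well.
print(EncodeNode().encode(forest.root))
```

The output is a single JSON object whose `replies` array contains the top level comment, which in turn nests the reply, mirroring the structured format described above.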
176 | 
177 | I can then use this custom `JSONEncoder` subclass while exporting by specifying it within `json.dump()` with the `cls` kwarg:
178 | 
179 | ```python
180 | with open(filename, "w", encoding = "utf-8") as results:
181 |     json.dump(data, results, indent = 4, cls = EncodeNode)
182 | ```
183 | 
184 | This was how the structured comments export was implemented. Refer to the source code located in `urs/praw_scrapers/Comments.py` to see more. I hope this was somewhat interesting and/or informative. Thanks for reading!
185 | 
186 | 
187 | [Pull Request]: https://github.com/JosephLai241/URS/pull/24
188 | 
189 | [CommentForest]: https://praw.readthedocs.io/en/latest/code_overview/other/commentforest.html
190 | [trie]: https://www.interviewcake.com/concept/java/trie
191 | [Depth-First Search]: https://www.interviewcake.com/concept/java/dfs
192 | 
--------------------------------------------------------------------------------
/taisun/comments.rs:
--------------------------------------------------------------------------------
1 | //! This module provides computational functions pertaining to submission comments.
2 | 
3 | use pyo3::{
4 |     exceptions::PyValueError,
5 |     prelude::*,
6 |     types::{PyBool, PyDict, PyString},
7 | };
8 | use serde::{Deserialize, Serialize};
9 | 
10 | use std::collections::{HashSet, VecDeque};
11 | 
12 | /// An enum used for the `edited` field in the `CommentNode`. The `edited` field may be a `bool`
13 | /// (`False`) indicating the comment was not edited, or a `String` representing the date of the
14 | /// change.
15 | #[derive(Clone, Debug, Deserialize, Eq, Hash, PartialEq, Serialize)]
16 | #[serde(untagged)]
17 | pub enum BoolOrDate {
18 |     /// Hold a boolean type value.
19 |     Bool(bool),
20 |     /// Hold a string type value.
21 |     Str(String),
22 | }
23 | 
24 | impl ToPyObject for BoolOrDate {
25 |     /// Convert either the Rust `bool` or `String` into a Python `bool` or `str`.
26 |     fn to_object(&self, py: Python<'_>) -> PyObject {
27 |         match self {
28 |             Self::Bool(boolean) => PyBool::new(py, *boolean).into(),
29 |             Self::Str(string) => PyString::new(py, string).into(),
30 |         }
31 |     }
32 | }
33 | 
34 | /// A node object that contains comment metadata for the comment `Forest`.
35 | #[derive(Clone, Debug, Deserialize, Eq, Hash, PartialEq, Serialize)]
36 | #[pyclass]
37 | pub struct CommentNode {
38 |     /// This comment's author.
39 |     pub author: String,
40 |     /// The body of the comment, as Markdown.
41 |     pub body: String,
42 |     /// The body of the comment, as HTML.
43 |     pub body_html: String,
44 |     /// The comment's created UTC timestamp.
45 |     pub created_utc: String,
46 |     /// Whether the comment is distinguished.
47 |     pub distinguished: Option<String>,
48 |     /// Whether the comment has been edited. This is set to a UTC timestamp if it has been
49 |     /// edited.
50 |     pub edited: BoolOrDate,
51 |     /// The comment's ID.
52 |     pub id: String,
53 |     /// Whether the comment author is also the author of the submission (OP).
54 |     pub is_submitter: bool,
55 |     /// The submission ID that the comment belongs to.
56 |     pub link_id: String,
57 |     /// The comment's parent ID.
58 |     pub parent_id: String,
59 |     /// The comment's score.
60 |     pub score: i32,
61 |     /// Whether the comment is stickied.
62 |     pub stickied: bool,
63 |     /// The comment's replies.
64 |     #[serde(skip)]
65 |     pub replies: Vec<CommentNode>,
66 | }
67 | 
68 | #[pymethods]
69 | impl CommentNode {
70 |     /// Create a new `CommentNode`.
71 |     #[new]
72 |     fn new(comment_data: String) -> PyResult<Self> {
73 |         serde_json::from_str(&comment_data).map_or_else(
74 |             |error| {
75 |                 Err(PyValueError::new_err(format!(
76 |                     "Could not deserialize comment data to the CommentNode struct! {}",
77 |                     error
78 |                 )))
79 |             },
80 |             Ok,
81 |         )
82 |     }
83 | 
84 |     /// Return this `CommentNode` in a Python `dict`. This overrides the built-in Python `__dict__`
85 |     /// dunder method.
86 |     #[getter]
87 |     fn __dict__(&self, py: Python) -> PyResult<PyObject> {
88 |         let dict = PyDict::new(py);
89 | 
90 |         dict.set_item("author", self.author.clone())?;
91 |         dict.set_item("body", self.body.clone())?;
92 |         dict.set_item("body_html", self.body_html.clone())?;
93 |         dict.set_item("created_utc", self.created_utc.clone())?;
94 |         dict.set_item("distinguished", self.distinguished.clone())?;
95 |         dict.set_item("edited", self.edited.clone())?;
96 |         dict.set_item("id", self.id.clone())?;
97 |         dict.set_item("is_submitter", self.is_submitter)?;
98 |         dict.set_item("link_id", self.link_id.clone())?;
99 |         dict.set_item("parent_id", self.parent_id.clone())?;
100 |         dict.set_item("score", self.score)?;
101 |         dict.set_item("stickied", self.stickied)?;
102 |         dict.set_item("replies", self.replies.clone())?;
103 | 
104 |         Ok(dict.into())
105 |     }
106 | 
107 |     /// Get this `CommentNode`'s `replies`.
108 |     #[getter]
109 |     fn replies(&self) -> Vec<CommentNode> {
110 |         self.replies.clone()
111 |     }
112 | }
113 | 
114 | impl ToPyObject for CommentNode {
115 |     /// Convert the `CommentNode` into a Python `Object`.
116 |     fn to_object(&self, py: Python<'_>) -> PyObject {
117 |         let dict = PyDict::new(py);
118 | 
119 |         dict.set_item("author", self.author.clone())
120 |             .expect("Could not set the author attribute in the PyObject!");
121 |         dict.set_item("body", self.body.clone())
122 |             .expect("Could not set the body attribute in the PyObject!");
123 |         dict.set_item("body_html", self.body_html.clone())
124 |             .expect("Could not set the body_html attribute in the PyObject!");
125 |         dict.set_item("created_utc", self.created_utc.clone())
126 |             .expect("Could not set the created_utc attribute in the PyObject!");
127 |         dict.set_item("distinguished", self.distinguished.clone())
128 |             .expect("Could not set the distinguished attribute in the PyObject!");
129 |         dict.set_item("edited", self.edited.clone())
130 |             .expect("Could not set the edited attribute in the PyObject!");
131 |         dict.set_item("id", self.id.clone())
132 |             .expect("Could not set the id attribute in the PyObject!");
133 |         dict.set_item("is_submitter", self.is_submitter)
134 |             .expect("Could not set the is_submitter attribute in the PyObject!");
135 |         dict.set_item("link_id", self.link_id.clone())
136 |             .expect("Could not set the link_id attribute in the PyObject!");
137 |         dict.set_item("parent_id", self.parent_id.clone())
138 |             .expect("Could not set the parent_id attribute in the PyObject!");
139 |         dict.set_item("score", self.score)
140 |             .expect("Could not set the score attribute in the PyObject!");
141 |         dict.set_item("stickied", self.stickied)
142 |             .expect("Could not set the stickied attribute in the PyObject!");
143 |         dict.set_item("replies", self.replies.clone())
144 |             .expect("Could not set the replies attribute in the PyObject!");
145 | 
146 |         dict.into()
147 |     }
148 | }
149 | 
150 | /// The comment `Forest` - a data structure that resembles comment threads as seen on Reddit.
151 | #[derive(Debug, Deserialize, Serialize)]
152 | #[pyclass]
153 | pub struct Forest {
154 |     /// The root of the forest.
155 |     pub root: CommentNode,
156 | }
157 | 
158 | #[pymethods]
159 | impl Forest {
160 |     /// Create a new `Forest`.
161 |     #[new]
162 |     fn new(submission_id: String) -> PyResult<Self> {
163 |         let root = CommentNode {
164 |             author: "".to_string(),
165 |             body: "".to_string(),
166 |             body_html: "".to_string(),
167 |             created_utc: "".to_string(),
168 |             distinguished: None,
169 |             edited: BoolOrDate::Bool(false),
170 |             id: submission_id,
171 |             is_submitter: true,
172 |             link_id: "".to_string(),
173 |             parent_id: "".to_string(),
174 |             score: 0,
175 |             stickied: false,
176 |             replies: vec![],
177 |         };
178 | 
179 |         Ok(Self { root })
180 |     }
181 | 
182 |     /// An iterative implementation of depth-first search that inserts a new comment into the
183 |     /// `Forest`.
184 |     fn _dfs_insert(&mut self, new_comment: CommentNode) {
185 |         let root_id = &self.root.id.clone();
186 | 
187 |         let mut stack: VecDeque<&mut CommentNode> = VecDeque::new();
188 |         stack.push_front(&mut self.root);
189 | 
190 |         let mut visited: HashSet<String> = HashSet::new();
191 |         visited.insert(root_id.to_string());
192 | 
193 |         let target_id = &new_comment
194 |             .parent_id
195 |             .split('_')
196 |             .last()
197 |             .unwrap_or(&new_comment.parent_id)
198 |             .to_string();
199 | 
200 |         let mut found = false;
201 | 
202 |         while !found {
203 |             if let Some(comment_node) = stack.pop_front() {
204 |                 for reply in comment_node.replies.iter_mut() {
205 |                     if target_id == &reply.id {
206 |                         reply.replies.push(new_comment.clone());
207 |                         found = true;
208 |                     } else {
209 |                         let child_id = reply.id.clone();
210 | 
211 |                         if !visited.contains(child_id.as_str()) {
212 |                             stack.push_front(reply);
213 |                             visited.insert(child_id);
214 |                         }
215 |                     }
216 |                 }
217 |             }
218 |         }
219 |     }
220 | 
221 |     /// Plant a new comment in the `Forest`.
222 |     fn seed_comment(&mut self, new_comment: CommentNode) {
223 |         let parent_id = &new_comment
224 |             .parent_id
225 |             .split('_')
226 |             .last()
227 |             .unwrap_or(&new_comment.parent_id)
228 |             .to_string();
229 | 
230 |         if parent_id == &self.root.id {
231 |             self.root.replies.push(new_comment);
232 |         } else {
233 |             self._dfs_insert(new_comment);
234 |         }
235 |     }
236 | 
237 |     /// Return an array of `CommentNode`s in the form of a `String`. This enables
238 |     /// Python to `json.loads()` this string to convert the `Forest` into a Python
239 |     /// native type.
240 |     #[getter]
241 |     fn comments(&self) -> String {
242 |         serde_json::to_string(&self.root.replies).unwrap_or("None".to_string())
243 |     }
244 | 
245 |     /// Returns the `root` of the `Forest`.
246 |     #[getter]
247 |     fn root(&self) -> CommentNode {
248 |         self.root.clone()
249 |     }
250 | }
251 | 
--------------------------------------------------------------------------------
/manual/src/implementation-details/the-forest.md:
--------------------------------------------------------------------------------
1 | # The Forest
2 | 
3 | **Created:** March 17, 2021
4 | 
5 | > This Python code has been deprecated as of `URS v3.4.0` and has been rewritten in Rust. However, the concepts discussed in this document as well as the implementation are still applicable to the Rust rewrite.
6 | >
7 | > See [Speeding Up Python with Rust](./speeding-up-python-with-rust.md) for details on how I rewrote this code in Rust and how it yielded drastic performance improvements if you are interested in learning more.
8 | 9 | # Table of Contents 10 | 11 | - [Introduction](#introduction) 12 | - [Motivation](#motivation) 13 | - [Inspiration](#inspiration) 14 | - [How the Forest Works](#how-the-forest-works) 15 | - [The `CommentNode`](#the-commentnode) 16 | - [The `Forest`](#the-forest-1) 17 | - [The Root Node](#the-root-node) 18 | - [How `PRAW` Comments Are Linked](#how-praw-comments-are-linked) 19 | - [The Insertion Methods](#the-insertion-methods) 20 | - [Serializing the `Forest`](#serializing-the-forest) 21 | 22 | # Introduction 23 | 24 | ## Motivation 25 | 26 | I am a self-taught software developer who just recently graduated from college and am currently looking for my first full-time job. I do not have a computer science degree, so I have had to teach myself a ton of concepts that I would have learned if I got the degree. A class I wish I was able to take in college is data structures and algorithms because that seems to be all the buzz when it comes to the technical interview, which I unfortunately struggle with greatly due to my lack of experience and practice. 27 | 28 | Recently (March 2021) I have been teaching myself DSA. Implementing simple examples of each topic within DSA was not so bad (I am currently working on a study guide/reference repository containing these implementations in both Python and Rust that I will make public soon), but practicing Leetcode problems was and still is a difficult process for me. I will continue to power through the struggle because my livelihood and future career depends on it, though. 29 | 30 | While it has not been a smooth journey, I have come to realize how useful DSA is and am implementing what I have learned in a real-world use case. I do not think I would have been able to figure out a solution to the structured comments scraper's prior shortcomings if I had not studied this area within computer science. I recently implemented my first [trie][trie] and was fascinated by how abstract data structures worked. I immediately realized I needed to use a tree data structure for the structured comments scraper in order to take it to the next level, which is the purpose of [this pull request][pull request]. 31 | 32 | ## Inspiration 33 | 34 | The `Forest` is named after `PRAW`'s [`CommentForest`][commentforest]. The `CommentForest` does not return comments in structured format, so I wrote my own implementation of it. 35 | 36 | The trie was a huge inspiration for the `Forest`. I will quickly explain my implementation of the trie node. 37 | 38 | ```python 39 | class TrieNode(): 40 | def __init__(self, char, is_word): 41 | self.char = char 42 | self.is_word = is_word 43 | self.children = dict() 44 | ``` 45 | 46 | Each node of the trie contains a character, a boolean flag indicating whether the node denotes the end of a word, and holds a dictionary filled with child nodes as values and their respective characters as keys. I could have used an array and the indices within it to emulate a dictionary, but I figured I could save some access time at the cost of extra space. 47 | 48 | Anyways, the trie implementation is very similar to how the `Forest` works. 
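To make that structure concrete, here is a minimal sketch of inserting a word into a trie built from the `TrieNode` class above. The `insert_word()` helper is purely illustrative and is not part of the URS source:

```python
def insert_word(root, word):
    # Walk the children dictionaries, creating nodes as needed, and mark the
    # final node as the end of a word.
    current = root

    for char in word:
        if char not in current.children:
            current.children[char] = TrieNode(char, False)

        current = current.children[char]

    current.is_word = True

root = TrieNode("", False)
insert_word(root, "urs")

# The characters chain through the children dictionaries: "u" -> "r" -> "s".
print(root.children["u"].children["r"].children["s"].is_word)  # True
```

Each lookup only follows one dictionary key per character, which is the access-time advantage mentioned above.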
49 | 
50 | # How the Forest Works
51 | 
52 | ## The `CommentNode`
53 | 
54 | I created a class `CommentNode` to store each comment's metadata and replies:
55 | 
56 | ```python
57 | class CommentNode():
58 |     def __init__(self, metadata):
59 |         for key, value in metadata.items():
60 |             self.__setattr__(key, value)
61 | 
62 |         self.replies = []
63 | ```
64 | 
65 | I used `__setattr__()` because the root node differs from the standard comment node schema. By using `__setattr__()`, `CommentNode` attributes will be dynamically set based on the `metadata` dictionary that has been passed in. `self.replies` holds additional `CommentNode`s.
66 | 
67 | ## The `Forest`
68 | 
69 | Next, I created a class `Forest` which holds the root node and includes methods for insertion.
70 | 
71 | ### The Root Node
72 | 
73 | First, let's go over the root node.
74 | 
75 | ```python
76 | class Forest():
77 |     def __init__(self):
78 |         self.root = CommentNode({ "id": "abc123" })
79 | ```
80 | 
81 | The only key in the dictionary passed into `CommentNode` is `id`; therefore, the root `CommentNode` will only contain the attributes `self.id` and `self.replies`. A mock submission ID is shown. The actual source code will pull the submission's ID based on the URL that was passed into the `-c` flag and set the `id` value accordingly.
82 | 
83 | Before I get to the insertion methods, I will explain how comments and their replies are linked.
84 | 
85 | ### How `PRAW` Comments Are Linked
86 | 
87 | `PRAW` returns all submission comments by level order. This means all top level comments are returned first, followed by all second-level replies, then third, so on and so forth.
88 | 
89 | I will create some mock comment objects to demonstrate. Here is a top level comment corresponding to the mock submission ID. Note the `parent_id` contains the submission's `id`, which is stored in `self.root.id`:
90 | 
91 | ```json
92 | {
93 |     "author": "u/asdfasdfasdfasdf",
94 |     "body": "A top level comment here.",
95 |     "created_utc": "06-06-2006 06:06:06",
96 |     "distinguished": null,
97 |     "edited": false,
98 |     "id": "qwerty1",
99 |     "is_submitter": false,
100 |     "link_id": "t3_asdfgh",
101 |     "parent_id": "t3_abc123",
102 |     "score": 666,
103 |     "stickied": false
104 | }
105 | ```
106 | 
107 | Here is a second-level reply to the top comment. Note the `parent_id` contains the top comment's `id`:
108 | 
109 | ```json
110 | {
111 |     "author": "u/hjklhjklhjklhjkl",
112 |     "body": "A reply here.",
113 |     "created_utc": "06-06-2006 18:06:06",
114 |     "distinguished": null,
115 |     "edited": false,
116 |     "id": "hjkl234",
117 |     "is_submitter": true,
118 |     "link_id": "t3_1a2b3c",
119 |     "parent_id": "t1_qwerty1",
120 |     "score": 6,
121 |     "stickied": false
122 | }
123 | ```
124 | 
125 | This pattern continues all the way down to the last level of comments. It is now very easy to link the correct comments together. I do this by calling `split("_", 1)` on the `parent_id` and then getting the second item in the split list to compare values. I also specify the `maxsplit` parameter to force one split.
126 | 
127 | ### The Insertion Methods
128 | 
129 | I then defined the methods for `CommentNode` insertion.
130 | 
131 | ```python
132 | def _dfs_insert(self, new_comment):
133 |     stack = []
134 |     stack.append(self.root)
135 | 
136 |     visited = set()
137 |     visited.add(self.root)
138 | 
139 |     found = False
140 |     while not found:
141 |         current_comment = stack.pop(0)
142 | 
143 |         for reply in current_comment.replies:
144 |             if new_comment.parent_id.split("_", 1)[1] == reply.id:
145 |                 reply.replies.append(new_comment)
146 |                 found = True
147 |             else:
148 |                 if reply not in visited:
149 |                     stack.insert(0, reply)
150 |                     visited.add(reply)
151 | 
152 | def seed(self, new_comment):
153 |     parent_id = new_comment.parent_id.split("_", 1)[1]
154 | 
155 |     self.root.replies.append(new_comment) \
156 |         if parent_id == getattr(self.root, "id") \
157 |         else self._dfs_insert(new_comment)
158 | ```
159 | 
160 | I implemented the [depth-first search][depth-first search] algorithm to find a comment's parent node and insert it into the parent node's `replies` array. I defined a separate `visited` set to keep track of visited `CommentNode`s to avoid an infinite loop of inserting `CommentNode`s that were already visited into the `stack`. At first I wrote a recursive version of depth-first search, but then opted for an iterative version because recursion would not scale well for submissions that included large amounts of comments, i.e., it could trigger a stack overflow.
161 | 
162 | Within the `seed` method, I first check if the `CommentNode` is a top level comment by comparing its parent ID to the submission ID. Depth-first search is triggered if the `CommentNode` is not a top level comment.
163 | 
164 | ## Serializing the `Forest`
165 | 
166 | Since Python's built-in JSON module can only handle primitive types that have a direct JSON equivalent, a custom encoder is necessary to convert the `Forest` into JSON format. I defined this in `Export.py`.
167 | 
168 | ```python
169 | from json import JSONEncoder
170 | 
171 | class EncodeNode(JSONEncoder):
172 |     def default(self, object):
173 |         return object.__dict__
174 | ```
175 | 
176 | The `default()` method overrides `JSONEncoder`'s `default()` method and serializes the `CommentNode` by converting it into a dictionary, which is a primitive type that has a direct JSON equivalent:
177 | 
178 | ```python
179 | EncodeNode().encode(CommentNode)
180 | ```
181 | 
182 | This ensures the node is correctly encoded before I call the `seed()` method to insert a new `CommentNode` into the `replies` arrays of its respective parent `CommentNode`.
183 | 
184 | I can then use this custom `JSONEncoder` subclass while exporting by specifying it within `json.dump()` with the `cls` kwarg:
185 | 
186 | ```python
187 | with open(filename, "w", encoding = "utf-8") as results:
188 |     json.dump(data, results, indent = 4, cls = EncodeNode)
189 | ```
190 | 
191 | This was how the structured comments export was implemented. Refer to the source code located in `urs/praw_scrapers/Comments.py` to see more. I hope this was somewhat interesting and/or informative. Thanks for reading!
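One final quick illustration of the linking step described in [How `PRAW` Comments Are Linked](#how-praw-comments-are-linked), using the mock IDs from the earlier examples (this snippet is illustrative only and is not part of the URS source):

```python
# "t3_" prefixes submission fullnames and "t1_" prefixes comment fullnames, so splitting
# once on "_" and taking the second item yields the bare ID used for comparison.
print("t3_abc123".split("_", 1)[1])   # abc123  -> matches the root (submission) ID
print("t1_qwerty1".split("_", 1)[1])  # qwerty1 -> matches the top level comment's ID
```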
192 | 193 | 194 | 195 | [pull request]: https://github.com/JosephLai241/URS/pull/24 196 | [commentforest]: https://praw.readthedocs.io/en/latest/code_overview/other/commentforest.html 197 | [trie]: https://www.interviewcake.com/concept/java/trie 198 | [depth-first search]: https://www.interviewcake.com/concept/java/dfs 199 | -------------------------------------------------------------------------------- /tests/test_praw_scrapers/test_live_scrapers/test_Livestream.py: -------------------------------------------------------------------------------- 1 | """ 2 | Testing `Livestream.py`. 3 | """ 4 | 5 | 6 | import argparse 7 | import os 8 | import types 9 | 10 | import praw 11 | from dotenv import load_dotenv 12 | 13 | from urs.praw_scrapers.live_scrapers import Livestream 14 | from urs.utils.Global import date 15 | 16 | 17 | class MakeArgs: 18 | """ 19 | Making dummy args to test Comments.py methods. 20 | """ 21 | 22 | @staticmethod 23 | def parser_for_testing(): 24 | parser = argparse.ArgumentParser() 25 | return parser 26 | 27 | @staticmethod 28 | def make_scraper_args(): 29 | parser = MakeArgs.parser_for_testing() 30 | parser.add_argument("--live-subreddit") 31 | parser.add_argument("--live-redditor") 32 | parser.add_argument("--stream-submissions", action="store_true") 33 | 34 | return parser 35 | 36 | 37 | class Login: 38 | """ 39 | Create a Reddit object with PRAW API credentials. 40 | """ 41 | 42 | @staticmethod 43 | def create_reddit_object(): 44 | load_dotenv() 45 | 46 | return praw.Reddit( 47 | client_id=os.getenv("CLIENT_ID"), 48 | client_secret=os.getenv("CLIENT_SECRET"), 49 | user_agent=os.getenv("USER_AGENT"), 50 | username=os.getenv("REDDIT_USERNAME"), 51 | password=os.getenv("REDDIT_PASSWORD"), 52 | ) 53 | 54 | 55 | class TestSaveStreamCreateSkeletonMethod: 56 | """ 57 | Testing SaveStream class _create_skeleton() method. 
58 | """ 59 | 60 | def test_create_skeleton_method_live_subreddit_default_streaming_comments_args( 61 | self, 62 | ): 63 | parser = MakeArgs.make_scraper_args() 64 | args = parser.parse_args("--live-subreddit askreddit".split()) 65 | 66 | skeleton = Livestream.SaveStream._create_skeleton(args) 67 | 68 | assert skeleton["livestream_settings"]["subreddit"] == "askreddit" 69 | assert skeleton["livestream_settings"]["included_reddit_objects"] == "comments" 70 | assert skeleton["data"] == [] 71 | 72 | def test_create_skeleton_method_live_subreddit_streaming_submissions_args(self): 73 | parser = MakeArgs.make_scraper_args() 74 | args = parser.parse_args( 75 | "--live-subreddit askreddit --stream-submissions".split() 76 | ) 77 | 78 | skeleton = Livestream.SaveStream._create_skeleton(args) 79 | 80 | assert skeleton["livestream_settings"]["subreddit"] == "askreddit" 81 | assert ( 82 | skeleton["livestream_settings"]["included_reddit_objects"] == "submissions" 83 | ) 84 | assert skeleton["data"] == [] 85 | 86 | def test_create_skeleton_method_live_redditor_default_streaming_comments_args(self): 87 | parser = MakeArgs.make_scraper_args() 88 | args = parser.parse_args("--live-redditor spez".split()) 89 | 90 | skeleton = Livestream.SaveStream._create_skeleton(args) 91 | 92 | assert skeleton["livestream_settings"]["redditor"] == "spez" 93 | assert skeleton["livestream_settings"]["included_reddit_objects"] == "comments" 94 | assert skeleton["data"] == [] 95 | 96 | def test_create_skeleton_method_live_redditor_streaming_submissions_args(self): 97 | parser = MakeArgs.make_scraper_args() 98 | args = parser.parse_args("--live-redditor spez --stream-submissions".split()) 99 | 100 | skeleton = Livestream.SaveStream._create_skeleton(args) 101 | 102 | assert skeleton["livestream_settings"]["redditor"] == "spez" 103 | assert ( 104 | skeleton["livestream_settings"]["included_reddit_objects"] == "submissions" 105 | ) 106 | assert skeleton["data"] == [] 107 | 108 | 109 | class TestSaveStreamMakeLivestreamDirMethod: 110 | """ 111 | Testing SaveStream class _make_livestream_dir() method. 112 | """ 113 | 114 | def test_make_livestream_dir_method_subreddits_subdirectory(self): 115 | test_split_stream_info = ["r"] 116 | 117 | stream_directory = Livestream.SaveStream._make_livestream_dir( 118 | test_split_stream_info 119 | ) 120 | 121 | assert stream_directory == f"../scrapes/{date}/livestream/subreddits" 122 | 123 | def test_make_livestream_dir_method_redditors_subdirectory(self): 124 | test_split_stream_info = ["u"] 125 | 126 | stream_directory = Livestream.SaveStream._make_livestream_dir( 127 | test_split_stream_info 128 | ) 129 | 130 | assert stream_directory == f"../scrapes/{date}/livestream/redditors" 131 | 132 | 133 | class TestSaveStreamGetTempFilenameMethod: 134 | """ 135 | Testing SaveStream class _get_temp_filename() method. 
136 | """ 137 | 138 | def test_get_temp_filename_method_with_subreddit(self): 139 | test_stream_info = "in r/askreddit" 140 | 141 | stream_path = Livestream.SaveStream._get_temp_filename(test_stream_info) 142 | 143 | assert stream_path == f"../scrapes/{date}/livestream/subreddits/askreddit.json" 144 | 145 | def test_get_temp_filename_method_with_redditor(self): 146 | test_stream_info = "by u/spez" 147 | 148 | stream_path = Livestream.SaveStream._get_temp_filename(test_stream_info) 149 | 150 | assert stream_path == f"../scrapes/{date}/livestream/redditors/spez.json" 151 | 152 | 153 | class TestSaveStreamCreateTempFileMethod: 154 | """ 155 | Testing SaveStream class _create_temp_file() method. 156 | """ 157 | 158 | def test_create_temp_file_method(self): 159 | test_skeleton = {"test": 1} 160 | test_stream_path = "../scrapes/livestream/subreddits/askreddit.json" 161 | 162 | if not os.path.isdir("../scrapes/livestream/subreddits"): 163 | os.makedirs("../scrapes/livestream/subreddits") 164 | 165 | Livestream.SaveStream._create_temp_file(test_skeleton, test_stream_path) 166 | 167 | assert os.path.isfile(test_stream_path) 168 | 169 | 170 | class TestSaveStreamRenameMethod: 171 | """ 172 | Testing SaveStream class _rename() method. 173 | """ 174 | 175 | def test_rename_method_with_subreddit(self): 176 | test_duration = "00:00:15" 177 | test_object_info = "comments" 178 | test_start_stream = "18:06:06" 179 | test_stream_path = f"../scrapes/{date}/livestream/subreddits/askreddit.json" 180 | 181 | with open(test_stream_path, "w", encoding="utf-8") as _: 182 | pass 183 | 184 | Livestream.SaveStream._rename( 185 | test_duration, test_object_info, test_start_stream, test_stream_path 186 | ) 187 | 188 | renamed_file = f"../scrapes/{date}/livestream/subreddits/askreddit-comments-18_06_06-00_00_15.json" 189 | 190 | assert os.path.isfile(renamed_file) 191 | 192 | def test_rename_method_with_redditor(self): 193 | test_duration = "00:00:15" 194 | test_object_info = "submissions" 195 | test_start_stream = "18:06:06" 196 | test_stream_path = f"../scrapes/{date}/livestream/redditors/spez.json" 197 | 198 | with open(test_stream_path, "w", encoding="utf-8") as _: 199 | pass 200 | 201 | Livestream.SaveStream._rename( 202 | test_duration, test_object_info, test_start_stream, test_stream_path 203 | ) 204 | 205 | renamed_file = f"../scrapes/{date}/livestream/redditors/spez-submissions-18_06_06-00_00_15.json" 206 | 207 | assert os.path.isfile(renamed_file) 208 | 209 | 210 | class TestSaveStreamWriteMethod: 211 | """ 212 | Testing SaveStream class write() method. 213 | """ 214 | 215 | def test_write_method(self): 216 | pass 217 | 218 | 219 | class TestLivestreamSetInfoAndObjectMethod: 220 | """ 221 | Testing Livestream class _set_info_and_object() method. 
222 | """ 223 | 224 | def test_set_info_and_object_live_subreddit(self): 225 | reddit = Login.create_reddit_object() 226 | 227 | parser = MakeArgs.make_scraper_args() 228 | args = parser.parse_args("--live-subreddit askreddit".split()) 229 | 230 | reddit_object, stream_info = Livestream.Livestream._set_info_and_object( 231 | args, reddit 232 | ) 233 | 234 | assert isinstance(reddit_object, praw.models.Subreddit) 235 | assert stream_info == "in r/askreddit" 236 | 237 | def test_set_info_and_object_live_redditor(self): 238 | reddit = Login.create_reddit_object() 239 | 240 | parser = MakeArgs.make_scraper_args() 241 | args = parser.parse_args("--live-redditor spez".split()) 242 | 243 | reddit_object, stream_info = Livestream.Livestream._set_info_and_object( 244 | args, reddit 245 | ) 246 | 247 | assert isinstance(reddit_object, praw.models.Redditor) 248 | assert stream_info == "by u/spez" 249 | 250 | 251 | class TestLivestreamStreamSwitchMethod: 252 | """ 253 | Testing Livestream class _stream_switch() method. 254 | """ 255 | 256 | def test_stream_switch_method_default_stream_comments(self): 257 | reddit = Login.create_reddit_object() 258 | subreddit = reddit.subreddit("askreddit") 259 | 260 | parser = MakeArgs.make_scraper_args() 261 | args = parser.parse_args("--live-subreddit askreddit".split()) 262 | 263 | generator, object_info = Livestream.Livestream._stream_switch(args, subreddit) 264 | 265 | assert isinstance(generator, types.GeneratorType) 266 | assert object_info == "comments" 267 | 268 | def test_stream_switch_method_stream_submissions(self): 269 | reddit = Login.create_reddit_object() 270 | subreddit = reddit.subreddit("askreddit") 271 | 272 | parser = MakeArgs.make_scraper_args() 273 | args = parser.parse_args( 274 | "--live-subreddit askreddit --stream-submissions".split() 275 | ) 276 | 277 | generator, object_info = Livestream.Livestream._stream_switch(args, subreddit) 278 | 279 | assert isinstance(generator, types.GeneratorType) 280 | assert object_info == "submissions" 281 | 282 | 283 | class TestLivestreamNoSaveStreamMethod: 284 | """ 285 | Testing livestream class _no_save_stream() method. 286 | """ 287 | 288 | def test_no_save_stream_method(self): 289 | pass 290 | 291 | 292 | class TestLivestreamStreamMethod: 293 | """ 294 | Testing Livestream class stream() method. 295 | """ 296 | 297 | def test_stream_method_live_subreddit(self): 298 | pass 299 | 300 | def test_stream_method_live_redditor(self): 301 | pass 302 | -------------------------------------------------------------------------------- /urs/praw_scrapers/static_scrapers/Comments.py: -------------------------------------------------------------------------------- 1 | """ 2 | Submission comments scraper 3 | =========================== 4 | Defining methods for the submission comments scraper. 
5 | """ 6 | 7 | 8 | import json 9 | import logging 10 | from argparse import Namespace 11 | from typing import Any, Dict, List 12 | 13 | from colorama import Fore, Style 14 | from halo import Halo 15 | from praw import Reddit 16 | from praw.models import Submission 17 | from rich.progress import ( 18 | BarColumn, 19 | MofNCompleteColumn, 20 | Progress, 21 | RenderableColumn, 22 | SpinnerColumn, 23 | TextColumn, 24 | TimeRemainingColumn, 25 | ) 26 | from taisun.comments_utils import CommentNode, Forest 27 | 28 | from urs.praw_scrapers.utils.Objectify import Objectify 29 | from urs.praw_scrapers.utils.Validation import Validation 30 | from urs.utils.Cli import GetPRAWScrapeSettings 31 | from urs.utils.Export import Export, NameFile 32 | from urs.utils.Global import Status, convert_time, make_none_dict 33 | from urs.utils.Logger import LogExport, LogPRAWScraper 34 | from urs.utils.Titles import PRAWTitles 35 | 36 | 37 | class SortComments: 38 | """ 39 | Methods for sorting comments depending on which style of comments was 40 | specified (raw or structured). 41 | """ 42 | 43 | @staticmethod 44 | def sort_raw(all_comments: List[Dict[str, Any]], submission: Submission) -> None: 45 | """ 46 | Sort all comments in raw format. 47 | 48 | :param list[dict[str, Any]] all_comments: A `list[dict[str, Any]]` containing 49 | all comments within a submission. 50 | :param Submission submission: PRAW `Submission` object. 51 | """ 52 | 53 | for comment in submission.comments.list(): 54 | all_comments.append(Objectify().make_comment(comment, False)) 55 | 56 | @staticmethod 57 | def sort_structured(submission: Submission, url: str) -> List[Dict[str, Any]]: 58 | """ 59 | Sort all comments in structured format. 60 | 61 | :param Submission submission: PRAW `Submission` object. 62 | :param str url: The submission's URL. 63 | 64 | :returns: A `list[dict[str, Any]]` containing `CommentNode`s in `dict` 65 | form. 66 | :rtype: `list[dict[str, Any]]` 67 | """ 68 | 69 | renderable_column = RenderableColumn(renderable="|") 70 | spinner_column = SpinnerColumn(spinner_name="noise") 71 | text_column = TextColumn("Seeding Forest") 72 | 73 | progress_bar = Progress( 74 | spinner_column, 75 | text_column, 76 | BarColumn(), 77 | MofNCompleteColumn(), 78 | renderable_column, 79 | TimeRemainingColumn(), 80 | ) 81 | 82 | forest = Forest(submission.id_from_url(url)) 83 | 84 | with progress_bar: 85 | for comment in progress_bar.track(submission.comments.list()): 86 | comment_node = CommentNode( 87 | json.dumps((Objectify().make_comment(comment, False))) 88 | ) 89 | 90 | forest.seed_comment(comment_node) 91 | 92 | return forest.root.replies 93 | 94 | 95 | class GetSort: 96 | """ 97 | Methods for getting comments from a Reddit submission. 98 | """ 99 | 100 | def __init__(self, args: Namespace, submission: Submission, url: str) -> None: 101 | """ 102 | Initialize variables used in later methods: 103 | 104 | :param Namespace args: A `Namespace` object containing all arguments used 105 | in the CLI. 106 | :param Submission submission: PRAW `Submission` object. 107 | :param str url: The submission's URL. 108 | """ 109 | 110 | self._args = args 111 | self._url = url 112 | 113 | more_comments_status = Status( 114 | "Finished resolving instances of MoreComments.", 115 | Fore.CYAN 116 | + Style.BRIGHT 117 | + "Resolving instances of MoreComments. This may take a while. 
Please wait.", 118 | "cyan", 119 | ) 120 | 121 | more_comments_status.start() 122 | self._submission = submission 123 | self._submission.comments.replace_more(limit=None) 124 | more_comments_status.succeed() 125 | 126 | def get_sort(self, args: Namespace, limit: str) -> List[Dict[str, Any]]: 127 | """ 128 | Get comments from posts. 129 | 130 | :param Namespace args: A `Namespace` object containing all arguments used 131 | in the CLI. 132 | :param str limit: A `str` indicating the number of results to return. 133 | 134 | :returns: A `list[dict[str, Any]]` containing all comments within a submission. 135 | :rtype: `list[dict[str, Any]]` 136 | """ 137 | 138 | if args.raw: 139 | all_comments = [] 140 | SortComments().sort_raw(all_comments, self._submission) 141 | else: 142 | all_comments = SortComments().sort_structured(self._submission, self._url) 143 | 144 | return all_comments[: int(limit)] if int(limit) != 0 else all_comments 145 | 146 | 147 | class Write: 148 | """ 149 | Methods for writing scraped comments to CSV or JSON. 150 | """ 151 | 152 | @staticmethod 153 | def _make_json_skeleton( 154 | args: Namespace, limit: str, submission: Submission, url: str 155 | ) -> Dict[str, Dict[str, Any]]: 156 | """ 157 | Create a skeleton for JSON export. Include scrape details at the top. 158 | 159 | :param Namespace args: A `Namespace` object containing all arguments used 160 | in the CLI. 161 | :param str limit: A `str` indicating the number of results to return. 162 | :param Submission submission: PRAW `Submission` object. 163 | :param str url: The submission's URL. 164 | 165 | :returns: A `dict[str, dict[str, Any]]` containing scrape settings and 166 | all scrape data. 167 | :rtype: `dict[str, dict[str, Any]]` 168 | """ 169 | 170 | metadata_status = Status( 171 | "Extracted submission metadata.", "Extracting submission metadata.", "white" 172 | ) 173 | 174 | metadata_status.start() 175 | skeleton = { 176 | "scrape_settings": { 177 | "n_results": int(limit) if int(limit) > 0 else "all", 178 | "style": "structured" if not args.raw else "raw", 179 | "url": url, 180 | }, 181 | "data": { 182 | "submission_metadata": { 183 | "author": "u/" + submission.author.name 184 | if hasattr(submission.author, "name") 185 | else "[deleted]", 186 | "created_utc": convert_time(submission.created_utc), 187 | "distinguished": submission.distinguished, 188 | "edited": submission.edited 189 | if submission.edited == False 190 | else convert_time(submission.edited), 191 | "is_original_content": submission.is_original_content, 192 | "is_self": submission.is_self, 193 | "link_flair_text": submission.link_flair_text, 194 | "locked": submission.locked, 195 | "nsfw": submission.over_18, 196 | "num_comments": submission.num_comments, 197 | "permalink": submission.permalink, 198 | "score": submission.score, 199 | "selftext": submission.selftext, 200 | "spoiler": submission.spoiler, 201 | "stickied": submission.stickied, 202 | "subreddit": submission.subreddit.display_name, 203 | "title": submission.title, 204 | "upvote_ratio": submission.upvote_ratio, 205 | }, 206 | "comments": None, 207 | }, 208 | } 209 | 210 | try: 211 | skeleton["data"]["submission_metadata"][ 212 | "gallery_data" 213 | ] = submission.gallery_data 214 | skeleton["data"]["submission_metadata"][ 215 | "media_metadata" 216 | ] = submission.media_metadata 217 | 218 | skeleton["data"]["submission_metadata"] = dict( 219 | sorted(skeleton["data"]["submission_metadata"].items()) 220 | ) 221 | except AttributeError: 222 | pass 223 | 224 | metadata_status.succeed() 
225 | 226 | return skeleton 227 | 228 | @staticmethod 229 | def _determine_export(args: Namespace, data: Dict[str, Any], f_name: str) -> None: 230 | """ 231 | Export either structured or raw comments. 232 | 233 | :param Namespace args: A `Namespace` object containing all arguments used 234 | in the CLI. 235 | :param dict[str, Any] data: A `dict[str, Any]` containing all scraped data. 236 | :param str f_name: The filename. 237 | """ 238 | 239 | if args.raw: 240 | export_status = f"Exporting {data['scrape_settings']['n_results']} comments in raw format." 241 | Halo().info(export_status) 242 | logging.info(export_status) 243 | Export.export(data, f_name, "json", "comments") 244 | else: 245 | export_status = f"Exporting {data['scrape_settings']['n_results']} comments in structured format." 246 | Halo().info(export_status) 247 | logging.info(export_status) 248 | Export.write_structured_comments(data, f_name) 249 | 250 | @staticmethod 251 | def write(args: Namespace, c_master: Dict[str, Any], reddit: Reddit): 252 | """ 253 | Get, sort, then write scraped comments to CSV or JSON. 254 | 255 | :param Namespace args: A `Namespace` object containing all arguments used 256 | in the CLI. 257 | :param dict[str, Any] data: A `dict[str, Any]` containing all scraped data. 258 | :param Reddit reddit: PRAW Reddit instance. 259 | """ 260 | 261 | for url, limit in c_master.items(): 262 | submission = reddit.submission(url=url) 263 | data = Write._make_json_skeleton(args, limit, submission, url) 264 | data["data"]["comments"] = GetSort(args, submission, url).get_sort( 265 | args, limit 266 | ) 267 | 268 | f_name = NameFile().c_fname(args, limit, submission.title) 269 | Write._determine_export(args, data, f_name) 270 | 271 | print() 272 | Halo( 273 | color="green", 274 | text=Style.BRIGHT 275 | + Fore.GREEN 276 | + f"JSON file for '{submission.title}' comments created.", 277 | ).succeed() 278 | print() 279 | 280 | 281 | class RunComments: 282 | """ 283 | Run the comments scraper. 284 | """ 285 | 286 | @staticmethod 287 | @LogExport.log_export 288 | @LogPRAWScraper.scraper_timer("comments") 289 | def run(args: Namespace, reddit: Reddit) -> Dict[str, Any]: 290 | """ 291 | Run comments scraper. 292 | 293 | :param Namespace args: A `Namespace` object containing all arguments used 294 | in the CLI. 295 | :param Reddit reddit: PRAW Reddit instance. 296 | 297 | :returns: A `dict[str, Any]` containing all submission comments scrape 298 | settings. 299 | :rtype: `dict[str, Any]` 300 | """ 301 | 302 | PRAWTitles.c_title() 303 | 304 | post_list = GetPRAWScrapeSettings().create_list(args, "comments") 305 | not_posts, posts = Validation.validate(post_list, reddit, "comments") 306 | c_master = make_none_dict(posts) 307 | GetPRAWScrapeSettings().get_settings(args, not_posts, c_master, "comments") 308 | 309 | Write.write(args, c_master, reddit) 310 | 311 | return c_master 312 | --------------------------------------------------------------------------------