├── tests ├── __init__.py ├── test_utils │ ├── __init__.py │ ├── test_DirInit.py │ ├── test_Tools.py │ ├── test_Global.py │ └── test_Utilities.py ├── test_analytics │ ├── __init__.py │ ├── test_utils │ │ ├── __init__.py │ │ └── test_PrepData.py │ ├── test_Wordcloud.py │ └── test_Frequencies.py ├── test_praw_scrapers │ ├── __init__.py │ ├── test_utils │ │ └── __init__.py │ ├── test_live_scrapers │ │ ├── __init__.py │ │ ├── test_utils │ │ │ ├── __init__.py │ │ │ └── test_StreamGenerator.py │ │ └── test_Livestream.py │ └── test_static_scrapers │ │ ├── __init__.py │ │ ├── test_Basic.py │ │ └── test_Redditor.py ├── test_pushshift_scrapers │ └── __init__.py └── conftest.py ├── urs ├── __init__.py ├── utils │ ├── __init__.py │ ├── DirInit.py │ ├── Global.py │ ├── Tools.py │ ├── Utilities.py │ └── Titles.py ├── analytics │ ├── __init__.py │ ├── utils │ │ └── __init__.py │ ├── Wordcloud.py │ └── Frequencies.py ├── praw_scrapers │ ├── __init__.py │ ├── utils │ │ ├── __init__.py │ │ ├── Objectify.py │ │ └── Validation.py │ ├── live_scrapers │ │ ├── __init__.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── StreamGenerator.py │ │ │ └── DisplayStream.py │ └── static_scrapers │ │ ├── __init__.py │ │ └── Comments.py ├── Version.py └── Urs.py ├── manual ├── .gitignore ├── book.toml └── src │ ├── contributing │ ├── building-on-top-of-urs.md │ ├── making-pull-or-feature-requests.md │ └── before-making-pull-or-feature-requests.md │ ├── utilities │ ├── rate-limit-checking.md │ └── tree.md │ ├── derivative-projects.md │ ├── installation.md │ ├── introduction.md │ ├── additional-information │ ├── 2fa-information.md │ └── error-messages.md │ ├── credentials.md │ ├── scraping-reddit │ ├── scrape-speeds-and-rate-limits.md │ ├── all-attributes-table.md │ ├── submission-comments.md │ ├── subreddit.md │ └── redditor.md │ ├── contributors.md │ ├── README.md │ ├── SUMMARY.md │ ├── livestreaming-reddit │ ├── livestreaming-subreddits-and-redditors.md │ └── general-information.md │ ├── analytical-tools │ ├── frequencies-and-wordclouds.md │ └── general-information.md │ ├── exporting.md │ └── implementation-details │ └── the-forest.md ├── .github ├── FUNDING.yml ├── workflows │ ├── manual.yml │ ├── rust.yml │ └── python.yml ├── ISSUE_TEMPLATE │ ├── FEATURE_REQUEST.md │ └── BUG_REPORT.md ├── CODE_OF_CONDUCT.md ├── STYLE_GUIDE.md └── PULL_REQUEST_TEMPLATE.md ├── poetry.toml ├── .gitignore ├── rustfmt.toml ├── .env ├── Cargo.toml ├── pyproject.toml ├── taisun ├── lib.rs └── comments.rs ├── LICENSE ├── README.md └── supplemental_docs └── The Forest.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /urs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /urs/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /manual/.gitignore: -------------------------------------------------------------------------------- 1 | book 2 | -------------------------------------------------------------------------------- /tests/test_utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /urs/analytics/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_analytics/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /urs/analytics/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /urs/praw_scrapers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_praw_scrapers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /urs/praw_scrapers/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_analytics/test_utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_pushshift_scrapers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /urs/praw_scrapers/live_scrapers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: JosephLai241 2 | -------------------------------------------------------------------------------- /tests/test_praw_scrapers/test_utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /urs/praw_scrapers/static_scrapers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /poetry.toml: -------------------------------------------------------------------------------- 1 | [virtualenvs] 2 | in-project = true 3 | -------------------------------------------------------------------------------- /urs/praw_scrapers/live_scrapers/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_praw_scrapers/test_live_scrapers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_praw_scrapers/test_static_scrapers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_praw_scrapers/test_live_scrapers/test_utils/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | 3 | scrapes/* 4 | target/* 5 | 6 | Cargo.lock 7 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | group_imports = "StdExternalCrate" 2 | imports_granularity = "Crate" 3 | -------------------------------------------------------------------------------- /urs/Version.py: -------------------------------------------------------------------------------- 1 | """ 2 | Version 3 | ======= 4 | Defining the version number in one place. 5 | """ 6 | 7 | __version__ = "3.4.0" 8 | -------------------------------------------------------------------------------- /manual/book.toml: -------------------------------------------------------------------------------- 1 | [book] 2 | authors = ["Joseph Lai"] 3 | language = "en" 4 | multilingual = false 5 | src = "src" 6 | title = "URS User Guide" 7 | -------------------------------------------------------------------------------- /.env: -------------------------------------------------------------------------------- 1 | # PRAW Credentials 2 | 3 | # Personal use script (14 characters) 4 | CLIENT_ID="14_CHAR_HERE" 5 | 6 | # Secret key (27 characters) 7 | CLIENT_SECRET="27_CHAR_HERE" 8 | 9 | # App name 10 | USER_AGENT="APP_NAME_HERE" 11 | 12 | # Reddit username 13 | REDDIT_USERNAME="REDDIT_USERNAME_HERE" 14 | 15 | # Reddit password 16 | REDDIT_PASSWORD="REDDIT_PASSWORD_HERE" 17 | -------------------------------------------------------------------------------- /manual/src/contributing/building-on-top-of-urs.md: -------------------------------------------------------------------------------- 1 | # Building on Top of `URS` 2 | 3 | Although I will not approve requests that deviate from the project scope, feel free to reach out if you have built something on top of `URS` or have made modifications to scrape something specific on Reddit. I will add your project to the [Derivative Projects](../derivative-projects.md) section! 4 | -------------------------------------------------------------------------------- /manual/src/utilities/rate-limit-checking.md: -------------------------------------------------------------------------------- 1 | # Check PRAW Rate Limits 2 | 3 | ![Check PRAW Rate Limits Demo GIF][check praw rate limits demo] 4 | 5 | You can quickly check the rate limits for your account by using this flag. 6 | 7 | ``` 8 | poetry run Urs.py --check 9 | ``` 10 | 11 | [check praw rate limits demo]: https://github.com/JosephLai241/URS/blob/demo-gifs/utilities/check_rate_limit_demo.gif?raw=true 12 | -------------------------------------------------------------------------------- /manual/src/derivative-projects.md: -------------------------------------------------------------------------------- 1 | # Derivative Projects 2 | 3 | This is a showcase for projects that are built on top of URS! 4 | 5 | ## [skiwheelr/URS][skiwheelr project link] 6 | 7 | ![skiwheelr project output screenshot][skiwheelr screenshot] 8 | 9 | Contains a bash script built on URS which counts ticker mentions in Subreddits, subsequently cURLs all the relevant links in parallel, and counts the mentions of those. 
10 | 11 | [skiwheelr project link]: https://github.com/skiwheelr/URS 12 | [skiwheelr screenshot]: https://i.imgur.com/ChHdAZv.png 13 | -------------------------------------------------------------------------------- /tests/test_utils/test_DirInit.py: -------------------------------------------------------------------------------- 1 | """ 2 | Testing `DirInit.py`. 3 | """ 4 | 5 | 6 | import os 7 | 8 | from urs.utils.DirInit import InitializeDirectory 9 | 10 | 11 | class TestInitializeDirectoryCreateDirsMethod: 12 | """ 13 | Testing InitializeDirectory class create_dirs() method. 14 | """ 15 | 16 | def test_create_dirs_method(self): 17 | test_path = "../scrapes/test_dir/another_test_dir/a_final_dir" 18 | 19 | InitializeDirectory.create_dirs(test_path) 20 | 21 | assert True if os.path.isdir(test_path) else False 22 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | authors = ["Joseph Lai "] 3 | description = "The heavy lifter for URS" 4 | edition = "2021" 5 | homepage = "https://github.com/JosephLai241/URS" 6 | license = "MIT" 7 | name = "taisun" 8 | repository = "https://github.com/JosephLai241/URS" 9 | version = "1.0.0" 10 | 11 | [lib] 12 | crate-type = ["cdylib"] 13 | name = "taisun" 14 | path = "taisun/lib.rs" 15 | 16 | [dependencies] 17 | pyo3 = { version = "0.17.3", features = ["extension-module", "serde"] } 18 | serde = { version = "1.0.148", features = ["derive"] } 19 | serde_json = "1.0.89" 20 | -------------------------------------------------------------------------------- /manual/src/contributing/making-pull-or-feature-requests.md: -------------------------------------------------------------------------------- 1 | # Making Pull or Feature Requests 2 | 3 | You can suggest new features or changes by going to the [Issues tab][issues] and fill out the Feature Request template. If there is a good reason for a new feature, I will consider adding it. 4 | 5 | You are also more than welcome to create a pull request -- adding additional features, improving runtime, or refactoring existing code. If it is approved, I will merge the pull request into the master branch and credit you for contributing to this project. 6 | 7 | [issues]: https://github.com/JosephLai241/URS/issues 8 | -------------------------------------------------------------------------------- /manual/src/installation.md: -------------------------------------------------------------------------------- 1 | > **_NOTE:_ Requires Python 3.11+ and [Poetry][poetry installation page] installed on your system.** 2 | 3 | Run the following commands to install `URS`: 4 | 5 | ``` 6 | git clone --depth=1 https://github.com/JosephLai241/URS.git 7 | cd URS 8 | poetry install 9 | poetry shell 10 | maturin develop --release 11 | ``` 12 | 13 | > **_TIP:_** If `poetry shell` does not activate the virtual environment created by `Poetry`, run the following command to activate it: 14 | > 15 | > ``` 16 | > source .venv/bin/activate 17 | > ``` 18 | 19 | [poetry installation page]: https://python-poetry.org/docs/#installation 20 | -------------------------------------------------------------------------------- /urs/utils/DirInit.py: -------------------------------------------------------------------------------- 1 | """ 2 | Initialize directories 3 | ====================== 4 | Initialize directories in which scraped or analytical data is stored. 
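
A minimal usage sketch, where the target path is only an example; `create_dirs()`
creates every missing level in the path::

    from urs.utils.DirInit import InitializeDirectory

    InitializeDirectory.create_dirs("../scrapes/2021-06-22/subreddits")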
5 | """ 6 | 7 | 8 | import os 9 | 10 | 11 | class InitializeDirectory: 12 | """ 13 | Methods for initializing directories for the exported files. 14 | """ 15 | 16 | @staticmethod 17 | def create_dirs(path: str) -> None: 18 | """ 19 | Make directories for scrape files. 20 | 21 | :param str path: The path to the directories in which scrape files are 22 | saved. 23 | """ 24 | 25 | if not os.path.isdir(path): 26 | os.makedirs(path) 27 | -------------------------------------------------------------------------------- /manual/src/introduction.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | This is a comprehensive Reddit scraping tool that integrates multiple features: 4 | 5 | - Scrape Reddit via [`PRAW`][praw] (the official Python Reddit API Wrapper) 6 | - Scrape Subreddits 7 | - Scrape Redditors 8 | - Scrape submission comments 9 | - Livestream Reddit via `PRAW` 10 | - Livestream comments submitted within Subreddits or by Redditors 11 | - Livestream submissions submitted within Subreddits or by Redditors 12 | - Analytical tools for scraped data 13 | - Generate frequencies for words that are found in submission titles, bodies, and/or comments 14 | - Generate a wordcloud from scrape results 15 | 16 | [praw]: https://pypi.org/project/praw/ 17 | -------------------------------------------------------------------------------- /.github/workflows/manual.yml: -------------------------------------------------------------------------------- 1 | name: Deploy Manual 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | 8 | jobs: 9 | deploy: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - uses: actions/checkout@v3 14 | with: 15 | fetch-depth: 0 16 | 17 | - name: Setup mdBook 18 | uses: peaceiris/actions-mdbook@v1 19 | with: 20 | mdbook-version: "latest" 21 | 22 | - name: Build manual 23 | run: mdbook build 24 | working-directory: manual 25 | 26 | - name: Deploy to GitHub Pages 27 | uses: JamesIves/github-pages-deploy-action@v4.2.5 28 | with: 29 | branch: gh-pages 30 | folder: manual/book 31 | -------------------------------------------------------------------------------- /manual/src/contributing/before-making-pull-or-feature-requests.md: -------------------------------------------------------------------------------- 1 | # Before Making Pull or Feature Requests 2 | 3 | Consider the scope of this project before submitting a pull or feature request. `URS` stands for Universal Reddit Scraper. Two important aspects are listed in its name - _universal_ and _scraper_. 4 | 5 | I will not approve feature or pull requests that deviate from its sole purpose. This may include scraping a specific aspect of Reddit or [adding functionality that allows you to post a comment with `URS`][commenting feature request]. Adding either of these requests will no longer allow `URS` to be universal or merely a scraper. However, I am more than happy to approve requests that enhance the current scraping capabilities of `URS`. 6 | 7 | [commenting feature request]: https://github.com/JosephLai241/URS/issues/17 8 | -------------------------------------------------------------------------------- /manual/src/additional-information/2fa-information.md: -------------------------------------------------------------------------------- 1 | # Two-Factor Authentication 2 | 3 | If you choose to use 2FA with your Reddit account, enter your password followed by a colon and then your 2FA token in the `password` field on line 26. 
For example, if your password is `"p4ssw0rd"` and your 2FA token is `"123456"`, you will enter `"p4ssw0rd:123456"` in the `password` field. 4 | 5 | **2FA is NOT recommended for use with this program.** This is because PRAW will raise an OAuthException after one hour, prompting you to refresh your 2FA token and re-enter your credentials. Additionally, this means your 2FA token would be stored alongside your Reddit username and password, which would defeat the purpose of enabling 2FA in the first place. See [here](https://praw.readthedocs.io/en/latest/getting_started/authentication.html#two-factor-authentication) for more information. 6 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "urs" 3 | version = "3.4.0" 4 | description = "URS (Universal Reddit Scraper): A comprehensive Reddit scraping and OSINT command-line tool" 5 | authors = ["Joseph Lai "] 6 | license = "MIT" 7 | readme = "README.md" 8 | 9 | [tool.poetry.dependencies] 10 | python = "^3.11" 11 | colorama = "^0.4.6" 12 | halo = "^0.0.31" 13 | praw = "^7.7.0" 14 | prettytable = "^3.7.0" 15 | python-dotenv = "^1.0.0" 16 | rich = "^13.3.5" 17 | wordcloud = "^1.9.1.1" 18 | 19 | [tool.poetry.group.dev.dependencies] 20 | black = "^23.3.0" 21 | coverage = "^7.2.4" 22 | isort = "^5.12.0" 23 | maturin = "^0.14.17" 24 | pytest = "^7.3.1" 25 | pytest-cov = "^4.0.0" 26 | 27 | [tool.maturin] 28 | features = ["pyo3/extension-module"] 29 | python-source = "urs" 30 | 31 | [build-system] 32 | requires = ["maturin>=0.14,<0.15", "poetry-core"] 33 | build-backend = "maturin" 34 | -------------------------------------------------------------------------------- /.github/workflows/rust.yml: -------------------------------------------------------------------------------- 1 | name: Rust code checks 2 | 3 | on: 4 | pull_request: 5 | branches-ignore: 6 | - "demo-gifs" 7 | - "gh-pages" 8 | - "rust-demo" 9 | - "samples" 10 | 11 | jobs: 12 | rustfmt: 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v3 17 | 18 | - uses: dtolnay/rust-toolchain@stable 19 | 20 | - name: Rustfmt check 21 | uses: mbrobbel/rustfmt-check@master 22 | with: 23 | token: ${{ secrets.GITHUB_TOKEN }} 24 | 25 | clippy: 26 | runs-on: ubuntu-latest 27 | 28 | steps: 29 | - uses: actions/checkout@v3 30 | 31 | - name: Add clippy via rustup 32 | run: rustup component add clippy 33 | 34 | - name: Clippy check 35 | uses: actions-rs/clippy-check@v1 36 | with: 37 | token: ${{ secrets.GITHUB_TOKEN }} 38 | args: --all-features 39 | -------------------------------------------------------------------------------- /taisun/lib.rs: -------------------------------------------------------------------------------- 1 | //! `taisun` - The heavy lifter for `URS`. 2 | 3 | use pyo3::{prelude::*, types::PyDict}; 4 | 5 | use comments::{CommentNode, Forest}; 6 | 7 | mod comments; 8 | 9 | /// This module contains utilities for submission comments scraping. 10 | #[pymodule] 11 | fn comments_utils(_python: Python, module: &PyModule) -> PyResult<()> { 12 | module.add_class::()?; 13 | module.add_class::()?; 14 | 15 | Ok(()) 16 | } 17 | 18 | /// `taisun` - The heavy lifter for `URS`. 
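///
/// A hypothetical usage sketch from the Python side, assuming the extension has
/// already been built into the active virtual environment with
/// `maturin develop --release`:
///
/// ```python
/// from taisun.comments_utils import CommentNode, Forest
/// ```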
19 | #[pymodule] 20 | fn taisun(python: Python, module: &PyModule) -> PyResult<()> { 21 | let comments_utils = pyo3::wrap_pymodule!(comments_utils); 22 | module.add_wrapped(comments_utils)?; 23 | 24 | let sys = PyModule::import(python, "sys")?; 25 | let sys_modules: &PyDict = sys.getattr("modules")?.downcast()?; 26 | sys_modules.set_item("taisun.comments_utils", module.getattr("comments_utils")?)?; 27 | 28 | Ok(()) 29 | } 30 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | """ 2 | Cleanup scripts that are run after tests are done. 3 | """ 4 | 5 | 6 | from pathlib import Path 7 | 8 | import pytest 9 | 10 | 11 | def remove_directories(directory): 12 | """ 13 | Recursively remove directories created by `pytest`. 14 | 15 | Parameters 16 | ---------- 17 | directory: Path 18 | 19 | Returns 20 | ------- 21 | None 22 | """ 23 | 24 | directory = Path(directory) 25 | for item in directory.iterdir(): 26 | remove_directories(item) if item.is_dir() else item.unlink() 27 | 28 | directory.rmdir() 29 | 30 | 31 | @pytest.hookimpl(trylast=True) 32 | def pytest_sessionfinish(): 33 | """ 34 | Clean up after `pytest` is done running tests. 35 | """ 36 | 37 | print("\nCleaning up tests...") 38 | 39 | try: 40 | remove_directories(Path("../scrapes")) 41 | print("Done.") 42 | except Exception as e: 43 | print("An error has occurred: %s" % e) 44 | -------------------------------------------------------------------------------- /tests/test_analytics/test_Wordcloud.py: -------------------------------------------------------------------------------- 1 | """ 2 | Testing `Wordcloud.py`. 3 | """ 4 | 5 | 6 | from urs.analytics import Wordcloud 7 | 8 | 9 | class TestSetUpWordcloudInitializeWordcloudMethod: 10 | """ 11 | Testing SetUpWordcloud class initialize_wordcloud() method. 12 | """ 13 | 14 | def test_initialize_wordcloud_method(self): 15 | pass 16 | 17 | 18 | class TestSetUpWordcloudModifyWordcloudMethod: 19 | """ 20 | Testing SetUpWordcloud class modify_wordcloud() method. 21 | """ 22 | 23 | def test_modify_wordcloud_method(self): 24 | pass 25 | 26 | 27 | class TestFinalizeWordcloudShowWordcloudMethod: 28 | """ 29 | Testing FinalizeWordcloud class show_wordcloud() method. 30 | """ 31 | 32 | def test_show_wordcloud_method(self): 33 | pass 34 | 35 | 36 | class TestFinalizeWordcloudSaveWordcloudMethod: 37 | """ 38 | Testing FinalizeWordcloud class save_wordcloud() method. 39 | """ 40 | 41 | def test_save_wordcloud_method(self): 42 | pass 43 | -------------------------------------------------------------------------------- /manual/src/utilities/tree.md: -------------------------------------------------------------------------------- 1 | # Display Directory Tree 2 | 3 | ![Display Directory Tree Demo GIF][display directory tree demo] 4 | 5 | ## All Flags 6 | 7 | These are all the flags that may be used when displaying the directory tree. 8 | 9 | ``` 10 | [-t []] 11 | ``` 12 | 13 | ## Usage 14 | 15 | If no date is provided, you can quickly view the directory structure for the current date. This is a quick alternative to [`nomad`][nomad] or the `tree` command. 16 | 17 | You can also display a different day's scrapes by providing a date after the `-t` flag. 
18 | 19 | ``` 20 | poetry run Urs.py -t [] 21 | ``` 22 | 23 | The following date formats are supported: 24 | 25 | - `YYYY-MM-DD` 26 | - `YYYY/MM/DD` 27 | 28 | An error is displayed if `URS` was not run on the entered date (if the date directory is not found within the `scrapes/` directory). 29 | 30 | [display directory tree demo]: https://github.com/JosephLai241/URS/blob/demo-gifs/utilities/tree_demo.gif?raw=true 31 | [nomad]: https://github.com/JosephLai241/nomad 32 | -------------------------------------------------------------------------------- /tests/test_utils/test_Tools.py: -------------------------------------------------------------------------------- 1 | """ 2 | Testing `Tools.py`. 3 | """ 4 | 5 | 6 | import argparse 7 | import os 8 | 9 | import praw 10 | from dotenv import load_dotenv 11 | 12 | from urs.utils import Global, Tools 13 | 14 | 15 | class Login: 16 | """ 17 | Create a Reddit object with PRAW API credentials. 18 | """ 19 | 20 | @staticmethod 21 | def create_reddit_object(): 22 | load_dotenv() 23 | 24 | return praw.Reddit( 25 | client_id=os.getenv("CLIENT_ID"), 26 | client_secret=os.getenv("CLIENT_SECRET"), 27 | user_agent=os.getenv("USER_AGENT"), 28 | username=os.getenv("USERNAME"), 29 | password=os.getenv("PASSWORD"), 30 | ) 31 | 32 | 33 | class TestRunInitMethod: 34 | """ 35 | Testing Run class __init__() method. 36 | """ 37 | 38 | def test_init_instance_variables(self): 39 | reddit = Login.create_reddit_object() 40 | 41 | try: 42 | Tools.Run(reddit) 43 | assert False 44 | except SystemExit: 45 | assert True 46 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Joseph Lai 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/FEATURE_REQUEST.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature/Enhancement Request 3 | about: Suggest a new feature or enhancement for URS. 4 | title: "Feature/Enhancement Request | ADD A SHORT SUMMARY OF THE NEW FEATURE HERE" 5 | labels: enhancement 6 | assignees: JosephLai241 7 | --- 8 | 9 | # **DO NOT DELETE THIS TEMPLATE** 10 | 11 | ## Describe the New Feature or Enhancement You Would Like 12 | 13 | Delete this line and write a clear description of what you want to happen. 
14 | 15 | ## Explain Why You Believe This Would Be a Good Feature For URS 16 | 17 | Delete this line and write your motivation for creating this request. 18 | 19 | ## Is Your Request Related To a Problem? 20 | 21 | **Put "N/A" in this block if this is not applicable.** 22 | 23 | Delete this line and link an issue by using the `#` symbol followed by the issue number. Then add some additional information as to how your request relates to the open issue. 24 | 25 | ## Describe Alternatives You Have Considered 26 | 27 | **Put "N/A" in this block if this is not applicable.** 28 | 29 | Delete this line and write a clear and concise description of any alternative solutions or features you have considered. 30 | 31 | ## Additional Context 32 | 33 | **Put "N/A" in this block if this is not applicable.** 34 | 35 | Delete this line and add any other context, codeblocks, screenshots, etc., about the request here. 36 | -------------------------------------------------------------------------------- /manual/src/credentials.md: -------------------------------------------------------------------------------- 1 | # How to Get PRAW Credentials 2 | 3 | Create your own Reddit account and then head over to [Reddit's apps page](https://old.reddit.com/prefs/apps). 4 | 5 | Click `"are you a developer? create an app... "`. 6 | 7 | ![Create an app screenshot][create an app] 8 | 9 | Name your app, choose `"script"` for the type of app, and type `"http://localhost:8080"` in the redirect URI field since this is a personal use app. You can also add a description and an about URL. 10 | 11 | ![Enter Stuff In Boxes screenshot][enter stuff in boxes] 12 | 13 | Click `"create app"`, then `"edit"` to reveal more information. 14 | 15 | ![Click Edit screenshot][click edit] 16 | 17 | You should see a string of 14 characters on the top left corner underneath `"personal use script"`. That is your API ID. Further down you will see `"secret"` and a string of 27 characters; that is your API password. **Save this information as it will be used in the program in order to access the Reddit API**. 18 | 19 | ![All Info screenshot][all info] 20 | 21 | You will also have to provide your app name and Reddit account username and password in the block of credentials found in `.env`. 22 | 23 | 24 | 25 | [create an app]: https://i.imgur.com/Bf0pKGJ.png 26 | [enter stuff in boxes]: https://i.imgur.com/g0xARWA.png 27 | [click edit]: https://i.imgur.com/1NOyMTN.png 28 | [all info]: https://i.imgur.com/VajTKJu.png 29 | -------------------------------------------------------------------------------- /manual/src/scraping-reddit/scrape-speeds-and-rate-limits.md: -------------------------------------------------------------------------------- 1 | # Scrape Speeds 2 | 3 | Your internet connection speed is the primary bottleneck that will establish the scrape duration; however, there are additional bottlenecks such as: 4 | 5 | - The number of results returned for Subreddit or Redditor scraping. 6 | - The submission's popularity (total number of comments) for submission comments scraping. 7 | 8 | # Rate Limits 9 | 10 | Yes, PRAW has rate limits. These limits are proportional to how much karma you have accumulated -- the higher the karma, the higher the rate limit. This has been implemented to mitigate spammers and bots that utilize PRAW. 11 | 12 | Rate limit information for your account is displayed in a small table underneath the successful login message each time you run any of the PRAW scrapers. 
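
PRAW also exposes these numbers directly on the `Reddit` instance if you want to inspect them outside of `URS`. This is a minimal sketch, not part of `URS` itself, and it assumes the PRAW credentials have already been filled out in `.env`:

```python
import os

import praw
from dotenv import load_dotenv

load_dotenv()

reddit = praw.Reddit(
    client_id=os.getenv("CLIENT_ID"),
    client_secret=os.getenv("CLIENT_SECRET"),
    user_agent=os.getenv("USER_AGENT"),
    username=os.getenv("REDDIT_USERNAME"),
    password=os.getenv("REDDIT_PASSWORD"),
)

# Make at least one authenticated request so PRAW has rate limit headers to parse.
reddit.user.me()

# A dictionary containing the remaining requests, used requests, and reset timestamp.
print(reddit.auth.limits)
```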
I have also added a [`--check` flag](../utilities/rate-limit-checking.md) if you want to quickly view this information. 13 | 14 | `URS` will display an error message as well as the rate limit reset date if you have used all your available requests. 15 | 16 | There are a couple ways to circumvent rate limits: 17 | 18 | - Scrape intermittently 19 | - Use an account with high karma to get your PRAW credentials 20 | - Scrape less results per run 21 | 22 | Available requests are refilled if you use the PRAW scrapers intermittently, which might be the best solution. This can be especially helpful if you have automated `URS` and are not looking at the output on each run. 23 | -------------------------------------------------------------------------------- /tests/test_analytics/test_Frequencies.py: -------------------------------------------------------------------------------- 1 | """ 2 | Testing `Frequencies.py`. 3 | """ 4 | 5 | 6 | from urs.analytics import Frequencies 7 | 8 | 9 | class TestSortGetDataMethod: 10 | """ 11 | Testing Sort class get_data() method. 12 | """ 13 | 14 | def test_get_data_method(self): 15 | pass 16 | 17 | 18 | class TestSortNameAndCreateDirMethod: 19 | """ 20 | Testing Sort class name_and_create_dir() method. 21 | """ 22 | 23 | def test_name_and_create_dir_method(self): 24 | pass 25 | 26 | 27 | class TestSortCreateCsvMethod: 28 | """ 29 | Testing Sort class create_csv() method. 30 | """ 31 | 32 | def test_create_csv_method(self): 33 | plt_dict = {"test": 1, "testing": 2} 34 | 35 | assert Frequencies.Sort().create_csv(plt_dict) == { 36 | "words": ["test", "testing"], 37 | "frequencies": [1, 2], 38 | } 39 | 40 | 41 | class TestSortCreateJsonMethod: 42 | """ 43 | Testing Sort class create_json() method. 44 | """ 45 | 46 | def test_create_json_method(self): 47 | scrape_file = ["test", "something"] 48 | plt_dict = {"test": 1, "testing": 2} 49 | 50 | assert Frequencies.Sort().create_json(plt_dict, scrape_file) == { 51 | "raw_file": "test", 52 | "data": {"test": 1, "testing": 2}, 53 | } 54 | 55 | 56 | class TestExportFrequenciesExportMethod: 57 | """ 58 | Testing ExportFrequencies class export() method. 
59 | """ 60 | 61 | def test_export_method(self): 62 | pass 63 | -------------------------------------------------------------------------------- /manual/src/contributors.md: -------------------------------------------------------------------------------- 1 | # Contributors 2 | 3 | | Date | User | Contribution | 4 | | ---------------- | -------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 5 | | March 11, 2020 | [ThereGoesMySanity][theregoesmysanity] | Created a [pull request][theregoesmysanity pull request] adding 2FA information to README | 6 | | October 6, 2020 | [LukeDSchenk][lukedschenk] | Created a [pull request][lukedschenk pull request] fixing `"[Errno 36] File name too long"` issue, making it impossible to save comment scrapes with long titles | 7 | | October 10, 2020 | [IceBerge421][iceberge421] | Created a [pull request][icegerge421 pull request] fixing a cloning error occuring on Windows machines due to illegal file name characters, `"`, found in two scrape samples | 8 | 9 | [iceberge421]: https://github.com/IceBerge421 10 | [icegerge421 pull request]: https://github.com/JosephLai241/URS/pull/20 11 | [lukedschenk]: https://github.com/LukeDSchenk 12 | [lukedschenk pull request]: https://github.com/JosephLai241/URS/pull/19 13 | [theregoesmysanity]: https://github.com/ThereGoesMySanity 14 | [theregoesmysanity pull request]: https://github.com/JosephLai241/URS/pull/9 15 | -------------------------------------------------------------------------------- /urs/praw_scrapers/live_scrapers/utils/StreamGenerator.py: -------------------------------------------------------------------------------- 1 | """ 2 | Stream Generator 3 | ================ 4 | Defining methods for the stream generator which yields new Reddit objects and 5 | converts them to JSON serializable objects when saving to file. 6 | """ 7 | 8 | 9 | from typing import Any, Dict, Generator, Union 10 | 11 | from praw.models.reddit.redditor import RedditorStream 12 | from praw.models.reddit.subreddit import SubredditStream 13 | 14 | from urs.praw_scrapers.utils.Objectify import Objectify 15 | 16 | 17 | class StreamGenerator: 18 | """ 19 | Methods for creating a generator which yields new Reddit objects while 20 | streaming. 21 | """ 22 | 23 | @staticmethod 24 | def stream_submissions( 25 | stream: Union[RedditorStream, SubredditStream] 26 | ) -> Generator[Dict[str, Any], None, None]: 27 | """ 28 | Yield new Reddit submissions. 29 | 30 | :param RedditorStream | SubredditStream stream: The Reddit stream instance. 31 | 32 | :yields: Reddit submission object. 33 | """ 34 | 35 | for submission in stream.submissions(skip_existing=True): 36 | yield Objectify().make_submission(True, submission) 37 | 38 | @staticmethod 39 | def stream_comments( 40 | stream: Union[RedditorStream, SubredditStream] 41 | ) -> Generator[Dict[str, Any], None, None]: 42 | """ 43 | Yield new Reddit comments. 44 | 45 | :param RedditorStream | SubredditStream stream: The Reddit stream instance. 46 | 47 | :yields: Reddit comment object. 
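
        A minimal consumption sketch, assuming `reddit` is an authenticated
        `praw.Reddit` instance and that the serialized comment includes a
        `body` key (the Subreddit name is only an example)::

            stream = reddit.subreddit("askreddit").stream

            for comment in StreamGenerator.stream_comments(stream):
                print(comment["body"])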
48 | """ 49 | 50 | for comment in stream.comments(skip_existing=True): 51 | yield Objectify().make_comment(comment, True) 52 | -------------------------------------------------------------------------------- /manual/src/README.md: -------------------------------------------------------------------------------- 1 | __ __ _ __ ____ 2 | /\ \/\ \/\`'__\/',__\ 3 | \ \ \_\ \ \ \//\__, `\ 4 | \ \____/\ \_\\/\____/ 5 | \/___/ \/_/ \/___/ 6 | 7 | > **U**niversal **R**eddit **S**craper - A comprehensive Reddit scraping command-line tool written in Python. 8 | 9 | ![GitHub Workflow Status (Python)](https://img.shields.io/github/actions/workflow/status/JosephLai241/URS/python.yml?label=Python&logo=python&logoColor=blue) 10 | ![GitHub Workflow Status (Rust)](https://img.shields.io/github/actions/workflow/status/JosephLai241/URS/rust.yml?label=Rust&logo=rust&logoColor=orange) 11 | [![Codecov](https://img.shields.io/codecov/c/gh/JosephLai241/URS?logo=Codecov)][codecov] 12 | [![GitHub release (latest by date)](https://img.shields.io/github/v/release/JosephLai241/URS)][releases] 13 | ![Total lines](https://img.shields.io/tokei/lines/github/JosephLai241/URS) 14 | ![License](https://img.shields.io/github/license/JosephLai241/URS) 15 | 16 | ``` 17 | [-h] 18 | [-e] 19 | [-v] 20 | 21 | [-t []] 22 | [--check] 23 | 24 | [-r <(h|n|c|t|r|s)> []] 25 | [-y] 26 | [--csv] 27 | [--rules] 28 | [-u ] 29 | [-c ] 30 | [--raw] 31 | [-b] 32 | [--csv] 33 | 34 | [-lr ] 35 | [-lu ] 36 | 37 | [--nosave] 38 | [--stream-submissions] 39 | 40 | [-f ] 41 | [--csv] 42 | [-wc []] 43 | [--nosave] 44 | ``` 45 | 46 | [codecov]: https://codecov.io/gh/JosephLai241/URS 47 | [github workflow status]: https://github.com/JosephLai241/URS/actions/workflows/pytest.yml 48 | [praw]: https://pypi.org/project/praw/ 49 | [releases]: https://github.com/JosephLai241/URS/releases 50 | -------------------------------------------------------------------------------- /urs/Urs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | 4 | """ 5 | URS 6 | === 7 | 8 | URS, an acronym for "Universal Reddit Scraper", is a comprehensive Reddit scraping 9 | command-line tool written in Python. 10 | 11 | * Scrape Reddit via PRAW (the official Python Reddit API Wrapper) 12 | + Scrape Subreddits 13 | + Scrape Redditors 14 | + Scrape submission comments 15 | * Livestream Reddit via PRAW 16 | + Livestream comments submitted within Subreddits or by Redditors 17 | + Livestream submissions submitted within Subreddits or by Redditors 18 | * Analytical tools for scraped data 19 | + Generate frequencies for words that are found in submission titles, bodies, and/or comments 20 | + Generate a wordcloud from scrape results 21 | 22 | @author: Joseph Lai 23 | @contact: urs_project@protonmail.com 24 | @github: https://github.com/JosephLai241/URS 25 | """ 26 | 27 | 28 | import os 29 | 30 | import praw 31 | from colorama import init 32 | from dotenv import load_dotenv 33 | 34 | from urs.utils.Logger import LogMain 35 | from urs.utils.Tools import Run 36 | 37 | # Automate sending reset sequences to turn off color changes at the end of 38 | # every print. 39 | init(autoreset=True) 40 | 41 | 42 | class Main: 43 | """ 44 | Run URS. 
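
    `main()` loads the PRAW credentials from `.env`, builds the `praw.Reddit`
    instance, and hands it off to `Run`. A typical invocation from the project
    root looks like the following (the `--check` flag is only an example):

        poetry run Urs.py --check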
45 | """ 46 | 47 | @staticmethod 48 | @LogMain.master_timer 49 | def main() -> None: 50 | load_dotenv() 51 | 52 | reddit = praw.Reddit( 53 | client_id=os.getenv("CLIENT_ID"), 54 | client_secret=os.getenv("CLIENT_SECRET"), 55 | user_agent=os.getenv("USER_AGENT"), 56 | username=os.getenv("REDDIT_USERNAME"), 57 | password=os.getenv("REDDIT_PASSWORD"), 58 | ) 59 | 60 | Run(reddit).run_urs() 61 | 62 | 63 | if __name__ == "__main__": 64 | Main.main() 65 | -------------------------------------------------------------------------------- /.github/workflows/python.yml: -------------------------------------------------------------------------------- 1 | name: Python code checks 2 | 3 | on: 4 | push: 5 | branches: 6 | - "master" 7 | pull_request: 8 | branches-ignore: 9 | - "demo-gifs" 10 | - "gh-pages" 11 | - "rust-demo" 12 | - "samples" 13 | 14 | jobs: 15 | formatting-checks: 16 | runs-on: ubuntu-latest 17 | 18 | steps: 19 | - name: Black formatting check 20 | uses: psf/black@stable 21 | with: 22 | options: "--check --verbose --diff --color" 23 | 24 | - name: isort formatting check 25 | uses: isort/isort-action@master 26 | with: 27 | configuration: "--check-only --verbose --diff --color --profile black" 28 | 29 | pytest: 30 | runs-on: ${{ matrix.os }} 31 | 32 | strategy: 33 | matrix: 34 | os: [ubuntu-latest, macOS-latest] 35 | 36 | if: github.ref == 'refs/heads/master' 37 | steps: 38 | - uses: actions/checkout@v3 39 | 40 | - name: Set up Python 3.11 41 | uses: actions/setup-python@v4 42 | with: 43 | python-version: "3.11" 44 | 45 | - name: Install Poetry 46 | uses: abatilo/actions-poetry@v2 47 | 48 | - name: Install dependencies 49 | run: poetry install 50 | 51 | - name: Build taisun 52 | uses: PyO3/maturin-action@v1 53 | with: 54 | command: develop 55 | args: --release 56 | 57 | - name: Run Pytest 58 | env: 59 | CLIENT_ID: ${{ secrets.CLIENT_ID }} 60 | CLIENT_SECRET: ${{ secrets.CLIENT_SECRET }} 61 | USER_AGENT: ${{ secrets.USER_AGENT }} 62 | REDDIT_USERNAME: ${{ secrets.REDDIT_USERNAME }} 63 | REDDIT_PASSWORD: ${{ secrets.REDDIT_PASSWORD }} 64 | run: | 65 | poetry run pytest --cov=./ 66 | 67 | - name: Send coverage data to Codecov 68 | uses: codecov/codecov-action@v1 69 | if: matrix.os == 'ubuntu-latest' 70 | -------------------------------------------------------------------------------- /manual/src/SUMMARY.md: -------------------------------------------------------------------------------- 1 | # Summary 2 | 3 | [URS](./README.md) 4 | 5 | - [Introduction](./introduction.md) 6 | - [Installation](./installation.md) 7 | - [Exporting](./exporting.md) 8 | - [How to Get Reddit API Credentials for PRAW](./credentials.md) 9 | 10 | # Scraping Reddit 11 | 12 | - [Scrape Speeds and Rate Limits](./scraping-reddit/scrape-speeds-and-rate-limits.md) 13 | - [A Table of All Subreddit, Redditor, and Submission Comments Attributes](./scraping-reddit/all-attributes-table.md) 14 | - [Scraping Subreddits](./scraping-reddit/subreddit.md) 15 | - [Scraping Redditors](./scraping-reddit/redditor.md) 16 | - [Scraping Submission Comments](./scraping-reddit/submission-comments.md) 17 | 18 | # Livestreaming Reddit 19 | 20 | - [General Information](./livestreaming-reddit/general-information.md) 21 | - [Livestreaming Subreddits and Redditors](./livestreaming-reddit/livestreaming-subreddits-and-redditors.md) 22 | 23 | # Analytical Tools 24 | 25 | - [General Information](./analytical-tools/general-information.md) 26 | - [Generating Word Frequencies and Wordclouds](./analytical-tools/frequencies-and-wordclouds.md) 27 | 28 | # 
Utilities 29 | 30 | - [Built-in Tree](./utilities/tree.md) 31 | - [PRAW Rate Limit Check](./utilities/rate-limit-checking.md) 32 | 33 | # Additional Information 34 | 35 | - [2-Factor Authentication](./additional-information/2fa-information.md) 36 | - [Error Messages](./additional-information/error-messages.md) 37 | 38 | # Implementation Details 39 | 40 | - [The Forest](./implementation-details/the-forest.md) 41 | - [Speeding Up Python with Rust](./implementation-details/speeding-up-python-with-rust.md) 42 | 43 | # Contributing 44 | 45 | - [Before Making Pull or Feature Requests](./contributing/before-making-pull-or-feature-requests.md) 46 | - [Building on Top of `URS`](./contributing/building-on-top-of-urs.md) 47 | - [Making Pull or Feature Requests](./contributing/making-pull-or-feature-requests.md) 48 | 49 | --- 50 | 51 | [Contributors](./contributors.md) 52 | [Derivative Projects](./derivative-projects.md) 53 | -------------------------------------------------------------------------------- /tests/test_praw_scrapers/test_live_scrapers/test_utils/test_StreamGenerator.py: -------------------------------------------------------------------------------- 1 | """ 2 | Testing `StreamGenerator.py`. 3 | """ 4 | 5 | 6 | import os 7 | import types 8 | 9 | import praw 10 | from dotenv import load_dotenv 11 | 12 | from urs.praw_scrapers.live_scrapers.utils import StreamGenerator 13 | 14 | 15 | class Login: 16 | """ 17 | Create a Reddit object with PRAW API credentials. 18 | """ 19 | 20 | @staticmethod 21 | def create_reddit_object(): 22 | load_dotenv() 23 | 24 | return praw.Reddit( 25 | client_id=os.getenv("CLIENT_ID"), 26 | client_secret=os.getenv("CLIENT_SECRET"), 27 | user_agent=os.getenv("USER_AGENT"), 28 | username=os.getenv("REDDIT_USERNAME"), 29 | password=os.getenv("REDDIT_PASSWORD"), 30 | ) 31 | 32 | 33 | class TestStreamGeneratorStreamSubmissionsMethod: 34 | """ 35 | Testing StreamGenerator class stream_submissions() method. 36 | """ 37 | 38 | def test_stream_submissions_method(self): 39 | reddit = Login.create_reddit_object() 40 | subreddit = reddit.subreddit("askreddit") 41 | 42 | generator = StreamGenerator.StreamGenerator.stream_submissions(subreddit.stream) 43 | 44 | assert isinstance(generator, types.GeneratorType) 45 | 46 | for obj in generator: 47 | if isinstance(obj, dict): 48 | assert True 49 | break 50 | 51 | 52 | class TestStreamGeneratorStreamCommentsMethod: 53 | """ 54 | Testing StreamGenerator class stream_comments() method. 55 | """ 56 | 57 | def test_stream_comments_method(self): 58 | reddit = Login.create_reddit_object() 59 | subreddit = reddit.subreddit("askreddit") 60 | 61 | generator = StreamGenerator.StreamGenerator.stream_comments(subreddit.stream) 62 | 63 | assert isinstance(generator, types.GeneratorType) 64 | 65 | for obj in generator: 66 | if isinstance(obj, dict): 67 | assert True 68 | break 69 | -------------------------------------------------------------------------------- /manual/src/livestreaming-reddit/livestreaming-subreddits-and-redditors.md: -------------------------------------------------------------------------------- 1 | # Livestreaming Subreddits 2 | 3 | ![Livestream Subreddit Demo GIF][livestream subreddit demo] 4 | 5 | \*_This GIF has been cut for demonstration purposes._ 6 | 7 | ## All Flags 8 | 9 | These are all the flags that may be used when livestreaming Subreddits. 
10 | 11 | ``` 12 | [-lr ] 13 | [--nosave] 14 | [--stream-submissions] 15 | ``` 16 | 17 | ## Usage 18 | 19 | ``` 20 | poetry run Urs.py -lr 21 | ``` 22 | 23 | **Default stream objects:** Comments. To stream submissions instead, include the `--stream-submissions` flag. 24 | 25 | You can livestream comments or submissions that are created within a Subreddit. 26 | 27 | Reddit object information will be displayed in a [PrettyTable][prettytable] as they are submitted. 28 | 29 | > **_NOTE:_** PRAW may not be able to catch all new submissions or comments within a high-volume Subreddit, as mentioned in [these disclaimers located in the "Note" boxes][subreddit stream disclaimer]. 30 | 31 | # Livestreaming Redditors 32 | 33 | _Livestream demo was not recorded for Redditors because its functionality is identical to the Subreddit livestream._ 34 | 35 | ## All Flags 36 | 37 | These are all the flags that may be used when livestreaming Redditors. 38 | 39 | ``` 40 | [-lu ] 41 | [--nosave] 42 | [--stream-submissions] 43 | ``` 44 | 45 | ## Usage 46 | 47 | ``` 48 | poetry run Urs.py -lu 49 | ``` 50 | 51 | **Default stream objects:** Comments. To stream submissions instead, include the `--stream-submissions` flag. 52 | 53 | You can livestream comments or submissions that are created by a Redditor. 54 | 55 | Reddit object information will be displayed in a PrettyTable as they are submitted. 56 | 57 | # Do Not Save Livestream to File 58 | 59 | Include the `--nosave` flag if you do not want to save the livestream to file. 60 | 61 | [livestream subreddit demo]: https://github.com/JosephLai241/URS/blob/demo-gifs/praw_scrapers/live_scrapers/livestream_subreddit_demo.gif?raw=true 62 | [prettytable]: https://pypi.org/project/prettytable/ 63 | [subreddit stream disclaimer]: https://praw.readthedocs.io/en/latest/code_overview/other/subredditstream.html#praw.models.reddit.subreddit.SubredditStream 64 | -------------------------------------------------------------------------------- /tests/test_utils/test_Global.py: -------------------------------------------------------------------------------- 1 | """ 2 | Testing `Global.py`. 3 | """ 4 | 5 | 6 | import datetime as dt 7 | 8 | from urs.utils import Global 9 | 10 | 11 | class TestGlobalVariables: 12 | """ 13 | Testing all global variables in Global.py. 14 | """ 15 | 16 | def test_date_variable(self): 17 | assert Global.date == dt.datetime.now().strftime("%Y-%m-%d") 18 | 19 | def test_subreddit_categories_list(self): 20 | assert Global.categories == [ 21 | "Hot", 22 | "New", 23 | "Controversial", 24 | "Top", 25 | "Rising", 26 | "Search", 27 | ] 28 | 29 | def test_subreddit_short_cat_list(self): 30 | categories = ["Hot", "New", "Controversial", "Top", "Rising", "Search"] 31 | assert Global.short_cat == [cat[0] for cat in categories] 32 | 33 | 34 | class TestConvertTimeFunction: 35 | """ 36 | Testing convert_time() function. 37 | """ 38 | 39 | def test_convert_time(self): 40 | unix_time = 1592291124 41 | converted_time = "2020-06-16 07:05:24" 42 | 43 | assert Global.convert_time(unix_time) == converted_time 44 | 45 | 46 | class TestMakeListDictFunction: 47 | """ 48 | Testing make_list_dict() function. 49 | """ 50 | 51 | def test_make_list_dict(self): 52 | item = [1, 2, 3, 4] 53 | correct_list_dict = {1: [], 2: [], 3: [], 4: []} 54 | 55 | assert Global.make_list_dict(item) == correct_list_dict 56 | 57 | 58 | class TestMakeNoneDictFunction: 59 | """ 60 | Testing make_none_dict() function. 
61 | """ 62 | 63 | def test_make_none_dict(self): 64 | item = [1, 2, 3, 4] 65 | correct_none_dict = {1: None, 2: None, 3: None, 4: None} 66 | 67 | assert Global.make_none_dict(item) == correct_none_dict 68 | 69 | 70 | class TestStatus: 71 | """ 72 | Testing Status class. 73 | """ 74 | 75 | def test_status_init_method(self): 76 | test_status = Global.Status( 77 | "test after message", "test before message", "test color" 78 | ) 79 | 80 | assert test_status._after_message == "test after message" 81 | assert test_status._before_message == "test before message" 82 | assert test_status._color == "test color" 83 | -------------------------------------------------------------------------------- /manual/src/analytical-tools/frequencies-and-wordclouds.md: -------------------------------------------------------------------------------- 1 | # Generating Word Frequencies 2 | 3 | ![Frequencies Demo GIF][frequencies demo] 4 | 5 | ## All Flags 6 | 7 | These are all the flags that may be used when generating word frequencies. 8 | 9 | ``` 10 | [-f ] 11 | [--csv] 12 | ``` 13 | 14 | ## Usage 15 | 16 | ``` 17 | poetry run Urs.py -f 18 | ``` 19 | 20 | **Supports exporting to CSV.** To export to CSV, include the `--csv` flag. 21 | 22 | You can generate a dictionary of word frequencies created from the words within the target fields. These frequencies are sorted from highest to lowest. 23 | 24 | Frequencies export to JSON by default, but this tool also works well in CSV format. 25 | 26 | Exported files will be saved to the `analytics/frequencies` directory. 27 | 28 | # Generating Wordclouds 29 | 30 | ![Wordcloud Demo GIF][wordcloud demo] 31 | 32 | ## All Flags 33 | 34 | ``` 35 | [-wc []] 36 | [--nosave] 37 | ``` 38 | 39 | ## Usage 40 | 41 | ``` 42 | poetry run Urs.py -wc 43 | ``` 44 | 45 | ## Supported Export Formats 46 | 47 | The following are the supported export formats for wordclouds: 48 | 49 | - `eps` 50 | - `jpeg` 51 | - `jpg` 52 | - `pdf` 53 | - `png` (default) 54 | - `ps` 55 | - `rgba` 56 | - `tif` 57 | - `tiff` 58 | 59 | Taking word frequencies to the next level, you can generate wordclouds based on word frequencies. This tool is independent of the frequencies generator -- you do not need to run the frequencies generator before creating a wordcloud. 60 | 61 | PNG is the default format, but you can also export to any of the options listed above by including the format as the second flag argument. 62 | 63 | ``` 64 | poetry run Urs.py -wc [] 65 | ``` 66 | 67 | Exported files will be saved to the `analytics/wordclouds` directory. 68 | 69 | ## Display Wordcloud Instead of Saving 70 | 71 | Wordclouds are saved to file by default. If you do not want to keep a file, include the `--nosave` flag to only display the wordcloud. 72 | 73 | [frequencies demo]: https://github.com/JosephLai241/URS/blob/demo-gifs/analytical_tools/frequencies_generator_demo.gif?raw=true 74 | [wordcloud demo]: https://github.com/JosephLai241/URS/blob/demo-gifs/analytical_tools/wordcloud_generator_demo.gif?raw=true 75 | -------------------------------------------------------------------------------- /manual/src/livestreaming-reddit/general-information.md: -------------------------------------------------------------------------------- 1 | # Livestreaming Reddit via PRAW 2 | 3 | These tools may be used to livestream comments or submissions submitted within Subreddits or by Redditors. 4 | 5 | **Comments are streamed by default**. To stream submissions instead, include the `--stream-submissions` flag. 
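
For example, a hypothetical invocation that livestreams submissions from r/AskReddit (the Subreddit name is arbitrary) would look like this:

```
poetry run Urs.py -lr askreddit --stream-submissions
```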
6 | 7 | **New comments or submissions will continue to display within your terminal until you abort the stream using `Ctrl + C`**. 8 | 9 | ## File Naming Conventions 10 | 11 | The filenames will follow this format: 12 | 13 | ``` 14 | [SUBREDDIT_OR_REDDITOR]-[comments_OR_submissions]-[START_TIME_IN_HOURS_MINUTES_SECONDS]-[DURATION_IN_HOURS_MINUTES_SECONDS].json 15 | ``` 16 | 17 | This file is saved in the main `livestream` directory into the `subreddits` or `redditors` directory depending on which stream was run. 18 | 19 | Reddit objects will be written to this JSON file in real time. After aborting the stream, the filename will be updated with the start time and duration. 20 | 21 | ## Displayed vs. Saved Attributes 22 | 23 | Displayed comment and submission attributes have been stripped down to essential fields to declutter the output. Here is a table of what is shown during the stream: 24 | 25 | | Comment Attributes | Submission Attributes | 26 | | ---------------------------- | --------------------- | 27 | | `author` | `author` | 28 | | `body` | `created_utc` | 29 | | `created_utc` | `is_self` | 30 | | `is_submitter` | `link_flair_text` | 31 | | `submission_author` | `nsfw` | 32 | | `submission_created_utc` | `selftext` | 33 | | `submission_link_flair_text` | `spoiler` | 34 | | `submission_nsfw` | `stickied` | 35 | | `submission_num_comments` | `title` | 36 | | `submission_score` | `url` | 37 | | `submission_title` | | 38 | | `submission_upvote_ratio` | | 39 | | `submission_url` | | 40 | 41 | Comment and submission attributes that are written to file will include the full list of attributes found in the [Table of All Subreddit, Redditor, and Submission Comments Attributes](../scraping-reddit/all-attributes-table.md). 42 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/BUG_REPORT.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug Report 3 | about: Report a bug that you have encountered while using URS. 4 | title: "Bug Report | ADD A SHORT SUMMARY OF THE BUG HERE" 5 | labels: bug 6 | assignees: JosephLai241 7 | --- 8 | 9 | # **DO NOT DELETE THIS TEMPLATE** 10 | 11 | ## Describe The Bug 12 | 13 | Delete this line and write a clear and concise description for the bug. 14 | 15 | ## Expected Behavior 16 | 17 | Delete this line and write a description of what you expected to happen. 18 | 19 | ## Actual Behavior 20 | 21 | Delete this line and write a description of what actually happened. 22 | 23 | ## Steps To Reproduce 24 | 25 | Delete this line and describe how to reproduce this behavior. Create an outline if there is additional relevant information you would like to share. An example outline is shown below: 26 | 27 | * I ran `a command here`. 28 | * Then I ran `a second command here`. 29 | + Providing additional information about this bullet point here. 30 | 31 | If the command is long, wrap it in a code block like so: 32 | 33 | * Ran the command: 34 | 35 | ``` 36 | Paste long command here. 37 | ``` 38 | 39 | ## Traceback, `urs.log`, or Screenshots 40 | 41 | **Put "N/A" in this block if this is not applicable.** 42 | 43 | Add a codeblock of the *entire* traceback here to help explain your problem. 44 | 45 | ``` 46 | Paste the traceback here. Make sure it is formatted correctly. 47 | ``` 48 | 49 | **The traceback is mandatory**, however you can also add the following information below. 
50 | 51 | A log of command history, `urs.log`, is written to the date directory every time you run URS. You can also add the relevant log block within a codeblock to help explain your problem. An example log is shown: 52 | 53 | ``` 54 | [2021-06-22 19:22:32,296] [INFO]: INITIALIZING URS. 55 | [2021-06-22 19:22:32,296] [INFO]: 56 | [2021-06-22 19:22:32,301] [CRITICAL]: RECEIVED INVALID SCRAPE FILE FOR FREQUENCIES. 57 | [2021-06-22 19:22:32,301] [CRITICAL]: ABORTING URS. 58 | ``` 59 | 60 | You can also add a screenshot of your issue like so: 61 | 62 | ![Error Screenshot](PASTE_URL_TO_IMAGE_HERE) 63 | 64 | ## Machine Specs 65 | 66 | * Operating System: Write your operating system here, e.g. Arch Linux x86_64 67 | * Python Version: Write your Python version here, e.g. Python 3.8.2 68 | 69 | ## Additional Context 70 | 71 | **Put "N/A" in this block if this is not applicable.** 72 | 73 | Delete this line and add any other context about the problem here. 74 | -------------------------------------------------------------------------------- /manual/src/analytical-tools/general-information.md: -------------------------------------------------------------------------------- 1 | # Analytical Tools 2 | 3 | This suite of tools can be used _after_ scraping data from Reddit. Both of these tools analyze the frequencies of words found in submission titles and bodies, or comments within JSON scrape data. 4 | 5 | There are a few ways you can quickly get the correct filepath to the scrape file: 6 | 7 | - Drag and drop the file into the terminal. 8 | - Partially type the path and rely on tab completion support to finish the full path for you. 9 | 10 | Running either tool will create the `analytics` directory within the date directory. **This directory is located in the same directory in which the scrape data resides**. For example, if you run the frequencies generator on February 16th for scrape data that was captured on February 14th, `analytics` will be created in the February 14th directory. Command history will still be written in the February 16th `urs.log`. 11 | 12 | The sub-directories `frequencies` or `wordclouds` are created in `analytics` depending on which tool is run. These directories mirror the directories in which the original scrape files reside. For example, if you run the frequencies generator on a Subreddit scrape, the directory structure will look like this: 13 | 14 | ``` 15 | analytics/ 16 | └── frequencies 17 | └── subreddits 18 | └── SUBREDDIT_SCRAPE.json 19 | ``` 20 | 21 | A shortened export path is displayed once `URS` has completed exporting the data, informing you where the file is saved within the `scrapes` directory. You can open `urs.log` to view the full path. 22 | 23 | # Target Fields 24 | 25 | The data varies depending on the scraper, so these tools target different fields for each type of scrape data: 26 | 27 | | Scrape Data | Targets | 28 | | ------------------- | --------------------------------- | 29 | | Subreddit | `selftext`, `title` | 30 | | Redditor | `selftext`, `title`, `body` | 31 | | Submission Comments | `body` | 32 | | Livestream | `selftext` and `title`, or `body` | 33 | 34 | For Subreddit scrapes, data is pulled from the `selftext` and `title` fields for each submission (submission title and body). 35 | 36 | For Redditor scrapes, data is pulled from all three fields because both submission and comment data is returned. The `selftext` and `title` fields are targeted for submissions, and the `body` field is targeted for comments.
37 | 38 | For submission comments scrapes, data is only pulled from the `body` field of each comment. 39 | 40 | For livestream scrapes, comments or submissions may be included depending on user settings. The `selftext` and `title` fields are targeted for submissions, and the `body` field is targeted for comments. 41 | 42 | # File Names 43 | 44 | File names are identical to the original scrape data so that it is easier to distinguish which analytical file corresponds to which scrape. 45 | -------------------------------------------------------------------------------- /manual/src/scraping-reddit/all-attributes-table.md: -------------------------------------------------------------------------------- 1 | # Subreddit, Redditor, and Submission Comments Attributes 2 | 3 | These attributes are included in each scrape. 4 | 5 | | Subreddits (submissions) | Redditors | Submission Comments | 6 | | ------------------------ | -------------------------------- | ------------------- | 7 | | `author` | `comment_karma` | `author` | 8 | | `created_utc` | `created_utc` | `body` | 9 | | `distinguished` | `fullname` | `body_html` | 10 | | `edited` | `has_verified_email` | `created_utc` | 11 | | `id` | `icon_img` | `distinguished` | 12 | | `is_original_content` | `id` | `edited` | 13 | | `is_self` | `is_employee` | `id` | 14 | | `link_flair_text` | `is_friend` | `is_submitter` | 15 | | `locked` | `is_mod` | `link_id` | 16 | | `name` | `is_gold` | `parent_id` | 17 | | `num_comments` | `link_karma` | `score` | 18 | | `nsfw` | `name` | `stickied` | 19 | | `permalink` | `subreddit` | | 20 | | `score` | \*`trophies` | | 21 | | `selftext` | \*`comments` | | 22 | | `spoiler` | \*`controversial` | | 23 | | `stickied` | \*`downvoted` (may be forbidden) | | 24 | | `title` | \*`gilded` | | 25 | | `upvote_ratio` | \*`gildings` (may be forbidden) | | 26 | | `url` | \*`hidden` (may be forbidden) | | 27 | | | \*`hot` | | 28 | | | \*`moderated` | | 29 | | | \*`multireddits` | | 30 | | | \*`new` | | 31 | | | \*`saved` (may be forbidden) | | 32 | | | \*`submissions` | | 33 | | | \*`top` | | 34 | | | \*`upvoted` (may be forbidden) | | 35 | 36 | \*_Includes additional attributes; see the [Scraping Redditors](./redditor.md) section for more information._ 37 | -------------------------------------------------------------------------------- /urs/praw_scrapers/live_scrapers/utils/DisplayStream.py: -------------------------------------------------------------------------------- 1 | """ 2 | Display stream 3 | ============== 4 | Defining methods to format data that will appear in the terminal. 5 | """ 6 | 7 | 8 | from typing import Any, Dict, List 9 | 10 | from prettytable import PrettyTable 11 | 12 | 13 | class DisplayStream: 14 | """ 15 | Methods to format and display Reddit stream objects. 16 | """ 17 | 18 | @staticmethod 19 | def _populate_table( 20 | include_fields: List[str], 21 | obj: Dict[str, Any], 22 | prefix: str, 23 | pretty_stream: PrettyTable, 24 | ) -> None: 25 | """ 26 | Populate the PrettyTable rows with Reddit object metadata. 27 | 28 | :param list[str] include_fields: A `list[str]` containing dictionary keys 29 | that will be added to the `PrettyTable` row. 30 | :param dict[str, Any] obj: A `dict[str, Any]` containing Reddit comment 31 | submission data. 32 | :param str prefix: The prefix to prepend to an attribute. 33 | :param PrettyTable pretty_stream: A `PrettyTable` instance. 
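        Example with purely illustrative values (not taken from a real stream):

            pretty_stream = PrettyTable()
            pretty_stream.field_names = ["Comment Attribute", "Data"]
            DisplayStream._populate_table(
                ["author"], {"author": "spez", "score": 1}, "", pretty_stream
            )
            # Adds the row ["author", "spez"]; "score" is skipped because it is
            # not listed in include_fields.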
34 | """ 35 | 36 | for attribute, data in obj.items(): 37 | if attribute in include_fields: 38 | pretty_stream.add_row([prefix + attribute, data]) 39 | 40 | @staticmethod 41 | def display(obj: Dict[str, Any]) -> None: 42 | """ 43 | Format and print string containing stream information. 44 | 45 | :param dict[str, Any] obj: A `dict[str, Any]` containing Reddit comment 46 | submission data. 47 | """ 48 | 49 | pretty_stream = PrettyTable() 50 | pretty_stream.field_names = [f"{obj['type'].capitalize()} Attribute", "Data"] 51 | 52 | if obj["type"] == "submission": 53 | include_fields = [ 54 | "author", 55 | "created_utc", 56 | "link_flair_text", 57 | "nsfw", 58 | "selftext", 59 | "spoiler", 60 | "title", 61 | "url", 62 | ] 63 | elif obj["type"] == "comment": 64 | include_fields = [ 65 | "author", 66 | "body", 67 | "created_utc", 68 | "is_submitter", 69 | ] 70 | 71 | submission_fields = [ 72 | "author", 73 | "created_utc", 74 | "link_flair_text", 75 | "nsfw", 76 | "num_comments", 77 | "score", 78 | "title", 79 | "upvote_ratio", 80 | "url", 81 | ] 82 | 83 | DisplayStream._populate_table( 84 | submission_fields, obj["submission"], "submission_", pretty_stream 85 | ) 86 | 87 | DisplayStream._populate_table(include_fields, obj, "", pretty_stream) 88 | 89 | pretty_stream.sortby = f"{obj['type'].capitalize()} Attribute" 90 | pretty_stream.align = "l" 91 | pretty_stream.max_width = 120 92 | 93 | print(pretty_stream) 94 | -------------------------------------------------------------------------------- /manual/src/additional-information/error-messages.md: -------------------------------------------------------------------------------- 1 | # Error Messages 2 | 3 | This document will briefly go over all the potential error messages you might run into while using URS. 4 | 5 | # Table of Contents 6 | 7 | - [Global Errors](#global-errors) 8 | - [Invalid Arguments](#invalid-arguments) 9 | - [Export Error](#export-error) 10 | - [PRAW Errors](#praw-errors) 11 | - [Invalid API Credentials or No Internet Connection](#invalid-api-credentials-or-no-internet-connection) 12 | - [No Reddit Objects Left to Scrape](#no-reddit-objects-left-to-scrape) 13 | - [Rate Limit Reached](#rate-limit-reached) 14 | - [Analytical Tool Errors](#analytical-tool-errors) 15 | - [Invalid File](#invalid-file) 16 | 17 | # Global Errors 18 | 19 | ## Invalid Arguments 20 | 21 | __ 22 | /'__`\ 23 | /\ __/ 24 | \ \____\ 25 | \/____/... [ERROR MESSAGE] 26 | 27 | Please recheck args or refer to help for usage examples. 28 | 29 | This message is displayed if you have entered invalid arguments. The specific error will follow `...`. 30 | 31 | You can use the `-h` flag to see the help message or the `-e` flag to display example usage. 32 | 33 | ## Export Error 34 | 35 | __ 36 | /\ \ 37 | \ \ \ 38 | \ \ \ 39 | \ \_\ 40 | \/\_\ 41 | \/_/... An error has occurred while exporting scraped data. 42 | 43 | [ERROR MESSAGE] 44 | 45 | This message is displayed if an error occured while exporting the data. This applies to the scraper tools or word frequencies tool. The specific error will be printed under the art. 46 | 47 | # PRAW Errors 48 | 49 | ## Invalid API Credentials or No Internet Connection 50 | 51 | _____ 52 | /\ '__`\ 53 | \ \ \L\ \ 54 | \ \ ,__/... Please recheck API credentials or your internet connection. 55 | \ \ \/ 56 | \ \_\ 57 | \/_/ 58 | 59 | Prawcore exception: [EXCEPTION] 60 | 61 | This message is displayed if you enter invalid API credentials or if you are not connected to the internet. The exception is printed under the art. 
62 | 63 | Recheck the environment variables in `.env` to make sure your API credentials are correct. 64 | 65 | ## No Reddit Objects Left to Scrape 66 | 67 | ___ 68 | /' _ `\ 69 | /\ \/\ \ 70 | \ \_\ \_\ 71 | \/_/\/_/... No [OBJECTS] to scrape! Exiting. 72 | 73 | This message is displayed if the Reddit objects you have passed in have failed validation (does not exist), are skipped, and there are no longer any objects left for URS to process for that specific scraper. 74 | 75 | ## Rate Limit Reached 76 | 77 | __ 78 | /\ \ 79 | \ \ \ 80 | \ \ \ __ 81 | \ \ \L\ \ 82 | \ \____/ 83 | \/___/... You have reached your rate limit. 84 | 85 | Please try again when your rate limit is reset: [DATE] 86 | 87 | PRAW has rate limits. This message is displayed if you have reached the rate limit set for your account. The reset date will vary depending on when you ran URS. The date I received during testing is usually 24 hours later. 88 | 89 | # Analytical Tool Errors 90 | 91 | ## Invalid File 92 | 93 | __ 94 | /\_\ 95 | \/\ \ 96 | \ \ \ 97 | \ \_\ 98 | \/_/... [ERROR MESSAGE] 99 | 100 | This message is displayed when you have passed in an invalid file to generate word frequencies or a wordcloud for. The specific error will follow `...`. 101 | -------------------------------------------------------------------------------- /tests/test_utils/test_Utilities.py: -------------------------------------------------------------------------------- 1 | """ 2 | Testing `Utilities.py`. 3 | """ 4 | 5 | 6 | import os 7 | 8 | from rich.tree import Tree 9 | 10 | from urs.utils.Utilities import DateTree 11 | 12 | 13 | class TestDateTreeCheckDateFormatMethod: 14 | """ 15 | Testing DateTree class _check_date_format() method. 16 | """ 17 | 18 | def test_check_date_format_dash_format(self): 19 | test_date = "06-28-2021" 20 | test_search_date = DateTree._check_date_format(test_date) 21 | 22 | assert test_search_date == test_date 23 | 24 | def test_check_date_format_slash_format(self): 25 | test_date = "06/28/2021" 26 | test_search_date = DateTree._check_date_format(test_date) 27 | 28 | assert test_search_date == "06-28-2021" 29 | 30 | def test_check_date_wrong_format(self): 31 | test_date = "06.28.2021" 32 | 33 | try: 34 | _ = DateTree._check_date_format(test_date) 35 | assert False 36 | except TypeError: 37 | assert True 38 | 39 | def test_check_date_short_date_wrong_format(self): 40 | test_date = "06-28-21" 41 | 42 | try: 43 | _ = DateTree._check_date_format(test_date) 44 | assert False 45 | except TypeError: 46 | assert True 47 | 48 | 49 | class TestDateTreeFindDateDirectoryMethod: 50 | """ 51 | Testing DateTree class _find_date_directory() method. 52 | """ 53 | 54 | def test_find_date_directory_directory_exists(self): 55 | os.mkdir("../scrapes/06-28-2021") 56 | dir_exists = DateTree._find_date_directory("06-28-2021") 57 | 58 | assert dir_exists == True 59 | 60 | def test_find_date_directory_directory_does_not_exist(self): 61 | os.rmdir("../scrapes/06-28-2021") 62 | dir_exists = DateTree._find_date_directory("06-28-2021") 63 | 64 | assert dir_exists == False 65 | 66 | 67 | class TestDateTreeCreateDirectoryTreeMethod: 68 | """ 69 | Testing DateTree class _create_directory_tree() method. 
70 | """ 71 | 72 | def test_create_directory_tree(self): 73 | os.makedirs("../scrapes/06-28-2021/testing/nested/directories/tree") 74 | 75 | test_tree = Tree("test") 76 | 77 | try: 78 | DateTree._create_directory_tree("../scrapes/06-28-2021", test_tree) 79 | assert True 80 | except Exception as e: 81 | print( 82 | f"An exception was thrown when testing DateTree._create_directory_tree(): {e}" 83 | ) 84 | assert False 85 | 86 | 87 | class TestDateTreeDisplayTreeMethod: 88 | """ 89 | Testing DateTree class display_tree() method. 90 | """ 91 | 92 | def test_display_tree_method_valid_search_date(self): 93 | try: 94 | DateTree.display_tree("06-28-2021") 95 | assert True 96 | except Exception as e: 97 | print(f"An exception was thrown when testing DateTree.display_tree(): {e}") 98 | assert False 99 | 100 | def test_display_tree_method_search_date_not_found(self): 101 | try: 102 | DateTree.display_tree("00-00-0000") 103 | assert False 104 | except SystemExit: 105 | assert True 106 | 107 | def test_display_tree_method_invalid_search_date(self): 108 | try: 109 | DateTree.display_tree("00.00.0000") 110 | assert False 111 | except SystemExit: 112 | assert True 113 | -------------------------------------------------------------------------------- /manual/src/scraping-reddit/submission-comments.md: -------------------------------------------------------------------------------- 1 | # Table of Contents 2 | 3 | - [Submission Comments](#submission-comments) 4 | - [All Flags](#all-flags) 5 | - [Usage](#usage) 6 | - [File Naming Conventions](#file-naming-conventions) 7 | - [Number of Comments Returned](#number-of-comments-returned) 8 | - [Structured Comments](#structured-comments) 9 | - [Raw Comments](#raw-comments) 10 | 11 | # Submission Comments 12 | 13 | ![Submission Comments Demo GIF][submission comments demo] 14 | 15 | \*_This GIF has been cut for demonstration purposes._ 16 | 17 | ## All Flags 18 | 19 | These are all the flags that may be used when scraping submission comments. 20 | 21 | ``` 22 | [-c ] 23 | [--raw] 24 | ``` 25 | 26 | ## Usage 27 | 28 | ``` 29 | poetry run Urs.py -c 30 | ``` 31 | 32 | Submission metadata will be included in the `submission_metadata` field and includes the following attributes: 33 | 34 | - `author` 35 | - `created_utc` 36 | - `distinguished` 37 | - `edited` 38 | - `is_original_content` 39 | - `is_self` 40 | - `link_flair_text` 41 | - `locked` 42 | - `nsfw` 43 | - `num_comments` 44 | - `permalink` 45 | - `score` 46 | - `selftext` 47 | - `spoiler` 48 | - `stickied` 49 | - `subreddit` 50 | - `title` 51 | - `upvote_ratio` 52 | 53 | If the submission contains a gallery, the attributes `gallery_data` and `media_metadata` will be included. 54 | 55 | Comments are written to the `comments` field. They are sorted by "Best", which is the default sorting option when you visit a submission. 56 | 57 | PRAW returns submission comments in level order, which means scrape speeds are proportional to the submission's popularity. 58 | 59 | ## File Naming Conventions 60 | 61 | The file names will generally follow this format: 62 | 63 | ``` 64 | [POST_TITLE]-[N_RESULTS]-result(s).json 65 | ``` 66 | 67 | Scrape data is exported to the `comments` directory. 68 | 69 | ## Number of Comments Returned 70 | 71 | You can scrape all comments from a submission by passing in `0` for ``. Subsequently, `[N_RESULTS]-result(s)` in the file name will be replaced with `all`. 72 | 73 | Otherwise, specify the number of results you want returned. 
If you passed in a specific number of results, the structured export will return up to `` top level comments and include all of its replies. 74 | 75 | ## Structured Comments 76 | 77 | **This is the default export style.** Structured scrapes resemble comment threads on Reddit. This style takes just a little longer to export compared to the raw format because `URS` uses [depth-first search][depth-first search] to create the comment `Forest` after retrieving all comments from a submission. 78 | 79 | If you want to learn more about how it works, refer to [The Forest](../implementation-details/the-forest.md), where I describe how I implemented the `Forest`, and [Speeding up Python With Rust](../implementation-details/speeding-up-python-with-rust.md) to learn about how I drastically improved the performance of the `Forest` by rewriting it in Rust. 80 | 81 | ## Raw Comments 82 | 83 | Raw scrapes do not resemble comment threads, but returns all comments on a submission in level order: all top-level comments are listed first, followed by all second-level comments, then third, etc. 84 | 85 | You can export to raw format by including the `--raw` flag. `-raw` will also be appended to the end of the file name. 86 | 87 | [depth-first search]: https://www.interviewcake.com/concept/java/dfs 88 | [submission comments demo]: https://github.com/JosephLai241/URS/blob/demo-gifs/praw_scrapers/static_scrapers/submission_comments_demo.gif?raw=true 89 | -------------------------------------------------------------------------------- /.github/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 
39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at urs_project@protonmail.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /manual/src/exporting.md: -------------------------------------------------------------------------------- 1 | # Exporting 2 | 3 | ## Table of Contents 4 | 5 | - [Export File Format](#export-file-format) 6 | - [Exporting to CSV](#exporting-to-csv) 7 | - [Export Directory Structure](#export-directory-structure) 8 | - [PRAW Scrapers](#praw-scrapers) 9 | - [PRAW Livestream Scrapers](#praw-livestream-scrapers) 10 | - [Analytical Tools](#analytical-tools) 11 | - [Example Directory Structure](#example-directory-structure) 12 | 13 | ## Export File Format 14 | 15 | **All files except for those generated by the wordcloud tool are exported to JSON by default**. Wordcloud files are exported to PNG by default. 16 | 17 | `URS` supports exporting to CSV as well, but JSON is the more versatile option. 18 | 19 | ### Exporting to CSV 20 | 21 | You will have to include the `--csv` flag to export to CSV. 22 | 23 | You can only export to CSV when using: 24 | 25 | - The Subreddit scrapers. 26 | - The word frequencies generator. 27 | 28 | These tools are also suitable for CSV format and are optimized to do so if you want to use that format instead. 29 | 30 | The `--csv` flag is ignored if it is present while using any of the other scrapers. 31 | 32 | ## Export Directory Structure 33 | 34 | All exported files are saved within the `scrapes` directory and stored in a sub-directory labeled with the date. Many more sub-directories may be created in the date directory. 
Sub-directories are only created when its respective tool is run. For example, if you only use the Subreddit scraper, only the `subreddits` directory is created. 35 | 36 | ### PRAW Scrapers 37 | 38 | The `subreddits`, `redditors`, or `comments` directories may be created. 39 | 40 | ### PRAW Livestream Scrapers 41 | 42 | The `livestream` directory is created when you run any of the livestream scrapers. Within it, the `subreddits` or `redditors` directories may be created. 43 | 44 | ### Analytical Tools 45 | 46 | The `analytics` directory is created when you run any of the analytical tools. Within it, the `frequencies` or `wordclouds` directories may be created. See the [Analytical Tools](./analytical-tools/general-information.md) section for more information. 47 | 48 | ### Example Directory Structure 49 | 50 | This is the [samples][samples] directory structure generated by [`nomad`][nomad], a modern `tree` alternative I wrote in [Rust][rust]. 51 | 52 | ``` 53 | scrapes/ 54 | └── 06-02-2021 55 | ├── analytics 56 | │   ├── frequencies 57 | │   │   ├── comments 58 | │   │   │   └── What’s something from the 90s you miss_-all.json 59 | │   │   ├── livestream 60 | │   │   │   └── subreddits 61 | │   │   │   └── askreddit-comments-20_44_11-00_01_10.json 62 | │   │   └── subreddits 63 | │   │   └── cscareerquestions-search-'job'-past-year-rules.json 64 | │   └── wordcloud 65 | │   ├── comments 66 | │   │   └── What’s something from the 90s you miss_-all.png 67 | │   ├── livestream 68 | │   │   └── subreddits 69 | │   │   └── askreddit-comments-20_44_11-00_01_10.png 70 | │   └── subreddits 71 | │   └── cscareerquestions-search-'job'-past-year-rules.png 72 | ├── comments 73 | │   └── What’s something from the 90s you miss_-all.json 74 | ├── livestream 75 | │   └── subreddits 76 | │   ├── askreddit-comments-20_44_11-00_01_10.json 77 | │   └── askreddit-submissions-20_46_12-00_01_52.json 78 | ├── redditors 79 | │   └── spez-5-results.json 80 | ├── subreddits 81 | │   ├── askreddit-hot-10-results.json 82 | │   └── cscareerquestions-search-'job'-past-year-rules.json 83 | └── urs.log 84 | ``` 85 | 86 | [nomad]: https://github.com/JosephLai241/nomad 87 | [samples]: https://github.com/JosephLai241/URS/tree/samples 88 | [rust]: https://www.rust-lang.org/ 89 | -------------------------------------------------------------------------------- /urs/utils/Global.py: -------------------------------------------------------------------------------- 1 | """ 2 | Global variables 3 | ================ 4 | Variables, functions, and classes that are used throughout this program. 5 | """ 6 | 7 | 8 | import datetime as dt 9 | from typing import Any, Dict, List, Union 10 | 11 | from halo import Halo 12 | 13 | # Get current date. 14 | date = dt.datetime.now().strftime("%Y-%m-%d") 15 | 16 | # Subreddit categories. 17 | categories = ["Hot", "New", "Controversial", "Top", "Rising", "Search"] 18 | short_cat = [cat[0] for cat in categories] 19 | 20 | 21 | def convert_time(raw_timestamp: float) -> str: 22 | """ 23 | Convert UNIX time to readable format. 24 | 25 | :param float raw_timestamp: A UNIX timestamp. 26 | 27 | :returns: The timestamp converted into a readable format. 28 | :rtype: `str` 29 | """ 30 | 31 | return dt.datetime.fromtimestamp(raw_timestamp).strftime("%Y-%m-%d %H:%M:%S") 32 | 33 | 34 | def confirm_settings() -> Union[str, None]: 35 | """ 36 | Confirm scraping options. 37 | 38 | :raises ValueError: Raised if the confirmation input is invalid. 
39 | 40 | :returns: A `str` denoting whether to confirm settings and continue scraping, 41 | or `None` if the operation is cancelled. 42 | :rtype: `str | None` 43 | """ 44 | 45 | options = ["y", "n"] 46 | 47 | while True: 48 | try: 49 | confirm = input("\nConfirm options? [Y/N] ").strip().lower() 50 | 51 | if confirm == options[0]: 52 | return confirm 53 | elif confirm == options[1]: 54 | break 55 | elif confirm not in options: 56 | raise ValueError 57 | except ValueError: 58 | print("Not an option! Try again.") 59 | 60 | 61 | def make_list_dict(keys: List[str]) -> Dict[str, List[Any]]: 62 | """ 63 | Initialize a dictionary of keys with empty lists as values. 64 | 65 | :param list[str] keys: A `list[str]` of keys used to initialize a dictionary. 66 | 67 | :returns: A `dict[str, list[any]]` initialized with the keys in the `keys` 68 | `list[str]` and empty arrays as its values. 69 | """ 70 | 71 | return dict((key, []) for key in keys) 72 | 73 | 74 | def make_none_dict(keys: List[str]) -> Dict[str, None]: 75 | """ 76 | Initialize a dictionary of keys with `None` as values. 77 | 78 | :param list[str] keys: A `list[str]` of keys used to initialize a dictionary. 79 | 80 | :returns: A `dict[str, list[any]]` initialized with the keys in the `keys` 81 | `list[str]` and `None` as its values. 82 | """ 83 | 84 | return dict((key, None) for key in keys) 85 | 86 | 87 | class Status: 88 | """ 89 | Methods for defining status spinners. 90 | """ 91 | 92 | def __init__(self, after_message: str, before_message: str, color: str) -> None: 93 | """ 94 | Initialize variables used in later methods: 95 | 96 | self._after_message: success message 97 | self._before_message: status message 98 | self._color: the color of the spinner 99 | 100 | self._spinner: Halo instance 101 | 102 | :param str after_message: The success message to display. 103 | :param str before_message: The status message to display. 104 | :param str color: The spinner's color. 105 | """ 106 | 107 | self._after_message = after_message 108 | self._before_message = before_message 109 | self._color = color 110 | 111 | self.spinner = Halo(color=self._color, text=self._before_message) 112 | 113 | def start(self) -> None: 114 | """ 115 | Start the spinner. 116 | """ 117 | 118 | self.spinner.start() 119 | 120 | def succeed(self) -> None: 121 | """ 122 | Display the success spinner message. 123 | """ 124 | 125 | self.spinner.succeed(self._after_message) 126 | -------------------------------------------------------------------------------- /manual/src/scraping-reddit/subreddit.md: -------------------------------------------------------------------------------- 1 | # Table of Contents 2 | 3 | - [Subreddits](#subreddits) 4 | - [All Flags](#all-flags) 5 | - [Basic Usage](#basic-usage) 6 | - [Filename Naming Conventions](#filename-naming-conventions) 7 | - [Time Filters](#time-filters) 8 | - [Filename Naming Conventions](#filename-naming-conventions-1) 9 | - [Subreddit Rules and Post Requirements](#subreddit-rules-and-post-requirements) 10 | - [Bypassing the Final Settings Check](#bypassing-the-final-settings-check) 11 | 12 | # Subreddits 13 | 14 | ![Subreddit Demo GIF][subreddit demo] 15 | 16 | ## All Flags 17 | 18 | These are all the flags that may be used when scraping Subreddits. 19 | 20 | ``` 21 | [-r <(h|n|c|t|r|s)> []] 22 | [-y] 23 | [--csv] 24 | [--rules] 25 | ``` 26 | 27 | ## Basic Usage 28 | 29 | ``` 30 | poetry run Urs.py -r <(h|n|c|t|r|s)> 31 | ``` 32 | 33 | **Supports exporting to CSV.** To export to CSV, include the `--csv` flag. 
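For example, a hypothetical run that grabs the 10 hottest submissions from r/askreddit and exports them to CSV would look something like this:

```
poetry run Urs.py -r askreddit h 10 --csv
```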
34 | 35 | Specify Subreddits, the submission category, and how many results are returned from each scrape. I have also added a search option where you can search for keywords within a Subreddit. 36 | 37 | These are the submission categories: 38 | 39 | - Hot 40 | - New 41 | - Controversial 42 | - Top 43 | - Rising 44 | - Search 45 | 46 | ## Filename Naming Conventions 47 | 48 | The file names for all categories except for Search will follow this format: 49 | 50 | ``` 51 | [SUBREDDIT]-[POST_CATEGORY]-[N_RESULTS]-result(s).[FILE_FORMAT] 52 | ``` 53 | 54 | If you searched for keywords, file names will follow this format: 55 | 56 | ``` 57 | [SUBREDDIT]-Search-'[KEYWORDS]'.[FILE_FORMAT] 58 | ``` 59 | 60 | Scrape data is exported to the `subreddits` directory. 61 | 62 | > **_NOTE:_** Up to 100 results are returned if you search for keywords within a Subreddit. You will not be able to specify how many results to keep. 63 | 64 | # Time Filters 65 | 66 | Time filters may be applied to some categories. Here is a table of the categories on which you can apply a time filter as well as the valid time filters. 67 | 68 | | Categories | Time Filters | 69 | | ------------- | ------------- | 70 | | Controversial | All (default) | 71 | | Search | Day | 72 | | Top | Hour | 73 | | | Month | 74 | | | Week | 75 | | | Year | 76 | 77 | Specify the time filter after the number of results returned or keywords you want to search for: 78 | 79 | ``` 80 | poetry run Urs.py -r <(c|t|s)> [] 81 | ``` 82 | 83 | If no time filter is specified, the default time filter `all` is applied. The Subreddit settings table will display `None` for categories that do not offer the additional time filter option. 84 | 85 | ## Filename Naming Conventions 86 | 87 | If you specified a time filter, `-past-[TIME_FILTER]` will be appended to the file name before the file format like so: 88 | 89 | ``` 90 | [SUBREDDIT]-[POST_CATEGORY]-[N_RESULTS]-result(s)-past-[TIME_FILTER].[FILE_FORMAT] 91 | ``` 92 | 93 | Or if you searched for keywords: 94 | 95 | ``` 96 | [SUBREDDIT]-Search-'[KEYWORDS]'-past-[TIME_FILTER].[FILE_FORMAT] 97 | ``` 98 | 99 | # Subreddit Rules and Post Requirements 100 | 101 | You can also include the Subreddit's rules and post requirements in your scrape data by including the `--rules` flag. **This is only compatible with JSON**. This data will be included in the `subreddit_rules` field. 102 | 103 | If rules are included in your file, `-rules` will be appended to the end of the file name. 104 | 105 | # Bypassing the Final Settings Check 106 | 107 | After submitting the arguments and Reddit validation, `URS` will display a table of Subreddit scraping settings as a final check before executing. You can include the `-y` flag to bypass this and immediately scrape. 108 | 109 | [subreddit demo]: https://github.com/JosephLai241/URS/blob/demo-gifs/praw_scrapers/static_scrapers/Subreddit_demo.gif?raw=true 110 | -------------------------------------------------------------------------------- /urs/utils/Tools.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tools 3 | ===== 4 | Running all tools that URS has to offer. 
5 | """ 6 | 7 | 8 | import logging 9 | from argparse import ArgumentParser, Namespace 10 | from typing import Tuple 11 | 12 | from praw import Reddit 13 | 14 | from urs.analytics.Frequencies import GenerateFrequencies 15 | from urs.analytics.Wordcloud import GenerateWordcloud 16 | from urs.praw_scrapers.live_scrapers.Livestream import Livestream 17 | from urs.praw_scrapers.static_scrapers.Basic import RunBasic 18 | from urs.praw_scrapers.static_scrapers.Comments import RunComments 19 | from urs.praw_scrapers.static_scrapers.Redditor import RunRedditor 20 | from urs.praw_scrapers.static_scrapers.Subreddit import RunSubreddit 21 | from urs.praw_scrapers.utils.Validation import Validation 22 | from urs.utils.Cli import CheckCli, Parser 23 | from urs.utils.Titles import MainTitle 24 | from urs.utils.Utilities import DateTree 25 | 26 | 27 | class Run: 28 | """ 29 | Methods to call CLI and all tools. 30 | """ 31 | 32 | def __init__(self, reddit: Reddit) -> None: 33 | """ 34 | Initialize variables used in instance methods: 35 | 36 | self._reddit: Reddit instance 37 | self._args: argparse Namespace object 38 | self._parser: argparse ArgumentParser object 39 | 40 | :param Reddit reddit: PRAW `Reddit` object. 41 | """ 42 | 43 | self._reddit = reddit 44 | self._args, self._parser = self._introduce_then_args() 45 | 46 | def _introduce_then_args(self) -> Tuple[Namespace, ArgumentParser]: 47 | """ 48 | Print title, then run checks for CLI args and PRAW credentials. 49 | 50 | :returns: The `Namespace` and `ArgumentParser` objects. 51 | :rtype: `(Namespace, ArgumentParser)` 52 | """ 53 | 54 | MainTitle.title() 55 | 56 | args, parser = Parser().parse_args() 57 | CheckCli().check_args(args) 58 | 59 | return args, parser 60 | 61 | def run_urs(self) -> None: 62 | """ 63 | Switch for running all URS tools. 64 | """ 65 | 66 | if self._args.check: 67 | """ 68 | Run rate limit check. 69 | """ 70 | 71 | logging.info("RUNNING API CREDENTIALS CHECK.") 72 | logging.info("") 73 | 74 | Validation.validate_user(self._parser, self._reddit) 75 | 76 | elif self._args.tree: 77 | """ 78 | Display visual directory tree for a date (default is the current date). 79 | """ 80 | 81 | DateTree.display_tree(self._args.tree) 82 | 83 | elif ( 84 | self._args.subreddit 85 | or self._args.redditor 86 | or self._args.comments 87 | or self._args.basic 88 | ): 89 | """ 90 | Run PRAW scrapers. 91 | """ 92 | 93 | Validation.validate_user(self._parser, self._reddit) 94 | 95 | if self._args.subreddit: 96 | RunSubreddit.run(self._args, self._reddit) 97 | if self._args.redditor: 98 | RunRedditor.run(self._args, self._reddit) 99 | if self._args.comments: 100 | RunComments.run(self._args, self._reddit) 101 | elif self._args.basic: 102 | RunBasic.run(self._args, self._parser, self._reddit) 103 | 104 | elif self._args.live_subreddit or self._args.live_redditor: 105 | """ 106 | Run PRAW livestream scrapers. 107 | """ 108 | 109 | Validation.validate_user(self._parser, self._reddit) 110 | Livestream.stream(self._args, self._reddit) 111 | 112 | elif self._args.frequencies or self._args.wordcloud: 113 | """ 114 | Run analytical tools. 
115 | """ 116 | 117 | if self._args.frequencies: 118 | GenerateFrequencies.generate(self._args) 119 | if self._args.wordcloud: 120 | GenerateWordcloud.generate(self._args) 121 | -------------------------------------------------------------------------------- /.github/STYLE_GUIDE.md: -------------------------------------------------------------------------------- 1 | # URS Style Guide 2 | 3 | ## Table of Contents 4 | 5 | - [Code Formatting](#code-formatting) 6 | - [`Black` Formatting](#black-formatting) 7 | - [`isort` Formatting](#isort-formatting) 8 | - [Docstring and Type Hint Etiquette](#docstring-and-type-hint-etiquette) 9 | - [Unit Testing Code](#unit-testing-code) 10 | 11 | ## Code Formatting 12 | 13 | The rules for code formatting are very simple -- **all formatting rules are delegated to [`Black`][black] and [`isort`][isort]**. 14 | 15 | ### `Black` Formatting 16 | 17 | Use the standard formatting rules when formatting code with `Black`. Formatting code manually is a very simple command: 18 | 19 | ``` 20 | black urs/ 21 | ``` 22 | 23 | ### `isort` Formatting 24 | 25 | When formatting imports with `isort`, you will have to specify the `profile` setting when running the command to allow for interoperability between code styles. In this case, you will have to tell `isort` to use the `black` profile since we are formatting everything else with `Black`. The command looks something like this: 26 | 27 | ``` 28 | isort urs/ --profile black 29 | ``` 30 | 31 | ## Docstring and Type Hint Etiquette 32 | 33 | **Every single function needs a docstring describing what the function does, its parameters, and what it returns (if applicable)**, even if the function name is self-explanatory. The docstring format used by `URS` is the reStructuredText (RST) format. See the [Real Python reStructuredText example][real python restructuredtext example] for an idea as to how it looks. 34 | 35 | Docstrings have a max character count of 80 characters. If the function description docstring exceeds 80 characters, continue typing on a new line. If parameter, exception, or return docstrings exceed 80 characters, create a new line, tab in (4 spaces), and continue typing. 36 | 37 | Parameters, exceptions, and return statements should be grouped together/separated by a newline. Refer to the Python codeblock below for an example. 38 | 39 | **Every single function also requires type hints for its parameters and return type**, even if the parameter name is self-explanatory. See this [Real Python type hint tutorial][real python type hint tutorial] if you are unfamiliar with type hints. 40 | 41 | Below is an example of a properly documented function: 42 | 43 | ```python 44 | def add_two_numbers(first_number: int, second_number: int) -> int: 45 | """ 46 | Returns the sum of two numbers. 47 | 48 | :param int first_number: The first number to add. 49 | :param int second_number: The second number to add. 50 | 51 | :raises ValueError: Raised if either `first_number` or `second_number` is not 52 | an `int`. 53 | 54 | :returns: The sum of two numbers 55 | :rtype: `int` 56 | """ 57 | 58 | if not isinstance(first_number, int) or not isinstance(second_number, int): 59 | raise ValueError("Can only add two integers together!") 60 | 61 | return first_number + second_number 62 | ``` 63 | 64 | ## Unit Testing Code 65 | 66 | Every method in URS has to be wrapped in a class for unit testing. This makes it easier to add and group tests if features are added to a method in the future. 
67 | 68 | Showing an example would be the best way to describe how unit tests should be named and structured: 69 | 70 | `_list_switch()` is a method found in `Cli.py` within the `GetScrapeSettings` class: 71 | 72 | ```python 73 | class GetScrapeSettings(): 74 | """ 75 | Methods for creating data structures to store scrape settings. 76 | """ 77 | 78 | def _list_switch(self, args, index): 79 | ... 80 | ``` 81 | 82 | The unit test for this function is located in the `tests/` directory in the file `test_Cli.py` and looks like this: 83 | 84 | ```python 85 | class TestGetScrapeSettingsListSwitchMethod(): 86 | """ 87 | Testing GetScrapeSettings class _list_switch() method. 88 | """ 89 | 90 | def test_list_switch_method_first_switch(self): 91 | ... 92 | 93 | def test_list_switch_method_second_switch(self): 94 | ... 95 | 96 | def test_list_switch_method_third_switch(self): 97 | ... 98 | ``` 99 | 100 | The unit test class will use the following naming convention: 101 | 102 | ```python 103 | class Test[CamelCaseClassName][CamelCaseMethodName]Method(): 104 | ... 105 | ``` 106 | 107 | Include a block comment under the unit test class using the following convention: 108 | 109 | ``` 110 | Testing [ClassName] class [method_name()] method. 111 | ``` 112 | 113 | The unit test method will use the following naming convention: 114 | 115 | ```python 116 | def test_[underscored_method_name]_[underscored_test_case](self): 117 | ... 118 | ``` 119 | 120 | 121 | 122 | [black]: https://black.readthedocs.io/en/stable/ 123 | [isort]: https://pycqa.github.io/isort/ 124 | [real python type hint tutorial]: https://realpython.com/lessons/type-hinting/ 125 | [real python restructuredtext example]: https://realpython.com/documenting-python-code/#restructuredtext-example 126 | -------------------------------------------------------------------------------- /tests/test_praw_scrapers/test_static_scrapers/test_Basic.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | from io import StringIO 4 | 5 | from urs.praw_scrapers.static_scrapers import Basic 6 | 7 | ### Function names are pretty self-explanatory, so I will not be adding comments 8 | ### above the functions. 9 | 10 | ### Includes a total of 30 tests. 11 | 12 | 13 | class MakeArgs: 14 | """ 15 | Making dummy args to test Basic.py functions. 16 | """ 17 | 18 | @staticmethod 19 | def parser_for_testing_basic(): 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument("--basic", action="store_true") 22 | return parser 23 | 24 | 25 | class TestPrintSubsFindSubsMethod: 26 | """ 27 | Testing PrintSubs class _find_subs() method. 28 | """ 29 | 30 | def test_find_subs_only_returning_found_subreddits(self): 31 | pass 32 | 33 | def test_find_subs_only_returning_not_found_subreddits(self): 34 | pass 35 | 36 | def test_find_subs_returning_both_found_and_not_found_subreddits(self): 37 | pass 38 | 39 | 40 | class TestPrintSubsPrintSubredditsMethod: 41 | """ 42 | Testing PrintSubs class print_subreddits() method. 43 | """ 44 | 45 | def test_print_subreddits_only_printing_found_subreddits(self): 46 | pass 47 | 48 | def test_print_subreddits_only_printing_not_found_subreddits(self): 49 | pass 50 | 51 | def test_print_subreddits_printing_both_found_and_not_found_subreddits(self): 52 | pass 53 | 54 | 55 | class TestGetInputGetSubredditsMethod: 56 | """ 57 | Testing GetInput class get_subreddits() method. 
58 | """ 59 | 60 | def test_get_input_get_subreddits_no_input_from_user(self): 61 | pass 62 | 63 | def test_get_input_get_subreddits_valid_input(self): 64 | pass 65 | 66 | 67 | class TestGetInputUpdateMasterMethod: 68 | """ 69 | Testing GetInput class _update_master() method. 70 | """ 71 | 72 | def test_update_master_not_search_category(self): 73 | cat_i = 0 74 | test_master = {"test_subreddit": []} 75 | search_for = 10 76 | sub = "test_subreddit" 77 | 78 | Basic.GetInput()._update_master(cat_i, test_master, search_for, sub) 79 | 80 | assert test_master == {"test_subreddit": [["h", 10, None]]} 81 | 82 | def test_update_master_search_category(self): 83 | cat_i = 5 84 | test_master = {"test_subreddit": []} 85 | search_for = "test string" 86 | sub = "test_subreddit" 87 | 88 | Basic.GetInput()._update_master(cat_i, test_master, search_for, sub) 89 | 90 | assert test_master == {"test_subreddit": [["s", "test string", "all"]]} 91 | 92 | 93 | class TestGetInputGetSearchMethod: 94 | """ 95 | Testing GetInput class _get_search() method. 96 | """ 97 | 98 | def test_get_input_search_for_is_a_number(self): 99 | pass 100 | 101 | def test_get_input_search_for_is_a_string(self): 102 | pass 103 | 104 | def test_get_input_search_for_no_input(self): 105 | pass 106 | 107 | 108 | class TestGetInputGetNResultsMethod: 109 | """ 110 | Testing GetInput class _get_n_results() method. 111 | """ 112 | 113 | def test_get_n_results_normal_input(self): 114 | pass 115 | 116 | def test_get_n_results_invalid_input(self): 117 | pass 118 | 119 | def test_get_n_results_no_input(self): 120 | pass 121 | 122 | 123 | class TestGetInputGetSettingsMethod: 124 | """ 125 | Testing GetInput class get_settings() method. 126 | """ 127 | 128 | def test_get_settings_selected_search_option(self): 129 | pass 130 | 131 | def test_get_settings_selected_other_category_option(self): 132 | pass 133 | 134 | def test_get_settings_invalid_option_out_of_range(self): 135 | pass 136 | 137 | def test_get_settings_invalid_option_is_not_a_number(self): 138 | pass 139 | 140 | 141 | class TestConfirmInputConfirmSubredditsMethod: 142 | """ 143 | Testing ConfirmInput class confirm_subreddits() method. 144 | """ 145 | 146 | def test_confirm_subreddits_selected_yes(self): 147 | pass 148 | 149 | def test_confirm_subreddits_selected_no(self): 150 | pass 151 | 152 | def test_confirm_subreddits_invalid_option(self): 153 | pass 154 | 155 | 156 | class TestConfirmInputAnotherMethod: 157 | """ 158 | Testing ConfirmInput class another() method. 159 | """ 160 | 161 | def test_another_selected_yes(self): 162 | pass 163 | 164 | def test_another_selected_no(self): 165 | pass 166 | 167 | def test_another_invalid_option(self): 168 | pass 169 | 170 | 171 | class TestRunBasicCreateSettingsMethod: 172 | """ 173 | Testing RunBasic class _create_settings() method. 174 | """ 175 | 176 | def test_create_settings(self): 177 | pass 178 | 179 | 180 | class TestRunBasicPrintConfirmMethod: 181 | """ 182 | Testing RunBasic class _print_confirm() method. 183 | """ 184 | 185 | def test_print_confirm(self): 186 | pass 187 | -------------------------------------------------------------------------------- /urs/analytics/Wordcloud.py: -------------------------------------------------------------------------------- 1 | """ 2 | Wordcloud Generator 3 | =================== 4 | Generate a wordcloud based on word frequencies extracted from scraped data. 
5 | """ 6 | 7 | 8 | from argparse import Namespace 9 | from pathlib import Path 10 | from typing import List 11 | 12 | import matplotlib.pyplot as plt 13 | from colorama import Fore, Style 14 | from halo import Halo 15 | from wordcloud import WordCloud 16 | 17 | from urs.analytics.utils.PrepData import GetPath, PrepData 18 | from urs.utils.Global import Status 19 | from urs.utils.Logger import LogAnalytics 20 | from urs.utils.Titles import AnalyticsTitles 21 | 22 | 23 | class SetUpWordcloud: 24 | """ 25 | Methods for setting up the wordcloud. 26 | """ 27 | 28 | @staticmethod 29 | def initialize_wordcloud(file: List[str], scrape_type: str) -> WordCloud: 30 | """ 31 | Initialize wordcloud by setting dimensions, max font size, and generating 32 | it from word frequencies. 33 | 34 | :param list[str] file: A `list[str]` containing scrape files and file 35 | formats to generate wordclouds with. 36 | :param str scrape_type: The scrape type. 37 | 38 | :returns: A `WordCloud` instance. 39 | :rtype: `WordCloud` 40 | """ 41 | 42 | frequencies = PrepData.prep(file[0], scrape_type) 43 | 44 | initialize_status = Status( 45 | "Generated wordcloud.", "Generating wordcloud.", "white" 46 | ) 47 | 48 | initialize_status.start() 49 | wordcloud = WordCloud( 50 | height=1200, max_font_size=400, width=1600 51 | ).generate_from_frequencies(frequencies) 52 | initialize_status.succeed() 53 | 54 | return wordcloud 55 | 56 | @staticmethod 57 | def modify_wordcloud(wc: WordCloud): 58 | """ 59 | Further modify wordcloud preferences. 60 | 61 | :param WordCloud wc: The `WordCloud` instance. 62 | 63 | :returns: A `matplotlib.pyplot` instance. 64 | :rtype: `matplotlib.pyplot` 65 | """ 66 | 67 | plt.imshow(wc, interpolation="bilinear") 68 | plt.axis("off") 69 | 70 | return plt 71 | 72 | 73 | class FinalizeWordcloud: 74 | """ 75 | Methods for either saving or displaying the wordcloud. 76 | """ 77 | 78 | @LogAnalytics.log_show("wordcloud") 79 | def show_wordcloud(self, plt) -> None: 80 | """ 81 | Display wordcloud. 82 | 83 | :param matplotlib.pyplot plt: A `matplotlib.pyplot` instance. 84 | """ 85 | 86 | Halo().info(Style.BRIGHT + Fore.GREEN + "Displaying wordcloud.") 87 | print() 88 | 89 | plt.show() 90 | 91 | @LogAnalytics.log_save("wordcloud") 92 | def save_wordcloud( 93 | self, analytics_dir: str, scrape_file: List[str], wc: WordCloud 94 | ) -> str: 95 | """ 96 | Save wordcloud to file. 97 | 98 | :param str analytics_dir: the path to the directory in which the analytical 99 | data will be written. 100 | :param list[str] scrape_file: A `list[str]` containing scrape files and 101 | file formats to generate wordclouds with. 102 | :param WordCloud wc: The `WordCloud` instance. 103 | 104 | :returns: The filename for the exported wordcloud. 105 | :rtype: `str` 106 | """ 107 | 108 | filename = GetPath.name_file(analytics_dir, scrape_file[0]) 109 | 110 | split_path = list(Path(filename).parts) 111 | 112 | split_filename = split_path[-1].split(".") 113 | split_filename[-1] = scrape_file[-1] 114 | 115 | split_path[-1] = ".".join(split_filename) 116 | new_filename = "/".join(split_path) 117 | 118 | export_status = Status( 119 | Style.BRIGHT + Fore.GREEN + f"Wordcloud exported to {new_filename}.", 120 | "Exporting wordcloud.", 121 | "white", 122 | ) 123 | 124 | export_status.start() 125 | wc.to_file(new_filename) 126 | export_status.succeed() 127 | print() 128 | 129 | return new_filename 130 | 131 | 132 | class GenerateWordcloud: 133 | """ 134 | Methods for generating a wordcloud. 
135 | """ 136 | 137 | @staticmethod 138 | @LogAnalytics.generator_timer("wordcloud") 139 | def generate(args: Namespace) -> None: 140 | """ 141 | Generate wordcloud. 142 | 143 | :param Namespace args: A `Namespace` object containing all arguments used 144 | in the CLI. 145 | """ 146 | 147 | AnalyticsTitles.wc_title() 148 | 149 | for scrape_file in args.wordcloud: 150 | analytics_dir, scrape_type = GetPath.get_scrape_type( 151 | scrape_file[0], "wordcloud" 152 | ) 153 | wc = SetUpWordcloud.initialize_wordcloud(scrape_file, scrape_type) 154 | plt = SetUpWordcloud.modify_wordcloud(wc) 155 | 156 | FinalizeWordcloud().show_wordcloud( 157 | plt 158 | ) if args.nosave else FinalizeWordcloud().save_wordcloud( 159 | analytics_dir, scrape_file, wc 160 | ) 161 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | # Summary 2 | 3 | Delete this line and write your summary here. Section your summary by relevance if it is lengthy. 4 | 5 | # Motivation/Context 6 | 7 | Delete this line and write your motivation/context here. Section your motivation/context by relevance if it is lengthy. 8 | 9 | # New Dependencies 10 | 11 | ``` 12 | Delete this line and paste your new dependencies here. Put "None" here if there are no new dependencies. 13 | ``` 14 | 15 | # Issue Fix or Enhancement Request 16 | 17 | **Put "N/A" in this block if this is not applicable.** 18 | 19 | If it fixes an open issue, link the issue and write a summary for the bug and fix like so: 20 | 21 | - Fixes #issue_number_here. 22 | - **Bug:** Write a brief description of the bug. 23 | - **Fix:** Write a brief description of the fix. 24 | - If applicable, add additional information for the fix. 25 | 26 | Alternatively, if it resolves an open feature/enhancement request, link the request in this pull request like so: 27 | 28 | - Resolves #issue_number_here. 29 | - **Enhancement/Feature Request:** Write a brief description of the enhancement/feature request. 30 | - **Enhancement or Feature:** Write a brief description of what is new in this pull request. 31 | - If applicable, add additional information for the enhancement or feature. 32 | 33 | If neither of the above apply, use the templates described above and replace the issue number with a summary of the new changes you have made. 34 | 35 | # Type of Change 36 | 37 | **Please delete options that are not relevant.** 38 | 39 | - [x] Bug Fix (non-breaking change which fixes an issue) 40 | - [x] Bug Fix - Breaking Change (breaking change causes existing functionality to not work as expected) 41 | - [x] Code Refactor 42 | - [x] New Feature (non-breaking change which adds functionality) 43 | - [x] New Feature - Breaking Change (breaking change causes existing functionality to not work as expected) 44 | - [x] This change requires a documentation update 45 | 46 | # Breaking Change 47 | 48 | **Put "N/A" in this block if this is not applicable.** 49 | 50 | Delete this line and describe how URS breaks. Then provide a code block or screenshots of the **_entire_** traceback underneath your description. Section your description by relevance if it is lengthy. 51 | 52 | ``` 53 | Paste entire traceback here. Make sure the traceback is formatted correctly. 
54 | ``` 55 | 56 | # List the Most Significant Changes That Have Been Made 57 | 58 | **Please delete sections and/or fields that are not relevant.** 59 | 60 | ## Added 61 | 62 | - Summary of some new feature 63 | - Description of the addition 64 | - Summary of some new feature 65 | - Description of the addition 66 | - Summary of some new feature 67 | - Description of the addition 68 | - Summary of some new feature 69 | - Description of the addition 70 | 71 | ## Changed 72 | 73 | - Summary of something that changed 74 | - Description of the change 75 | - Summary of something that changed 76 | - Description of the change 77 | - Summary of something that changed 78 | - Description of the change 79 | - Summary of something that changed 80 | - Description of the change 81 | 82 | ## Deprecated 83 | 84 | - Summary of something that has been deprecated 85 | - Summary of what has been deprecated 86 | - Summary of something that has been deprecated 87 | - Summary of what has been deprecated 88 | - Summary of something that has been deprecated 89 | - Summary of what has been deprecated 90 | - Summary of something that has been deprecated 91 | - Summary of what has been deprecated 92 | 93 | # How Has This Been Tested? 94 | 95 | **Put "N/A" in this block if this is not applicable.** 96 | 97 | Please describe the tests that you ran to verify your changes. Provide instructions so I can reproduce. Please also list any relevant details for your test configuration. Section your tests by relevance if it is lengthy. An example outline is shown below: 98 | 99 | - Summary of a test here 100 | - Details here with relevant test commands underneath. 101 | - Ran `test command here`. 102 | - If applicable, more details about the command underneath. 103 | - Then ran `another test command here`. 104 | 105 | ## Test Configuration 106 | 107 | **Put "N/A" in this block if this is not applicable.** 108 | 109 | - Python version: 3.x.x 110 | 111 | If applicable, describe more configuration settings. An example outline is shown below: 112 | 113 | - Summary goes here. 114 | - Configuration 1. 115 | - Configuration 2. 116 | - If applicable, provide extra details underneath a configuration. 117 | - Configuration 3. 118 | 119 | # `pyproject.toml` 120 | 121 | ```toml 122 | Paste your new `pyproject.toml` here. Put "N/A" in this block if this is not applicable. 123 | ``` 124 | 125 | # Checklist 126 | 127 | Tip: You can check off items by writing an "x" in the brackets, e.g. `[x]`. 128 | 129 | - [ ] My code follows the [style guidelines][style guide] of this project. 130 | - [ ] I have performed a self-review of my own code, including testing to ensure my fix is effective or that my feature works. 131 | - [ ] My changes generate no new warnings. 132 | - [ ] I have commented my code, providing a summary of the functionality of each method, particularly in areas that may be hard to understand. 133 | - [ ] I have made corresponding changes to the documentation. 134 | - [ ] I have performed a self-review of this Pull Request template, ensuring the Markdown file renders correctly. 135 | 136 | 137 | 138 | [style guide]: STYLE_GUIDE.md 139 | -------------------------------------------------------------------------------- /urs/analytics/Frequencies.py: -------------------------------------------------------------------------------- 1 | """ 2 | Frequencies generator 3 | ===================== 4 | Get frequencies for words that are found in submission titles, bodies, and/or 5 | comments within scraped data. 
6 | """
7 |
8 |
9 | from argparse import Namespace
10 | from typing import Any, Dict, List, Literal, Tuple, Union
11 |
12 | from colorama import Fore, Style
13 | from halo import Halo
14 |
15 | from urs.analytics.utils.PrepData import GetPath, PrepData
16 | from urs.utils.Export import Export
17 | from urs.utils.Global import Status
18 | from urs.utils.Logger import LogAnalytics
19 | from urs.utils.Titles import AnalyticsTitles
20 |
21 |
22 | class Sort:
23 |     """
24 |     Methods for sorting the frequencies data.
25 |     """
26 |
27 |     def get_data(self, scrape_file: List[str]) -> Tuple[str, Dict[str, int]]:
28 |         """
29 |         Get data from the scrape file.
30 |
31 |         :param list[str] scrape_file: A `list[str]` containing the path to the
32 |             scrape file to generate frequencies for.
33 |
34 |         :returns: The path to the directory in which the analytical data will be
35 |             written, and a `dict[str, int]` containing extracted scrape data.
36 |         :rtype: `(str, dict[str, int])`
37 |         """
38 |
39 |         analytics_dir, scrape_type = GetPath.get_scrape_type(
40 |             scrape_file[0], "frequencies"
41 |         )
42 |
43 |         return analytics_dir, PrepData.prep(scrape_file[0], scrape_type)
44 |
45 |     def name_and_create_dir(
46 |         self, analytics_dir: str, args: Namespace, scrape_file: List[str]
47 |     ) -> Tuple[Literal["csv", "json"], str]:
48 |         """
49 |         Name the new file and create the analytics directory.
50 |
51 |         :param str analytics_dir: The path to the directory in which the analytical
52 |             data will be written.
53 |         :param Namespace args: A `Namespace` object containing all arguments used
54 |             in the CLI.
55 |         :param list[str] scrape_file: A `list[str]` containing scrape files and
56 |             file formats to generate frequencies with.
57 |
58 |         :returns: The file format and the filename.
59 |         :rtype: `(str, str)`
60 |         """
61 |
62 |         f_type = "csv" if args.csv else "json"
63 |
64 |         filename = GetPath.name_file(analytics_dir, scrape_file[0])
65 |
66 |         return f_type, filename
67 |
68 |     def create_csv(self, plt_dict: Dict[str, int]) -> Dict[str, List[Union[str, int]]]:
69 |         """
70 |         Create CSV structure for exporting.
71 |
72 |         :param dict[str, int] plt_dict: A `dict[str, int]` containing word frequency
73 |             data.
74 |
75 |         :returns: A `dict[str, list[str | int]]` containing word frequency data.
76 |         :rtype: `Dict[str, List[Union[str, int]]]`
77 |         """
78 |
79 |         overview = {"words": [], "frequencies": []}
80 |
81 |         for word, frequency in plt_dict.items():
82 |             overview["words"].append(word)
83 |             overview["frequencies"].append(frequency)
84 |
85 |         return overview
86 |
87 |     def create_json(
88 |         self, plt_dict: Dict[str, int], scrape_file: List[str]
89 |     ) -> Dict[str, Any]:
90 |         """
91 |         Create JSON structure for exporting.
92 |
93 |         :param dict[str, int] plt_dict: A `dict[str, int]` containing word frequency
94 |             data.
95 |         :param list[str] scrape_file: A `list[str]` containing files and file
96 |             formats to generate frequencies with.
97 |
98 |         :returns: A `dict[str, Any]` containing the path to the raw scrape file and
99 |             the word frequency data.
100 |         :rtype: `Dict[str, Any]`
101 |         """
102 |
103 |         return {"raw_file": scrape_file[0], "data": plt_dict}
104 |
105 |
106 | class ExportFrequencies:
107 |     """
108 |     Methods for exporting the frequencies data.
109 |     """
110 |
111 |     @staticmethod
112 |     @LogAnalytics.log_export
113 |     def export(data: Dict[str, Any], f_type: str, filename: str) -> None:
114 |         """
115 |         Write data dictionary to JSON or CSV.
116 |
117 |         :param dict[str, Any] data: A dictionary containing word frequency data.
118 | :param str filename: The file name. 119 | """ 120 | 121 | Export.write_json(data, filename) if f_type == "json" else Export.write_csv( 122 | data, filename 123 | ) 124 | 125 | 126 | class GenerateFrequencies: 127 | """ 128 | Methods for generating word frequencies. 129 | """ 130 | 131 | @staticmethod 132 | @LogAnalytics.generator_timer("frequencies") 133 | def generate(args: Namespace) -> None: 134 | """ 135 | Generate frequencies. 136 | 137 | :param Namespace args: A `Namespace` object containing all arguments used 138 | in the CLI. 139 | """ 140 | 141 | AnalyticsTitles.f_title() 142 | 143 | for scrape_file in args.frequencies: 144 | analytics_dir, plt_dict = Sort().get_data(scrape_file) 145 | f_type, filename = Sort().name_and_create_dir( 146 | analytics_dir, args, scrape_file 147 | ) 148 | 149 | Halo().info("Generating frequencies.") 150 | print() 151 | data = ( 152 | Sort().create_csv(plt_dict) 153 | if args.csv 154 | else Sort().create_json(plt_dict, scrape_file) 155 | ) 156 | 157 | export_status = Status( 158 | Style.BRIGHT 159 | + Fore.GREEN 160 | + f"Frequencies exported to {'/'.join(filename.split('/')[filename.split('/').index('scrapes'):])}.", 161 | "Exporting frequencies.", 162 | "white", 163 | ) 164 | 165 | export_status.start() 166 | ExportFrequencies.export(data, f_type, filename) 167 | export_status.succeed() 168 | print() 169 | -------------------------------------------------------------------------------- /tests/test_analytics/test_utils/test_PrepData.py: -------------------------------------------------------------------------------- 1 | """ 2 | Testing `PrepData.py`. 3 | """ 4 | 5 | 6 | from urs.analytics.utils import PrepData 7 | 8 | 9 | class TestGetPathGetScrapeTypeMethod: 10 | """ 11 | Testing GetPath class get_scrape_type() method. 12 | """ 13 | 14 | def test_get_scrape_type_method_valid_filepath(self): 15 | test_path = "../scrapes/some_date/test/some_other_dir/some_file.json" 16 | 17 | analytics_dir, scrape_dir = PrepData.GetPath.get_scrape_type( 18 | test_path, "frequencies" 19 | ) 20 | 21 | assert ( 22 | analytics_dir 23 | == "../scrapes/some_date/analytics/frequencies/test/some_other_dir" 24 | ) 25 | assert scrape_dir == "test" 26 | 27 | def test_get_scrape_type_method_invalid_directory(self): 28 | test_path = "../scrapes/some_date/test/some_other_dir/some_file.txt" 29 | 30 | try: 31 | PrepData.GetPath.get_scrape_type(test_path, "frequencies") 32 | assert False 33 | except SystemExit: 34 | assert True 35 | 36 | def test_get_scrape_type_method_invalid_file_type(self): 37 | test_path = "../scrapes/some_date/analytics/some_other_dir/some_file.json" 38 | 39 | try: 40 | PrepData.GetPath.get_scrape_type(test_path, "wordcloud") 41 | assert False 42 | except SystemExit: 43 | assert True 44 | 45 | 46 | class TestGetPathNameFileMethod: 47 | """ 48 | Testing GetPath class name_file() method. 49 | """ 50 | 51 | def test_name_file_method(self): 52 | test_analytics = ( 53 | "../scrapes/some_date/analytics/frequencies/test/some_other_dir" 54 | ) 55 | test_path = "../something/another_thing/a_third_thing/test.json" 56 | 57 | filename = PrepData.GetPath.name_file(test_analytics, test_path) 58 | 59 | assert ( 60 | filename 61 | == "..\\scrapes\\some_date\\analytics\\frequencies\\test\\some_other_dir/test.json" 62 | if "\\" in filename 63 | else "../scrapes/some_date/analytics/frequencies/test/some_other_dir/test.json" 64 | ) 65 | 66 | 67 | class TestExtractExtractMethod: 68 | """ 69 | Testing Extract class extract() method. 
70 | """ 71 | 72 | def test_extract_method(self): 73 | pass 74 | 75 | 76 | class TestCleanDataRemoveExtrasMethod: 77 | """ 78 | Testing CleanData class _remove_extras() method. 79 | """ 80 | 81 | def test_remove_extras_method(self): 82 | test = "[t(e)s,t:i;n.}g{at`r]ing" 83 | 84 | assert PrepData.CleanData._remove_extras(test) == "t e s t i n g a s t r ing" 85 | 86 | 87 | class TestCleanDataCountWordsMethod: 88 | """ 89 | Testing CleanData class count_words() method. 90 | """ 91 | 92 | def test_count_words_method(self): 93 | plt_dict = dict() 94 | obj = { 95 | "first": "Some text here in the first field [(,", 96 | "second": "Another line of words here", 97 | } 98 | 99 | PrepData.CleanData.count_words("second", obj, plt_dict) 100 | 101 | assert plt_dict["Another"] == 1 102 | 103 | 104 | class TestPrepSubredditPrepSubredditMethod: 105 | """ 106 | Testing PrepSubreddit class prep_subreddit() method. 107 | """ 108 | 109 | def test_prep_subreddit_method(self): 110 | data = [ 111 | {"selftext": "This is a test selftext", "title": "This is a test title"}, 112 | {"selftext": "This is a test selftext", "title": "This is a test title"}, 113 | ] 114 | 115 | word_count = PrepData.PrepSubreddit.prep_subreddit(data) 116 | 117 | assert word_count["This"] == 4 118 | 119 | 120 | class TestPrepRedditorPrepRedditorMethod: 121 | """ 122 | Testing PrepRedditor class prep_redditor() method. 123 | """ 124 | 125 | def test_prep_redditor_method(self): 126 | data = { 127 | "interactions": { 128 | "comments": [ 129 | { 130 | "type": "comment", 131 | "body": "This is a test body", 132 | } 133 | ], 134 | "submissions": [ 135 | { 136 | "type": "submission", 137 | "selftext": "This is a test selftext", 138 | "title": "This is a test title", 139 | } 140 | ], 141 | "hidden": ["FORBIDDEN"], 142 | } 143 | } 144 | 145 | word_count = PrepData.PrepRedditor.prep_redditor(data) 146 | 147 | assert word_count["This"] == 3 148 | assert word_count["selftext"] == 1 149 | assert word_count["body"] == 1 150 | assert "FORBIDDEN" not in word_count.keys() 151 | 152 | 153 | class TestPrepCommentsPrepCommentsMethod: 154 | """ 155 | Testing PrepComments class prep_comments() method. 
156 |     """
157 |
158 |     def test_prep_comments_method_prep_raw_comments(self):
159 |         data = {
160 |             "scrape_settings": {"style": "raw"},
161 |             "data": {
162 |                 "comments": [
163 |                     {"body": "This is a test body"},
164 |                     {"body": "This is a test body"},
165 |                 ]
166 |             },
167 |         }
168 |
169 |         word_count = PrepData.PrepComments.prep_comments(data)
170 |
171 |         assert word_count["This"] == 2
172 |
173 |     def test_prep_comments_method_prep_structured_comments(self):
174 |         data = {
175 |             "scrape_settings": {"style": "structured"},
176 |             "data": {
177 |                 "comments": [
178 |                     {
179 |                         "body": "This is a test body",
180 |                         "replies": [{"body": "This is a test body", "replies": []}],
181 |                     }
182 |                 ]
183 |             },
184 |         }
185 |
186 |         word_count = PrepData.PrepComments.prep_comments(data)
187 |
188 |         assert word_count["test"] == 2
189 |
--------------------------------------------------------------------------------
/manual/src/scraping-reddit/redditor.md:
--------------------------------------------------------------------------------
1 | # Table of Contents
2 |
3 | - [Redditors](#redditors)
4 |   - [All Flags](#all-flags)
5 |   - [Usage](#usage)
6 |   - [Redditor Interaction Attributes](#redditor-interaction-attributes)
7 |   - [Reddit Object Attributes](#reddit-object-attributes)
8 |   - [File Naming Conventions](#file-naming-conventions)
9 |
10 | # Redditors
11 |
12 | ![Redditor Demo GIF][redditor demo]
13 |
14 | \*_This GIF has been cut for demonstration purposes._
15 |
16 | > **_NOTE:_** If you are not allowed to access a Redditor's lists, PRAW will raise a 403 HTTP Forbidden exception and the program will append `"FORBIDDEN"` underneath that section in the exported file.
17 |
18 | ## All Flags
19 |
20 | These are all the flags that may be used when scraping Redditors.
21 |
22 | ```
23 | [-u <redditor> <n_results>]
24 | ```
25 |
26 | > **_NOTE:_** The number of results returned is applied to all attributes. I have not implemented code that allows users to specify a different number of results for individual attributes.
27 |
28 | ## Usage
29 |
30 | ```
31 | poetry run Urs.py -u <redditor> <n_results>
32 | ```
33 |
34 | Redditor information will be included in the `information` field and includes the following attributes:
35 |
36 | - `comment_karma`
37 | - `created_utc`
38 | - `fullname`
39 | - `has_verified_email`
40 | - `icon_img`
41 | - `id`
42 | - `is_employee`
43 | - `is_friend`
44 | - `is_mod`
45 | - `is_gold`
46 | - `link_karma`
47 | - `name`
48 | - `subreddit`
49 | - `trophies`
50 |
51 | ## Redditor Interaction Attributes
52 |
53 | Redditor interactions will be included in the `interactions` field. Here is a table of all Redditor interaction attributes that are also included, how they are sorted, and what type of Reddit objects are included in each.
54 | 55 | | Attribute Name | Sorted By/Time Filter | Reddit Objects | 56 | | -------------- | ------------------------------------------- | ------------------------ | 57 | | Comments | Sorted By: New | Comments | 58 | | Controversial | Time Filter: All | Comments and submissions | 59 | | Downvoted | Sorted By: New | Comments and submissions | 60 | | Gilded | Sorted By: New | Comments and submissions | 61 | | Gildings | Sorted By: New | Comments and submissions | 62 | | Hidden | Sorted By: New | Comments and submissions | 63 | | Hot | Determined by other Redditors' interactions | Comments and submissions | 64 | | Moderated | N/A | Subreddits | 65 | | Multireddits | N/A | Multireddits | 66 | | New | Sorted By: New | Comments and submissions | 67 | | Saved | Sorted By: New | Comments and submissions | 68 | | Submissions | Sorted By: New | Submissions | 69 | | Top | Time Filter: All | Comments and submissions | 70 | | Upvoted | Sorted By: New | Comments and submissions | 71 | 72 | These attributes contain comments or submissions. Subreddit attributes are also included within both. 73 | 74 | ## Reddit Object Attributes 75 | 76 | This is a table of all attributes that are included for each Reddit object: 77 | 78 | | Subreddits | Comments | Submissions | Multireddits | Trophies | 79 | | ----------------------- | --------------- | --------------------- | ------------------ | ------------- | 80 | | `can_assign_link_flair` | `body` | `author` | `can_edit` | `award_id` | 81 | | `can_assign_user_flair` | `body_html` | `created_utc` | `copied_from` | `description` | 82 | | `created_utc` | `created_utc` | `distinguished` | `created_utc` | `icon_40` | 83 | | `description` | `distinguished` | `edited` | `description_html` | `icon_70` | 84 | | `description_html` | `edited` | `id` | `description_md` | `name` | 85 | | `display_name` | `id` | `is_original_content` | `display_name` | `url` | 86 | | `id` | `is_submitter` | `is_self` | `name` | | 87 | | `name` | `link_id` | `link_flair_text` | `nsfw` | | 88 | | `nsfw` | `parent_id` | `locked` | `subreddits` | | 89 | | `public_description` | `score` | `name` | `visibility` | | 90 | | `spoilers_enabled` | `stickied` | `num_comments` | | | 91 | | `subscribers` | \*`submission` | `nsfw` | | | 92 | | `user_is_banned` | `subreddit_id` | `permalink` | | | 93 | | `user_is_moderator` | | `score` | | | 94 | | `user_is_subscriber` | | `selftext` | | | 95 | | | | `spoiler` | | | 96 | | | | `stickied` | | | 97 | | | | \*`subreddit` | | | 98 | | | | `title` | | | 99 | | | | `upvote_ratio` | | | 100 | | | | `url` | | | 101 | 102 | \* Contains additional metadata. 103 | 104 | ## File Naming Conventions 105 | 106 | The file names will follow this format: 107 | 108 | ``` 109 | [USERNAME]-[N_RESULTS]-result(s).json 110 | ``` 111 | 112 | Scrape data is exported to the `redditors` directory. 113 | 114 | [redditor demo]: https://github.com/JosephLai241/URS/blob/demo-gifs/praw_scrapers/static_scrapers/Redditor_demo.gif?raw=true 115 | -------------------------------------------------------------------------------- /urs/utils/Utilities.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities 3 | ========= 4 | Miscellaneous utilities for URS. 
5 | """ 6 | 7 | 8 | import logging 9 | from pathlib import Path, PurePath 10 | from typing import List, Tuple 11 | 12 | import rich 13 | from colorama import Fore, Style 14 | from halo import Halo 15 | from rich.filesize import decimal 16 | from rich.tree import Tree 17 | 18 | from urs.utils.Global import Status 19 | from urs.utils.Titles import Errors 20 | 21 | 22 | class DateTree: 23 | """ 24 | Methods for creating a visual representation of a target date directory located 25 | within the `scrapes` directory. 26 | """ 27 | 28 | @staticmethod 29 | def _check_date_format(date: str) -> str: 30 | """ 31 | Check if the date format is valid. Revise date separation character if 32 | '/' was used instead of '-'. 33 | 34 | :param str date: The date of the scrapes directory. 35 | 36 | :raises TypeError: Raised if an invalid date format is entered. 37 | 38 | :returns: The date to search for. 39 | :rtype: `str` 40 | """ 41 | 42 | split_date = [char for char in date] 43 | 44 | if not any(char in split_date for char in ["-", "/"]) or len(split_date) < 10: 45 | raise TypeError 46 | 47 | if "/" in split_date: 48 | for i in range(len(split_date)): 49 | if split_date[i] == "/": 50 | split_date[i] = "-" 51 | 52 | return "".join(split_date) 53 | 54 | @staticmethod 55 | def _find_date_directory(date: str) -> bool: 56 | """ 57 | Traverse the `scrapes/` directory to find the corresponding date directory. 58 | 59 | :param str date: The date of the scrapes directory. 60 | 61 | :returns: Whether the date directory exists within the `scrapes/` directory. 62 | :rtype: `bool` 63 | """ 64 | 65 | dir_exists = False 66 | 67 | scrapes_dir = f"{Path(Path.cwd()).parents[0]}/scrapes" 68 | for path in Path(scrapes_dir).iterdir(): 69 | if path.is_dir() and PurePath(path).name == date: 70 | dir_exists = True 71 | 72 | return dir_exists 73 | 74 | @staticmethod 75 | def _create_stack(directory: str, tree: Tree) -> List[Tuple[Path, Tree]]: 76 | """ 77 | Create a stack containing paths within a directory. 78 | 79 | :param str directory: The path to the directory. 80 | :param Tree tree: The `Tree` instance. 81 | 82 | :returns: A `list[(Path, Tree)]` of paths and sub-`Tree`s. 83 | :rtype: `list[(Path, Tree)]` 84 | """ 85 | 86 | return [ 87 | (path, tree) 88 | for path in sorted( 89 | Path(directory).iterdir(), 90 | key=lambda path: (path.is_file(), path.name.lower()), 91 | ) 92 | ] 93 | 94 | @staticmethod 95 | def _create_directory_tree(date_dir: str, tree: Tree) -> None: 96 | """ 97 | Create the directory Tree based on the date_dir Path using iterative 98 | depth-first search. 99 | 100 | :param str date_dir: The path to the directory. 101 | :param Tree tree: The `Tree` instance. 
102 | """ 103 | 104 | build_tree_status = Status( 105 | "Displaying directory tree.", 106 | f"Building directory tree for {date_dir}.", 107 | "cyan", 108 | ) 109 | 110 | stack = DateTree._create_stack(date_dir, tree) 111 | 112 | visited = set() 113 | visited.add(Path(date_dir)) 114 | 115 | build_tree_status.start() 116 | while stack: 117 | current = stack.pop(0) 118 | current_path, current_tree = current[0], current[1] 119 | 120 | if current_path in visited: 121 | continue 122 | elif current_path.is_dir(): 123 | sub_tree = current_tree.add(f"[bold blue]{current_path.name}") 124 | sub_paths = DateTree._create_stack(current_path, sub_tree) 125 | 126 | stack = sub_paths + stack 127 | elif current_path.is_file(): 128 | file_size = current_path.stat().st_size 129 | current_tree.add(f"[bold]{current_path.name} [{decimal(file_size)}]") 130 | 131 | visited.add(current_path) 132 | 133 | build_tree_status.succeed() 134 | print() 135 | 136 | @staticmethod 137 | def display_tree(search_date: str) -> None: 138 | """ 139 | Display the scrapes directory for a specific date. 140 | 141 | :param str search_date: The date within the `scrapes/` directory to search 142 | for. 143 | """ 144 | 145 | logging.info(f"Running tree command...") 146 | logging.info("") 147 | 148 | try: 149 | search_date = DateTree._check_date_format(search_date) 150 | 151 | find_dir_halo = Halo( 152 | color="white", 153 | text=f"Searching for {search_date} directory within `scrapes`.", 154 | ) 155 | 156 | find_dir_halo.start() 157 | 158 | dir_exists = DateTree._find_date_directory(search_date) 159 | if dir_exists: 160 | find_dir_halo.succeed(text=f"URS was run on {search_date}.") 161 | 162 | date_dir = f"{Path(Path.cwd()).parents[0]}/scrapes/{search_date}" 163 | 164 | tree = Tree(f"[bold blue]scrapes/") 165 | dir_tree = tree.add(f"[bold blue]{search_date}") 166 | 167 | DateTree._create_directory_tree(date_dir, dir_tree) 168 | 169 | rich.print(tree) 170 | logging.info( 171 | f"Displayed directory tree for scrapes run on {search_date}." 172 | ) 173 | logging.info("") 174 | print() 175 | else: 176 | error_messsage = f"URS was not run on {search_date}." 177 | find_dir_halo.fail(Fore.RED + Style.BRIGHT + error_messsage) 178 | print() 179 | 180 | logging.critical(error_messsage) 181 | logging.critical("ABORTING URS.\n") 182 | 183 | quit() 184 | except TypeError: 185 | logging.critical("INVALID DATE FORMAT.") 186 | logging.critical("ABORTING URS.\n") 187 | 188 | Errors.e_title( 189 | "INVALID DATE FORMAT. ACCEPTED FORMATS: MM-DD-YYYY or MM/DD/YYYY." 190 | ) 191 | quit() 192 | -------------------------------------------------------------------------------- /urs/praw_scrapers/utils/Objectify.py: -------------------------------------------------------------------------------- 1 | """ 2 | Create Reddit objects 3 | ===================== 4 | Defining methods to create JSON serializable objects from Reddit metadata. 5 | """ 6 | 7 | 8 | from typing import Any, Dict 9 | 10 | from praw.models import Comment, Multireddit, Submission, Subreddit 11 | 12 | from urs.utils.Global import convert_time 13 | 14 | 15 | class Objectify: 16 | """ 17 | Methods for creating JSON serializable objects from Reddit metadata. 18 | """ 19 | 20 | def make_comment(self, comment: Comment, include_all: bool) -> Dict[str, Any]: 21 | """ 22 | Make a comment item. 23 | 24 | :param Comment comment: PRAW Comment object. 25 | :param bool include_all: Whether the `"type"` field should be included. 26 | 27 | :returns: A `dict[str, Any]` containing comment metadata. 
28 | :rtype: `dict[str, Any]` 29 | """ 30 | 31 | comment_object = { 32 | "author": "u/" + comment.author.name 33 | if hasattr(comment.author, "name") 34 | else "[deleted]", 35 | "body": comment.body, 36 | "body_html": comment.body_html, 37 | "created_utc": convert_time(comment.created_utc), 38 | "distinguished": comment.distinguished, 39 | "edited": comment.edited 40 | if comment.edited == False 41 | else convert_time(comment.edited), 42 | "id": comment.id, 43 | "is_submitter": comment.is_submitter, 44 | "link_id": comment.link_id, 45 | "parent_id": comment.parent_id, 46 | "score": comment.score, 47 | "stickied": comment.stickied, 48 | } 49 | 50 | if include_all: 51 | comment_object["submission"] = self.make_submission( 52 | include_all, comment.submission 53 | ) 54 | comment_object["subreddit_id"] = comment.subreddit_id 55 | comment_object["type"] = "comment" 56 | 57 | comment_object = dict(sorted(comment_object.items())) 58 | 59 | return comment_object 60 | 61 | def make_multireddit(self, multireddit: Multireddit) -> Dict[str, Any]: 62 | """ 63 | Make a multireddit item. 64 | 65 | :param Multireddit multireddit: PRAW Multireddit object. 66 | 67 | :returns: A `dict[str, Any]` containing Multireddit data. 68 | :rtype: `Dict[str, Any]` 69 | """ 70 | 71 | multireddit_object = { 72 | "can_edit": multireddit.can_edit, 73 | "copied_from": multireddit.copied_from, 74 | "created_utc": convert_time(multireddit.created_utc), 75 | "description_html": multireddit.description_html, 76 | "description_md": multireddit.description_md, 77 | "display_name": multireddit.display_name, 78 | "name": multireddit.name, 79 | "nsfw": multireddit.over_18, 80 | "subreddits": [], 81 | "visibility": multireddit.visibility, 82 | } 83 | 84 | if multireddit.subreddits: 85 | for subreddit in multireddit.subreddits: 86 | subreddit = self.make_subreddit(subreddit) 87 | multireddit_object["subreddits"].append(subreddit) 88 | 89 | return multireddit_object 90 | 91 | def make_submission( 92 | self, include_all: bool, submission: Submission 93 | ) -> Dict[str, Any]: 94 | """ 95 | Make a submission object. 96 | 97 | :param bool include_all: Whether the `"type"` field should be included. 98 | :param Submission submission: PRAW Submission object. 99 | 100 | :returns: A `dict[str, Any]` containing Submission data. 
101 | :rtype: `Dict[str, Any]` 102 | """ 103 | 104 | submission_object = { 105 | "author": "u/" + submission.author.name 106 | if hasattr(submission.author, "name") 107 | else "[deleted]", 108 | "created_utc": convert_time(submission.created_utc), 109 | "distinguished": submission.distinguished, 110 | "edited": submission.edited 111 | if submission.edited == False 112 | else convert_time(submission.edited), 113 | "id": submission.id, 114 | "is_original_content": submission.is_original_content, 115 | "is_self": submission.is_self, 116 | "link_flair_text": submission.link_flair_text, 117 | "locked": submission.locked, 118 | "name": submission.name, 119 | "nsfw": submission.over_18, 120 | "num_comments": submission.num_comments, 121 | "permalink": submission.permalink, 122 | "score": submission.score, 123 | "selftext": submission.selftext, 124 | "spoiler": submission.spoiler, 125 | "stickied": submission.stickied, 126 | "title": submission.title, 127 | "upvote_ratio": submission.upvote_ratio, 128 | "url": submission.url, 129 | } 130 | 131 | if include_all: 132 | submission_object["subreddit"] = self.make_subreddit(submission.subreddit) 133 | submission_object["type"] = "submission" 134 | 135 | submission_object = dict(sorted(submission_object.items())) 136 | 137 | return submission_object 138 | 139 | def make_subreddit(self, subreddit: Subreddit) -> Dict[str, Any]: 140 | """ 141 | Make a Subreddit object. 142 | 143 | :param Subreddit subreddit: PRAW Subreddit object. 144 | 145 | :returns: A `dict[str, Any]` containing Subreddit data. 146 | :rtype: `Dict[str, Any]` 147 | """ 148 | 149 | return { 150 | "can_assign_link_flair": subreddit.can_assign_link_flair, 151 | "can_assign_user_flair": subreddit.can_assign_user_flair, 152 | "created_utc": convert_time(subreddit.created_utc), 153 | "description": subreddit.description, 154 | "description_html": subreddit.description_html, 155 | "display_name": subreddit.display_name, 156 | "id": subreddit.id, 157 | "name": subreddit.name, 158 | "nsfw": subreddit.over18, 159 | "public_description": subreddit.public_description, 160 | "spoilers_enabled": subreddit.spoilers_enabled, 161 | "subscribers": subreddit.subscribers, 162 | "user_is_banned": subreddit.user_is_banned, 163 | "user_is_moderator": subreddit.user_is_moderator, 164 | "user_is_subscriber": subreddit.user_is_subscriber, 165 | } 166 | -------------------------------------------------------------------------------- /urs/utils/Titles.py: -------------------------------------------------------------------------------- 1 | """ 2 | Titles 3 | ====== 4 | Display ASCII art that is used throughout this program. 5 | """ 6 | 7 | 8 | from colorama import Fore, Style 9 | from prawcore import PrawcoreException 10 | 11 | 12 | class MainTitle: 13 | """ 14 | Method for printing the main URS title. 15 | """ 16 | 17 | @staticmethod 18 | def title() -> None: 19 | """ 20 | Print URS title. 21 | """ 22 | 23 | print( 24 | Fore.WHITE 25 | + Style.BRIGHT 26 | + r""" 27 | __ __ _ __ ____ 28 | /\ \/\ \/\`'__\/',__\ 29 | \ \ \_\ \ \ \//\__, `\ 30 | \ \____/\ \_\\/\____/ 31 | \/___/ \/_/ \/___/ 32 | """ 33 | ) 34 | 35 | 36 | class PRAWTitles: 37 | """ 38 | Methods for printing PRAW scraper titles. 39 | """ 40 | 41 | @staticmethod 42 | def r_title() -> None: 43 | """ 44 | Print Subreddit scraper title. 
45 | """ 46 | 47 | print( 48 | Fore.WHITE 49 | + Style.BRIGHT 50 | + r""" 51 | _ __ 52 | /\`'__\ 53 | \ \ \/ 54 | \ \_\ 55 | \/_/ 56 | """ 57 | ) 58 | 59 | @staticmethod 60 | def u_title() -> None: 61 | """ 62 | Print Redditor scraper title. 63 | """ 64 | 65 | print( 66 | Fore.WHITE 67 | + Style.BRIGHT 68 | + r""" 69 | __ __ 70 | /\ \/\ \ 71 | \ \ \_\ \ 72 | \ \____/ 73 | \/___/ 74 | """ 75 | ) 76 | 77 | @staticmethod 78 | def c_title() -> None: 79 | """ 80 | Print comments scraper title. 81 | """ 82 | 83 | print( 84 | Fore.WHITE 85 | + Style.BRIGHT 86 | + r""" 87 | ___ 88 | /'___\ 89 | /\ \__/ 90 | \ \____\ 91 | \/____/ 92 | """ 93 | ) 94 | 95 | @staticmethod 96 | def b_title() -> None: 97 | """ 98 | Print basic scraper title. 99 | """ 100 | 101 | print( 102 | Fore.WHITE 103 | + Style.BRIGHT 104 | + r""" 105 | __ 106 | /\ \ 107 | \ \ \____ 108 | \ \ '__`\ 109 | \ \ \L\ \ 110 | \ \_,__/ 111 | \/___/... Only scrapes Subreddits. 112 | """ 113 | ) 114 | 115 | @staticmethod 116 | def lr_title() -> None: 117 | """ 118 | Print Subreddit livestream title. 119 | """ 120 | 121 | print( 122 | Fore.WHITE 123 | + Style.BRIGHT 124 | + r""" 125 | ___ 126 | /\_ \ 127 | \//\ \ _ __ ⏺️ 128 | \ \ \ /\`'__\ 129 | \_\ \_\ \ \/ 130 | /\____\\ \_\ 131 | \/____/ \/_/ 132 | """ 133 | ) 134 | 135 | @staticmethod 136 | def lu_title() -> None: 137 | """ 138 | Print Redditor livestream title. 139 | """ 140 | 141 | print( 142 | Fore.WHITE 143 | + Style.BRIGHT 144 | + r""" 145 | ___ 146 | /\_ \ 147 | \//\ \ __ __⏺️ 148 | \ \ \ /\ \/\ \ 149 | \_\ \_\ \ \_\ \ 150 | /\____\\ \____/ 151 | \/____/ \/___/ 152 | """ 153 | ) 154 | 155 | 156 | class AnalyticsTitles: 157 | """ 158 | Methods for printing analytical tool titles. 159 | """ 160 | 161 | @staticmethod 162 | def f_title() -> None: 163 | """ 164 | Print frequencies title. 165 | """ 166 | 167 | print( 168 | Fore.WHITE 169 | + Style.BRIGHT 170 | + r""" 171 | ___ 172 | /'___\ 📈 173 | /\ \__/ 174 | \ \ ,__\ 175 | \ \ \_/ 176 | \ \_\ 177 | \/_/ 178 | """ 179 | ) 180 | 181 | @staticmethod 182 | def wc_title() -> None: 183 | """ 184 | Print wordcloud title. 185 | """ 186 | 187 | print( 188 | Fore.WHITE 189 | + Style.BRIGHT 190 | + r""" 191 | __ __ __ ___ 🖌️ 192 | /\ \/\ \/\ \ /'___\ 193 | \ \ \_/ \_/ \/\ \__/ 194 | \ \___x___/'\ \____\ 195 | \/__//__/ \/____/ 196 | """ 197 | ) 198 | 199 | 200 | class Errors: 201 | """ 202 | Methods for printing error titles. 203 | """ 204 | 205 | @staticmethod 206 | def e_title(invalid_message: str) -> None: 207 | """ 208 | Print error title. 209 | 210 | :param str invalid_message: The specific error message in arguments. 211 | """ 212 | 213 | print( 214 | Fore.RED 215 | + Style.BRIGHT 216 | + rf""" 217 | __ 218 | /'__`\ 219 | /\ __/ 220 | \ \____\ 221 | \/____/... {invalid_message} 222 | 223 | Please recheck args or refer to help or usage examples. 224 | """ 225 | ) 226 | 227 | @staticmethod 228 | def n_title(reddit_object: str) -> None: 229 | """ 230 | Print exiting title when there are no Reddit objects left to scrape. 231 | 232 | :param str reddit_object: The Reddit object type. 233 | """ 234 | 235 | print( 236 | Fore.RED 237 | + Style.BRIGHT 238 | + rf""" 239 | ___ 240 | /' _`\ 241 | /\ \/\ \ 242 | \ \_\ \_\ 243 | \/_/\/_/... No {reddit_object} to scrape! Aborting URS. 244 | """ 245 | ) 246 | 247 | @staticmethod 248 | def i_title(error: str) -> None: 249 | """ 250 | Print invalid file title. 251 | 252 | :param str error: The specific error associated with invalid files. 
253 | """ 254 | 255 | print( 256 | Fore.RED 257 | + Style.BRIGHT 258 | + rf""" 259 | __ 260 | /\_\ 261 | \/\ \ 262 | \ \ \ 263 | \ \_\ 264 | \/_/... {error} 265 | """ 266 | ) 267 | 268 | @staticmethod 269 | def p_title(error: PrawcoreException) -> None: 270 | """ 271 | Print PRAW error title. 272 | 273 | :param PrawcoreException error: The `PrawcoreException` raised when API 274 | validation fails. 275 | """ 276 | 277 | print( 278 | Fore.RED 279 | + Style.BRIGHT 280 | + rf""" 281 | _____ 282 | /\ '__`\ 283 | \ \ \L\ \ 284 | \ \ ,__/... Please recheck API credentials or your internet connection. 285 | \ \ \/ 286 | \ \_\ 287 | \/_/ 288 | 289 | Prawcore exception: {error} 290 | """ 291 | ) 292 | 293 | @staticmethod 294 | def l_title(reset_timestamp: str) -> None: 295 | """ 296 | Print rate limit error title. 297 | 298 | :param str reset_timestamp: The reset timestamp provided by PRAW. 299 | """ 300 | 301 | print( 302 | Fore.RED 303 | + Style.BRIGHT 304 | + rf""" 305 | __ 306 | /\ \ 307 | \ \ \ 308 | \ \ \ __ 309 | \ \ \L\ \ 310 | \ \____/ 311 | \/___/... You have reached your rate limit. 312 | 313 | Please try again when your rate limit is reset: {reset_timestamp} 314 | """ 315 | ) 316 | 317 | @staticmethod 318 | def ex_title(error: Exception) -> None: 319 | """ 320 | Print export error title. 321 | 322 | :param Exception error: The `Exception` raised while exporting scrape data. 323 | """ 324 | 325 | print( 326 | Fore.RED 327 | + Style.BRIGHT 328 | + rf""" 329 | __ 330 | /\ \ 331 | \ \ \ 332 | \ \ \ 333 | \ \_\ 334 | \/\_\ 335 | \/_/... An error has occurred while exporting scraped data. 336 | 337 | {error} 338 | """ 339 | ) 340 | -------------------------------------------------------------------------------- /tests/test_praw_scrapers/test_static_scrapers/test_Redditor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Testing `Redditor.py`. 3 | """ 4 | 5 | 6 | import os 7 | 8 | import praw 9 | from dotenv import load_dotenv 10 | 11 | from urs.praw_scrapers.static_scrapers import Redditor 12 | from urs.utils import Global 13 | 14 | 15 | class Login: 16 | """ 17 | Create a Reddit object with PRAW API credentials. 18 | """ 19 | 20 | @staticmethod 21 | def create_reddit_object(): 22 | load_dotenv() 23 | 24 | return praw.Reddit( 25 | client_id=os.getenv("CLIENT_ID"), 26 | client_secret=os.getenv("CLIENT_SECRET"), 27 | user_agent=os.getenv("USER_AGENT"), 28 | username=os.getenv("REDDIT_USERNAME"), 29 | password=os.getenv("REDDIT_PASSWORD"), 30 | ) 31 | 32 | 33 | class TestGetInteractionsMakeJsonSkeletonMethod: 34 | """ 35 | Testing GetInteractions class _make_json_skeleton() method. 36 | """ 37 | 38 | def test_make_json_skeleton(self): 39 | reddit = Login.create_reddit_object() 40 | spez = reddit.redditor("spez") 41 | 42 | test_skeleton = { 43 | "scrape_settings": {"redditor": "spez", "n_results": 1}, 44 | "data": {"information": None, "interactions": {}}, 45 | } 46 | 47 | redditor, skeleton = Redditor.GetInteractions._make_json_skeleton( 48 | 1, reddit, "spez" 49 | ) 50 | 51 | assert redditor == spez 52 | assert skeleton == test_skeleton 53 | 54 | 55 | class TestGetInteractionsGetTrophiesMethod: 56 | """ 57 | Testing GetInteractions class _get_trophies() method. 
58 | """ 59 | 60 | def test_get_trophies(self): 61 | reddit = Login.create_reddit_object() 62 | spez = reddit.redditor("spez") 63 | 64 | trophies = Redditor.GetInteractions._get_trophies(spez) 65 | 66 | assert isinstance(trophies, list) == True 67 | assert len(trophies) > 0 68 | 69 | 70 | class TestGetUserSubredditMethod: 71 | """ 72 | Testing GetInteractions class _get_user_subreddit() method. 73 | """ 74 | 75 | def test_get_user_subreddit(self): 76 | reddit = Login.create_reddit_object() 77 | spez = reddit.redditor("spez") 78 | 79 | redditor_subreddit = Redditor.GetInteractions._get_user_subreddit(spez) 80 | 81 | dict_fields = [ 82 | "can_assign_link_flair", 83 | "can_assign_user_flair", 84 | "created_utc", 85 | "description", 86 | "description_html", 87 | "display_name", 88 | "id", 89 | "name", 90 | "nsfw", 91 | "public_description", 92 | "spoilers_enabled", 93 | "subscribers", 94 | "user_is_banned", 95 | "user_is_moderator", 96 | "user_is_subscriber", 97 | ] 98 | 99 | assert isinstance(redditor_subreddit, dict) == True 100 | 101 | for key in redditor_subreddit.keys(): 102 | assert key in dict_fields 103 | 104 | 105 | class TestGetInteractionsGetUserInfoMethod: 106 | """ 107 | Testing GetInteractions class _get_user_info() method. 108 | """ 109 | 110 | def test_get_user_info(self): 111 | reddit = Login.create_reddit_object() 112 | spez = reddit.redditor("spez") 113 | 114 | skeleton = { 115 | "scrape_settings": {"redditor": "spez", "n_results": 1}, 116 | "data": {"information": None, "interactions": {}}, 117 | } 118 | 119 | Redditor.GetInteractions._get_user_info(spez, skeleton) 120 | 121 | assert skeleton["data"]["information"] != None 122 | 123 | information_fields = [ 124 | "comment_karma", 125 | "created_utc", 126 | "fullname", 127 | "has_verified_email", 128 | "icon_img", 129 | "id", 130 | "is_employee", 131 | "is_friend", 132 | "is_mod", 133 | "is_gold", 134 | "link_karma", 135 | "name", 136 | "subreddit", 137 | "trophies", 138 | ] 139 | for field in skeleton["data"]["information"].keys(): 140 | assert True if field in information_fields else False 141 | 142 | 143 | class TestGetInteractionsMakeInteractionsListsMethod: 144 | """ 145 | Testing GetInteractions class _make_interactions_lists() method. 146 | """ 147 | 148 | def test_make_interactions_lists(self): 149 | skeleton = {"data": {"interactions": {}}} 150 | 151 | Redditor.GetInteractions._make_interactions_lists(skeleton) 152 | 153 | interaction_titles = [ 154 | "comments", 155 | "controversial", 156 | "downvoted", 157 | "gilded", 158 | "gildings", 159 | "hidden", 160 | "hot", 161 | "moderated", 162 | "multireddits", 163 | "new", 164 | "saved", 165 | "submissions", 166 | "top", 167 | "upvoted", 168 | ] 169 | for field in skeleton["data"]["interactions"].keys(): 170 | assert True if field in interaction_titles else False 171 | 172 | assert skeleton["data"]["interactions"][field] == [] 173 | 174 | 175 | class TestGetInteractionsGetUserInteractionsMethod: 176 | """ 177 | Testing GetInteractions class _get_user_interactions() method. 
178 | """ 179 | 180 | def test_get_user_interactions(self): 181 | reddit = Login.create_reddit_object() 182 | spez = reddit.redditor("spez") 183 | 184 | skeleton = { 185 | "scrape_settings": {"redditor": "spez", "n_results": 1}, 186 | "data": {"information": None, "interactions": {}}, 187 | } 188 | 189 | Redditor.GetInteractions._get_user_interactions(1, spez, skeleton) 190 | 191 | assert skeleton["data"]["information"] == None 192 | assert skeleton["data"]["interactions"] 193 | 194 | 195 | class TestGetInteractionsGetMethod: 196 | """ 197 | Testing GetInteractions class get() method. 198 | """ 199 | 200 | def test_get(self): 201 | reddit = Login.create_reddit_object() 202 | spez = reddit.redditor("spez") 203 | 204 | skeleton = Redditor.GetInteractions.get(1, reddit, spez) 205 | 206 | assert skeleton["scrape_settings"]["redditor"] == "spez" 207 | assert skeleton["scrape_settings"]["n_results"] == 1 208 | 209 | assert skeleton["data"]["information"] != None 210 | 211 | assert len(skeleton["data"]["interactions"]["comments"]) 212 | assert len(skeleton["data"]["interactions"]["controversial"]) 213 | assert len(skeleton["data"]["interactions"]["gilded"]) 214 | assert skeleton["data"]["interactions"]["gildings"][0] == "FORBIDDEN" 215 | assert skeleton["data"]["interactions"]["hidden"][0] == "FORBIDDEN" 216 | assert not skeleton["data"]["interactions"]["hot"] 217 | assert len(skeleton["data"]["interactions"]["moderated"]) 218 | assert "multireddits" in skeleton["data"]["interactions"].keys() 219 | assert not skeleton["data"]["interactions"]["new"] 220 | assert skeleton["data"]["interactions"]["saved"][0] == "FORBIDDEN" 221 | assert not skeleton["data"]["interactions"]["submissions"] 222 | assert len(skeleton["data"]["interactions"]["top"]) 223 | assert skeleton["data"]["interactions"]["upvoted"][0] == "FORBIDDEN" 224 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ``` 2 | __ __ _ __ ____ 3 | /\ \/\ \/\`'__\/',__\ 4 | \ \ \_\ \ \ \//\__, `\ 5 | \ \____/\ \_\\/\____/ 6 | \/___/ \/_/ \/___/ 7 | ``` 8 | 9 | > **U**niversal **R**eddit **S**craper - A comprehensive Reddit scraping command-line tool written in Python. 10 | 11 | ![GitHub Workflow Status (Python)](https://img.shields.io/github/actions/workflow/status/JosephLai241/URS/python.yml?label=Python&logo=python&logoColor=blue) 12 | ![GitHub Workflow Status (Rust)](https://img.shields.io/github/actions/workflow/status/JosephLai241/URS/rust.yml?label=Rust&logo=rust&logoColor=orange) 13 | [![Codecov](https://img.shields.io/codecov/c/gh/JosephLai241/URS?logo=Codecov)][codecov] 14 | [![GitHub release (latest by date)](https://img.shields.io/github/v/release/JosephLai241/URS)][releases] 15 | ![Total lines](https://img.shields.io/tokei/lines/github/JosephLai241/URS) 16 | ![License](https://img.shields.io/github/license/JosephLai241/URS) 17 | 18 | # Sponsors 19 | 20 |

21 |
22 | Thordata's tools are particularly useful in scenarios that require large-scale web scraping
23 | through their Web Scraper API, API-based data extraction, or reliable Proxy infrastructure.
24 | If you plan to use Thordata's tools, you can support the project via this affiliate link.
25 |
124 | 125 | ## [Subreddit Scraping][subreddit scraping manual link] 126 | 127 | ![subreddit demo] 128 | 129 | ## [Redditor Scraping][redditor scraping manual link] 130 | 131 | ![redditor demo] 132 | 133 | ## [Submission Comments Scraping][submission comments scraping manual link] 134 | 135 | ![submission comments demo] 136 | 137 | ## [Livestreaming Reddit][livestream scraping manual link] 138 | 139 | ![livestream subreddit demo] 140 | 141 | ## [Generating Word Frequencies][frequencies scraping manual link] 142 | 143 | ![frequencies demo] 144 | 145 | ## [Generating Wordclouds][wordcloud scraping manual link] 146 | 147 | ![wordcloud demo] 148 | 149 | ## [Checking PRAW Rate Limits][check praw rate limits manual link] 150 | 151 | ![check praw rate limits demo] 152 | 153 | ## [Displaying Directory Tree][display directory tree manual link] 154 | 155 | ![display directory tree demo] 156 | 157 | [check praw rate limits demo]: https://github.com/JosephLai241/URS/blob/demo-gifs/utilities/check_rate_limit_demo.gif 158 | [check praw rate limits manual link]: https://josephlai241.github.io/URS/utilities/rate-limit-checking.html 159 | [codecov]: https://codecov.io/gh/JosephLai241/URS 160 | [contributing manual link]: https://josephlai241.github.io/URS/contributing/before-making-pull-or-feature-requests.html 161 | [display directory tree demo]: https://github.com/JosephLai241/URS/blob/demo-gifs/utilities/tree_demo.gif 162 | [display directory tree manual link]: https://josephlai241.github.io/URS/utilities/tree.html 163 | [frequencies demo]: https://github.com/JosephLai241/URS/blob/demo-gifs/analytical_tools/frequencies_generator_demo.gif 164 | [frequencies scraping manual link]: https://josephlai241.github.io/URS/analytical-tools/frequencies-and-wordclouds.html#generating-word-frequencies 165 | [livestream scraping manual link]: https://josephlai241.github.io/URS/livestreaming-reddit/general-information.html 166 | [livestream subreddit demo]: https://github.com/JosephLai241/URS/blob/demo-gifs/praw_scrapers/live_scrapers/livestream_subreddit_demo.gif 167 | [lolfilmworks]: https://github.com/lolfilmworks 168 | [mdbook]: https://github.com/rust-lang/mdBook 169 | [nomad]: https://github.com/JosephLai241/nomad 170 | [praw]: https://praw.readthedocs.io/en/stable/ 171 | [redditor demo]: https://github.com/JosephLai241/URS/blob/demo-gifs/praw_scrapers/static_scrapers/Redditor_demo.gif 172 | [redditor scraping manual link]: https://josephlai241.github.io/URS/scraping-reddit/redditor.html 173 | [releases]: https://github.com/JosephLai241/URS/releases 174 | [submission comments demo]: https://github.com/JosephLai241/URS/blob/demo-gifs/praw_scrapers/static_scrapers/submission_comments_demo.gif 175 | [submission comments scraping manual link]: https://josephlai241.github.io/URS/scraping-reddit/submission-comments.html 176 | [subreddit demo]: https://github.com/JosephLai241/URS/blob/demo-gifs/praw_scrapers/static_scrapers/Subreddit_demo.gif 177 | [subreddit scraping manual link]: https://josephlai241.github.io/URS/scraping-reddit/subreddit.html 178 | [urs manual]: https://josephlai241.github.io/URS 179 | [urs project email]: mailto:urs_project@protonmail.com 180 | [wordcloud demo]: https://github.com/JosephLai241/URS/blob/demo-gifs/analytical_tools/wordcloud_generator_demo.gif 181 | [wordcloud scraping manual link]: https://josephlai241.github.io/URS/analytical-tools/frequencies-and-wordclouds.html#generating-wordclouds 182 | -------------------------------------------------------------------------------- 
/urs/praw_scrapers/utils/Validation.py: -------------------------------------------------------------------------------- 1 | """ 2 | PRAW validation 3 | =============== 4 | Validation methods for PRAW credentials and scrapers. 5 | """ 6 | 7 | 8 | import logging 9 | from argparse import ArgumentParser 10 | from typing import Dict, List, Tuple, Union 11 | 12 | from colorama import Fore, Style 13 | from halo import Halo 14 | from praw import Reddit, models 15 | from prawcore import NotFound, PrawcoreException 16 | from prettytable import PrettyTable 17 | 18 | from urs.utils.Global import Status 19 | from urs.utils.Logger import LogError 20 | from urs.utils.Titles import Errors 21 | 22 | 23 | class Validation: 24 | """ 25 | Methods for validating PRAW credentials and Subreddits, Redditors, and URLs. 26 | """ 27 | 28 | @staticmethod 29 | @LogError.log_rate_limit 30 | def get_rate_info(reddit: Reddit) -> Dict[str, Union[str, int, None]]: 31 | """ 32 | Get user rate limit information. Quits the program if the user does not 33 | have any requests left in the current rate limit window. 34 | 35 | :param Reddit reddit: Reddit instance. 36 | 37 | :returns: PRAW rate limits. 38 | :rtype: `dict[str, str | int | None]` 39 | """ 40 | 41 | return models.Auth(_data=dict(), reddit=reddit).limits 42 | 43 | @staticmethod 44 | def print_rate_limit(reddit: Reddit) -> None: 45 | """ 46 | Print user rate limit information. This includes the number of requests 47 | remaining, a timestamp for when the rate limit counters will be reset, and 48 | the number of requests that have been made in the current rate limit window. 49 | 50 | :param Reddit reddit: Reddit instance. 51 | """ 52 | 53 | user_limits = Validation.get_rate_info(reddit) 54 | 55 | pretty_limits = PrettyTable() 56 | pretty_limits.field_names = ["Remaining Requests", "Used Requests"] 57 | pretty_limits.add_row([int(user_limits["remaining"]), int(user_limits["used"])]) 58 | 59 | pretty_limits.align = "c" 60 | 61 | print(pretty_limits) 62 | 63 | @staticmethod 64 | def validate_user(parser: ArgumentParser, reddit: Reddit) -> None: 65 | """ 66 | Check if PRAW credentials are valid, then print rate limit PrettyTable. 67 | 68 | :param ArgumentParser parser: The `ArgumentParser` object. 69 | :param Reddit reddit: Reddit instance. 70 | """ 71 | 72 | login_spinner = Halo(color="white", text="Logging in.") 73 | login_spinner.start() 74 | 75 | try: 76 | redditor = reddit.user.me() 77 | 78 | login_spinner.succeed( 79 | Style.BRIGHT + Fore.GREEN + f"Successfully logged in as u/{redditor}." 80 | ) 81 | print() 82 | 83 | Validation.print_rate_limit(reddit) 84 | 85 | logging.info(f"Successfully logged in as u/{redditor}.") 86 | logging.info("") 87 | except PrawcoreException as error: 88 | login_spinner.fail(Style.BRIGHT + Fore.RED + "Failed to log in.") 89 | 90 | Errors.p_title(error) 91 | logging.critical("LOGIN FAILED.") 92 | logging.critical(f"PRAWCORE EXCEPTION: {error}.") 93 | logging.critical("ABORTING URS.\n") 94 | parser.exit() 95 | 96 | @staticmethod 97 | def _check_subreddits( 98 | invalid: List[str], object_list: List[str], reddit: Reddit, valid: List[str] 99 | ) -> None: 100 | """ 101 | Check if Subreddits are valid. 102 | 103 | :param list[str] invalid: An empty `list[str]` to store invalid Subreddits. 104 | :param list[str] object_list: A list of Subreddits to validate. 105 | :param Reddit reddit: Reddit instance. 106 | :param list[str] valid: An empty `list[str]` to store valid Subreddits. 
107 | """ 108 | 109 | for sub in object_list: 110 | try: 111 | reddit.subreddits.search_by_name(sub, exact=True) 112 | valid.append(sub) 113 | except NotFound: 114 | invalid.append(sub) 115 | 116 | @staticmethod 117 | def _check_redditors( 118 | invalid: List[str], object_list: List[str], reddit: Reddit, valid: List[str] 119 | ) -> None: 120 | """ 121 | Check if Redditors are valid. 122 | 123 | :param list[str] invalid: An empty `list[str]` to store invalid Redditors. 124 | :param list[str] object_list: A list of Redditors to validate. 125 | :param Reddit reddit: Reddit instance. 126 | :param list[str] valid: An empty `list[str]` to store valid Redditors. 127 | """ 128 | 129 | for user in object_list: 130 | try: 131 | reddit.redditor(user).id 132 | valid.append(user) 133 | except NotFound: 134 | invalid.append(user) 135 | 136 | @staticmethod 137 | def _check_submissions( 138 | invalid: List[str], object_list: List[str], reddit: Reddit, valid: List[str] 139 | ) -> None: 140 | """ 141 | Check if submission URLs are valid. 142 | 143 | :param list[str] invalid: An empty `list[str]` to store invalid submissions. 144 | :param list[str] object_list: A list of submissions to validate. 145 | :param Reddit reddit: Reddit instance. 146 | :param list[str] valid: An empty `list[str]` to store valid submissions. 147 | """ 148 | 149 | for post in object_list: 150 | try: 151 | reddit.submission(url=post).title 152 | valid.append(post) 153 | except Exception: 154 | invalid.append(post) 155 | 156 | @staticmethod 157 | def check_existence( 158 | object_list: List[str], reddit: Reddit, scraper_type: str 159 | ) -> Tuple[List[str], List[str]]: 160 | """ 161 | Check whether Reddit objects are valid. 162 | 163 | :param list[str] object_list: A `list[str]` of Reddit objects to check. 164 | :param Reddit reddit: Reddit instance. 165 | :param str scraper_type: The scraper type. 166 | 167 | :raises NotFound: Raised if invalid Subreddits or Redditors were provided. 168 | :raises Exception: Raised if invalid submission URLs were provided 169 | 170 | :returns: A `list[str]` of invalid and valid Reddit objects 171 | :rtype: `(list[str], list[str])` 172 | """ 173 | 174 | invalid = [] 175 | valid = [] 176 | 177 | if scraper_type == "subreddit": 178 | Validation._check_subreddits(invalid, object_list, reddit, valid) 179 | elif scraper_type == "redditor": 180 | Validation._check_redditors(invalid, object_list, reddit, valid) 181 | elif scraper_type == "comments": 182 | Validation._check_submissions(invalid, object_list, reddit, valid) 183 | 184 | return invalid, valid 185 | 186 | @staticmethod 187 | def validate( 188 | object_list: List[str], reddit: Reddit, scraper_type: str 189 | ) -> Tuple[List[str], List[str]]: 190 | """ 191 | Check if Subreddit(s), Redditor(s), or submission(s) exist and catch PRAW 192 | exceptions. Log invalid Reddit objects to `urs.log` if applicable. 193 | 194 | :param list[str] object_list: A `list[str]` of Reddit objects to check. 195 | :param Reddit reddit: Reddit instance. 196 | :param str scrape_type: The scraper type. 197 | 198 | :returns: A `list[str]` of invalid and valid Reddit objects. 
199 | :rtype: `(list[str], list[str])` 200 | """ 201 | 202 | object_type = ( 203 | "submission" if scraper_type == "comments" else scraper_type.capitalize() 204 | ) 205 | 206 | check_status = Status( 207 | f"Finished {object_type} validation.", 208 | f"Validating {object_type}(s)", 209 | "white", 210 | ) 211 | 212 | check_status.start() 213 | 214 | logging.info(f"Validating {object_type}(s)...") 215 | logging.info("") 216 | 217 | invalid, valid = Validation.check_existence(object_list, reddit, scraper_type) 218 | 219 | check_status.succeed() 220 | print() 221 | 222 | if invalid: 223 | warning_message = ( 224 | f"The following {object_type}s were not found and will be skipped:" 225 | ) 226 | 227 | print(Fore.YELLOW + Style.BRIGHT + warning_message) 228 | print(Fore.YELLOW + Style.BRIGHT + "-" * len(warning_message)) 229 | print(*invalid, sep="\n") 230 | 231 | logging.warning(f"Failed to validate the following {object_type}s:") 232 | logging.warning(f"{invalid}") 233 | logging.warning("Skipping.") 234 | logging.info("") 235 | 236 | if not valid: 237 | logging.critical(f"ALL {object_type.upper()}S FAILED VALIDATION.") 238 | Errors.n_title(object_type + "s") 239 | logging.critical(f"NO {object_type.upper()}S LEFT TO SCRAPE.") 240 | logging.critical("ABORTING URS.\n") 241 | 242 | quit() 243 | 244 | return invalid, valid 245 | -------------------------------------------------------------------------------- /supplemental_docs/The Forest.md: -------------------------------------------------------------------------------- 1 | # The Forest 2 | 3 | ## Table of Contents 4 | 5 | * [Introduction](#introduction) 6 | + [Motivation](#motivation) 7 | + [Inspiration](#inspiration) 8 | * [How the Forest Works](#how-the-forest-works) 9 | + [The `CommentNode`](#the-commentnode) 10 | + [The `Forest`](#the-forest-1) 11 | + [Serializing the `Forest`](#serializing-the-forest) 12 | 13 | ## Introduction 14 | 15 | ### Motivation 16 | 17 | I am a self-taught software developer who just recently graduated from college and am currently looking for my first full-time job. I do not have a computer science degree, so I have had to teach myself a ton of concepts that I would have learned if I got the degree. A class I wish I was able to take in college is data structures and algorithms because that seems to be all the buzz when it comes to the technical interview, which I unfortunately struggle with greatly due to my lack of experience and practice. 18 | 19 | Recently (March 2021) I have been teaching myself DSA. Implementing simple examples of each topic within DSA was not so bad (I am currently working on a study guide/reference repository containing these implementations in both Python and Rust that I will make public soon), but practicing Leetcode problems was and still is a difficult process for me. I will continue to power through the struggle because my livelihood and future career depends on it, though. 20 | 21 | While it has not been a smooth journey, I have come to realize how useful DSA is and am implementing what I have learned in a real-world use case. I do not think I would have been able to figure out a solution to the structured comments scraper's prior shortcomings if I had not studied this area within computer science. I recently implemented my first [trie][trie] and was fascinated by how abstract data structures worked. 
I immediately realized I needed to use a tree data structure for the structured comments scraper in order to take it to the next level, which is the purpose of [this pull request][Pull Request]. 22 | 23 | ### Inspiration 24 | 25 | The `Forest` is named after PRAW's [`CommentForest`][CommentForest]. The `CommentForest` does not return comments in structured format, so I wrote my own implementation of it. 26 | 27 | The trie was a huge inspiration for the `Forest`. I will quickly explain my implementation of the trie node. 28 | 29 | ```python 30 | class TrieNode(): 31 | def __init__(self, char, is_word): 32 | self.char = char 33 | self.is_word = is_word 34 | self.children = dict() 35 | ``` 36 | 37 | Each node of the trie contains a character, a boolean flag indicating whether the node denotes the end of a word, and holds a dictionary filled with child nodes as values and their respective characters as keys. I could have used an array and the indices within it to emulate a dictionary, but I figured I could save some access time at the cost of extra space. 38 | 39 | Anyways, the trie implementation is very similar to how the `Forest` works. 40 | 41 | ## How the Forest Works 42 | 43 | I will strip docstring comments from the source code to keep it relatively short. 44 | 45 | ### The `CommentNode` 46 | 47 | I created a class `CommentNode` to store each comment's metadata and replies: 48 | 49 | ```python 50 | class CommentNode(): 51 | def __init__(self, metadata): 52 | for key, value in metadata.items(): 53 | self.__setattr__(key, value) 54 | 55 | self.replies = [] 56 | ``` 57 | 58 | I used `__setattr__()` because the root node defers from the standard comment node schema. By using `__setattr__()`, `CommentNode` attributes will be dynamically set based on the `metadata` dictionary that has been passed in. `self.replies` holds additional `CommentNode`s. 59 | 60 | ### The `Forest` 61 | 62 | Next, I created a class `Forest` which holds the root node and includes methods for insertion. 63 | 64 | **The Root Node** 65 | 66 | First, let's go over the root node. 67 | 68 | ```python 69 | class Forest(): 70 | def __init__(self): 71 | self.root = CommentNode({ "id": "abc123" }) 72 | ``` 73 | 74 | The only key in the dictionary passed into `CommentNode` is `id`, therefore the root `CommentNode` will only contain the attributes `self.id` and `self.replies`. A mock submission ID is shown. The actual source code will pull the submission's ID based on the URL that was passed into the `-c` flag and set the `id` value accordingly. 75 | 76 | Before I get to the insertion methods, I will explain how comments and their replies are linked. 77 | 78 | **How PRAW Comments Are Linked** 79 | 80 | PRAW returns all submission comments by level order. This means all top levels are returned first, followed by all second-level replies, then third, so on and so forth. 81 | 82 | I will create some mock comment objects to demonstrate. Here is a top level comment corresponding to the mock submisssion ID. Note the `parent_id` contains the submission's `id`, which is stored in `self.root.id`: 83 | 84 | ```json 85 | { 86 | "author": "u/asdfasdfasdfasdf", 87 | "body": "A top level comment here.", 88 | "created_utc": "06-06-2006 06:06:06", 89 | "distinguished": null, 90 | "edited": false, 91 | "id": "qwerty1", 92 | "is_submitter": false, 93 | "link_id": "t3_asdfgh", 94 | "parent_id": "t3_abc123", 95 | "score": 666, 96 | "stickied": false 97 | } 98 | ``` 99 | 100 | Here is a second-level reply to the top comment. 
Note the `parent_id` contains the top comment's `id`:
101 | 
102 | ```json
103 | {
104 |     "author": "u/hjklhjklhjklhjkl",
105 |     "body": "A reply here.",
106 |     "created_utc": "06-06-2006 18:06:06",
107 |     "distinguished": null,
108 |     "edited": false,
109 |     "id": "hjkl234",
110 |     "is_submitter": true,
111 |     "link_id": "t3_1a2b3c",
112 |     "parent_id": "t1_qwerty1",
113 |     "score": 6,
114 |     "stickied": false
115 | }
116 | ```
117 | 
118 | This pattern continues all the way down to the last level of comments. It is now very easy to link the correct comments together. I do this by calling `split("_", 1)` on the `parent_id` and then getting the second item in the split list to compare values. I also specify the `maxsplit` parameter to force one split.
119 | 
120 | **The Insertion Methods**
121 | 
122 | I then defined the methods for `CommentNode` insertion.
123 | 
124 | ```python
125 | def _dfs_insert(self, new_comment):
126 |     stack = []
127 |     stack.append(self.root)
128 | 
129 |     visited = set()
130 |     visited.add(self.root)
131 | 
132 |     found = False
133 |     while not found:
134 |         current_comment = stack.pop(0)
135 | 
136 |         for reply in current_comment.replies:
137 |             if new_comment.parent_id.split("_", 1)[1] == reply.id:
138 |                 reply.replies.append(new_comment)
139 |                 found = True
140 |             else:
141 |                 if reply not in visited:
142 |                     stack.insert(0, reply)
143 |                     visited.add(reply)
144 | 
145 | def seed(self, new_comment):
146 |     parent_id = new_comment.parent_id.split("_", 1)[1]
147 | 
148 |     self.root.replies.append(new_comment) \
149 |         if parent_id == getattr(self.root, "id") \
150 |         else self._dfs_insert(new_comment)
151 | ```
152 | 
153 | I implemented the [depth-first search][Depth-First Search] algorithm to find a comment's parent node and insert it into the parent node's `replies` array. I defined a separate `visited` set to keep track of visited `CommentNode`s to avoid an infinite loop of inserting `CommentNode`s that were already visited into the `stack`. At first I wrote a recursive version of depth-first search, but then opted for an iterative version because recursion would not scale well for submissions that included large amounts of comments, i.e., it could trigger a stack overflow.
154 | 
155 | Within the `seed` method, I first check if the `CommentNode` is a top level comment by comparing its parent ID to the submission ID. Depth-first search is triggered if the `CommentNode` is not a top level comment.
156 | 
157 | ### Serializing the `Forest`
158 | 
159 | Since Python's built-in JSON module can only handle primitive types that have a direct JSON equivalent, a custom encoder is necessary to convert the `Forest` into JSON format. I defined this in `Export.py`.
160 | 
161 | ```python
162 | from json import JSONEncoder
163 | 
164 | class EncodeNode(JSONEncoder):
165 |     def default(self, object):
166 |         return object.__dict__
167 | ```
168 | 
169 | The `default()` method overrides `JSONEncoder`'s `default()` method and serializes the `CommentNode` by converting it into a dictionary, which is a primitive type that has a direct JSON equivalent:
170 | 
171 | ```python
172 | EncodeNode().encode(CommentNode)
173 | ```
174 | 
175 | This ensures the node is correctly encoded before I call the `seed()` method to insert a new `CommentNode` into the `replies` arrays of its respective parent `CommentNode`.
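To make the encoding step concrete, here is a minimal, self-contained sketch that reuses the `CommentNode`, `Forest`, and `EncodeNode` classes shown above. The mock IDs come from the earlier examples, and the abbreviated metadata is illustrative only; this snippet is not part of the URS source:

```python
# Mock nodes reusing the IDs from the earlier examples (metadata trimmed for brevity).
top_level = CommentNode({"id": "qwerty1", "parent_id": "t3_abc123", "body": "A top level comment here."})
reply = CommentNode({"id": "hjkl234", "parent_id": "t1_qwerty1", "body": "A reply here."})

forest = Forest()       # The root node's id is "abc123".
forest.seed(top_level)  # Parent is the submission, so it is appended to root.replies.
forest.seed(reply)      # Parent is "qwerty1", so depth-first search finds the top level comment.

# default() is applied recursively, so nested replies are converted to dictionaries as well.
print(EncodeNode().encode(forest.root))
```

The output is a single JSON object whose `replies` array contains the top level comment, which in turn nests the reply, mirroring the structured format described above.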
176 | 
177 | I can then use this custom `JSONEncoder` subclass while exporting by specifying it within `json.dump()` with the `cls` kwarg:
178 | 
179 | ```python
180 | with open(filename, "w", encoding = "utf-8") as results:
181 |     json.dump(data, results, indent = 4, cls = EncodeNode)
182 | ```
183 | 
184 | This was how the structured comments export was implemented. Refer to the source code located in `urs/praw_scrapers/Comments.py` to see more. I hope this was somewhat interesting and/or informative. Thanks for reading!
185 | 
186 | 
187 | [Pull Request]: https://github.com/JosephLai241/URS/pull/24
188 | 
189 | [CommentForest]: https://praw.readthedocs.io/en/latest/code_overview/other/commentforest.html
190 | [trie]: https://www.interviewcake.com/concept/java/trie
191 | [Depth-First Search]: https://www.interviewcake.com/concept/java/dfs
192 | 
--------------------------------------------------------------------------------
/taisun/comments.rs:
--------------------------------------------------------------------------------
1 | //! This module provides computational functions pertaining to submission comments.
2 | 
3 | use pyo3::{
4 |     exceptions::PyValueError,
5 |     prelude::*,
6 |     types::{PyBool, PyDict, PyString},
7 | };
8 | use serde::{Deserialize, Serialize};
9 | 
10 | use std::collections::{HashSet, VecDeque};
11 | 
12 | /// An enum used for the `edited` field in the `CommentNode`. The `edited` field may be a `bool`
13 | /// (`False`) indicating the comment was not edited, or a `String` representing the date of the
14 | /// change.
15 | #[derive(Clone, Debug, Deserialize, Eq, Hash, PartialEq, Serialize)]
16 | #[serde(untagged)]
17 | pub enum BoolOrDate {
18 |     /// Hold a boolean type value.
19 |     Bool(bool),
20 |     /// Hold a string type value.
21 |     Str(String),
22 | }
23 | 
24 | impl ToPyObject for BoolOrDate {
25 |     /// Convert either the Rust `bool` or `String` into a Python `bool` or `str`.
26 |     fn to_object(&self, py: Python<'_>) -> PyObject {
27 |         match self {
28 |             Self::Bool(boolean) => PyBool::new(py, *boolean).into(),
29 |             Self::Str(string) => PyString::new(py, string).into(),
30 |         }
31 |     }
32 | }
33 | 
34 | /// A node object that contains comment metadata for the comment `Forest`.
35 | #[derive(Clone, Debug, Deserialize, Eq, Hash, PartialEq, Serialize)]
36 | #[pyclass]
37 | pub struct CommentNode {
38 |     /// This comment's author.
39 |     pub author: String,
40 |     /// The body of the comment, as Markdown.
41 |     pub body: String,
42 |     /// The body of the comment, as HTML.
43 |     pub body_html: String,
44 |     /// The comment's created UTC timestamp.
45 |     pub created_utc: String,
46 |     /// Whether the comment is distinguished.
47 |     pub distinguished: Option<String>,
48 |     /// Whether the comment has been edited. This is set to a UTC timestamp if it has been
49 |     /// edited.
50 |     pub edited: BoolOrDate,
51 |     /// The comment's ID.
52 |     pub id: String,
53 |     /// Whether the comment author is also the author of the submission (OP).
54 |     pub is_submitter: bool,
55 |     /// The submission ID that the comment belongs to.
56 |     pub link_id: String,
57 |     /// The comment's parent ID.
58 |     pub parent_id: String,
59 |     /// The comment's score.
60 |     pub score: i32,
61 |     /// Whether the comment is stickied.
62 |     pub stickied: bool,
63 |     /// The comment's replies.
64 |     #[serde(skip)]
65 |     pub replies: Vec<CommentNode>,
66 | }
67 | 
68 | #[pymethods]
69 | impl CommentNode {
70 |     /// Create a new `CommentNode`.
71 |     #[new]
72 |     fn new(comment_data: String) -> PyResult<Self> {
73 |         serde_json::from_str(&comment_data).map_or_else(
74 |             |error| {
75 |                 Err(PyValueError::new_err(format!(
76 |                     "Could not deserialize comment data to the CommentNode struct! {}",
77 |                     error
78 |                 )))
79 |             },
80 |             Ok,
81 |         )
82 |     }
83 | 
84 |     /// Return this `CommentNode` in a Python `dict`. This overrides the built-in Python `__dict__`
85 |     /// dunder method.
86 |     #[getter]
87 |     fn __dict__(&self, py: Python) -> PyResult<PyObject> {
88 |         let dict = PyDict::new(py);
89 | 
90 |         dict.set_item("author", self.author.clone())?;
91 |         dict.set_item("body", self.body.clone())?;
92 |         dict.set_item("body_html", self.body_html.clone())?;
93 |         dict.set_item("created_utc", self.created_utc.clone())?;
94 |         dict.set_item("distinguished", self.distinguished.clone())?;
95 |         dict.set_item("edited", self.edited.clone())?;
96 |         dict.set_item("id", self.id.clone())?;
97 |         dict.set_item("is_submitter", self.is_submitter)?;
98 |         dict.set_item("link_id", self.link_id.clone())?;
99 |         dict.set_item("parent_id", self.parent_id.clone())?;
100 |         dict.set_item("score", self.score)?;
101 |         dict.set_item("stickied", self.stickied)?;
102 |         dict.set_item("replies", self.replies.clone())?;
103 | 
104 |         Ok(dict.into())
105 |     }
106 | 
107 |     /// Get this `CommentNode`'s `replies`.
108 |     #[getter]
109 |     fn replies(&self) -> Vec<CommentNode> {
110 |         self.replies.clone()
111 |     }
112 | }
113 | 
114 | impl ToPyObject for CommentNode {
115 |     /// Convert the `CommentNode` into a Python `Object`.
116 |     fn to_object(&self, py: Python<'_>) -> PyObject {
117 |         let dict = PyDict::new(py);
118 | 
119 |         dict.set_item("author", self.author.clone())
120 |             .expect("Could not set the author attribute in the PyObject!");
121 |         dict.set_item("body", self.body.clone())
122 |             .expect("Could not set the body attribute in the PyObject!");
123 |         dict.set_item("body_html", self.body_html.clone())
124 |             .expect("Could not set the body_html attribute in the PyObject!");
125 |         dict.set_item("created_utc", self.created_utc.clone())
126 |             .expect("Could not set the created_utc attribute in the PyObject!");
127 |         dict.set_item("distinguished", self.distinguished.clone())
128 |             .expect("Could not set the distinguished attribute in the PyObject!");
129 |         dict.set_item("edited", self.edited.clone())
130 |             .expect("Could not set the edited attribute in the PyObject!");
131 |         dict.set_item("id", self.id.clone())
132 |             .expect("Could not set the id attribute in the PyObject!");
133 |         dict.set_item("is_submitter", self.is_submitter)
134 |             .expect("Could not set the is_submitter attribute in the PyObject!");
135 |         dict.set_item("link_id", self.link_id.clone())
136 |             .expect("Could not set the link_id attribute in the PyObject!");
137 |         dict.set_item("parent_id", self.parent_id.clone())
138 |             .expect("Could not set the parent_id attribute in the PyObject!");
139 |         dict.set_item("score", self.score)
140 |             .expect("Could not set the score attribute in the PyObject!");
141 |         dict.set_item("stickied", self.stickied)
142 |             .expect("Could not set the stickied attribute in the PyObject!");
143 |         dict.set_item("replies", self.replies.clone())
144 |             .expect("Could not set the replies attribute in the PyObject!");
145 | 
146 |         dict.into()
147 |     }
148 | }
149 | 
150 | /// The comment `Forest` - a data structure that resembles comment threads as seen on Reddit.
151 | #[derive(Debug, Deserialize, Serialize)]
152 | #[pyclass]
153 | pub struct Forest {
154 |     /// The root of the forest.
155 |     pub root: CommentNode,
156 | }
157 | 
158 | #[pymethods]
159 | impl Forest {
160 |     /// Create a new `Forest`.
161 |     #[new]
162 |     fn new(submission_id: String) -> PyResult<Self> {
163 |         let root = CommentNode {
164 |             author: "".to_string(),
165 |             body: "".to_string(),
166 |             body_html: "".to_string(),
167 |             created_utc: "".to_string(),
168 |             distinguished: None,
169 |             edited: BoolOrDate::Bool(false),
170 |             id: submission_id,
171 |             is_submitter: true,
172 |             link_id: "".to_string(),
173 |             parent_id: "".to_string(),
174 |             score: 0,
175 |             stickied: false,
176 |             replies: vec![],
177 |         };
178 | 
179 |         Ok(Self { root })
180 |     }
181 | 
182 |     /// An iterative implementation of depth-first search that inserts a new comment into the
183 |     /// `Forest`.
184 |     fn _dfs_insert(&mut self, new_comment: CommentNode) {
185 |         let root_id = &self.root.id.clone();
186 | 
187 |         let mut stack: VecDeque<&mut CommentNode> = VecDeque::new();
188 |         stack.push_front(&mut self.root);
189 | 
190 |         let mut visited: HashSet<String> = HashSet::new();
191 |         visited.insert(root_id.to_string());
192 | 
193 |         let target_id = &new_comment
194 |             .parent_id
195 |             .split('_')
196 |             .last()
197 |             .unwrap_or(&new_comment.parent_id)
198 |             .to_string();
199 | 
200 |         let mut found = false;
201 | 
202 |         while !found {
203 |             if let Some(comment_node) = stack.pop_front() {
204 |                 for reply in comment_node.replies.iter_mut() {
205 |                     if target_id == &reply.id {
206 |                         reply.replies.push(new_comment.clone());
207 |                         found = true;
208 |                     } else {
209 |                         let child_id = reply.id.clone();
210 | 
211 |                         if !visited.contains(child_id.as_str()) {
212 |                             stack.push_front(reply);
213 |                             visited.insert(child_id);
214 |                         }
215 |                     }
216 |                 }
217 |             }
218 |         }
219 |     }
220 | 
221 |     /// Plant a new comment in the `Forest`.
222 |     fn seed_comment(&mut self, new_comment: CommentNode) {
223 |         let parent_id = &new_comment
224 |             .parent_id
225 |             .split('_')
226 |             .last()
227 |             .unwrap_or(&new_comment.parent_id)
228 |             .to_string();
229 | 
230 |         if parent_id == &self.root.id {
231 |             self.root.replies.push(new_comment);
232 |         } else {
233 |             self._dfs_insert(new_comment);
234 |         }
235 |     }
236 | 
237 |     /// Return an array of `CommentNode`s in the form of a `String`. This enables
238 |     /// Python to `json.loads()` this string to convert the `Forest` into a Python
239 |     /// native type.
240 |     #[getter]
241 |     fn comments(&self) -> String {
242 |         serde_json::to_string(&self.root.replies).unwrap_or("None".to_string())
243 |     }
244 | 
245 |     /// Returns the `root` of the `Forest`.
246 |     #[getter]
247 |     fn root(&self) -> CommentNode {
248 |         self.root.clone()
249 |     }
250 | }
251 | 
--------------------------------------------------------------------------------
/manual/src/implementation-details/the-forest.md:
--------------------------------------------------------------------------------
1 | # The Forest
2 | 
3 | **Created:** March 17, 2021
4 | 
5 | > This Python code has been deprecated as of `URS v3.4.0` and has been rewritten in Rust. However, the concepts discussed in this document as well as the implementation are still applicable to the Rust rewrite.
6 | >
7 | > See [Speeding Up Python with Rust](./speeding-up-python-with-rust.md) for details on how I rewrote this code in Rust and how it yielded drastic performance improvements if you are interested in learning more.
8 | 9 | # Table of Contents 10 | 11 | - [Introduction](#introduction) 12 | - [Motivation](#motivation) 13 | - [Inspiration](#inspiration) 14 | - [How the Forest Works](#how-the-forest-works) 15 | - [The `CommentNode`](#the-commentnode) 16 | - [The `Forest`](#the-forest-1) 17 | - [The Root Node](#the-root-node) 18 | - [How `PRAW` Comments Are Linked](#how-praw-comments-are-linked) 19 | - [The Insertion Methods](#the-insertion-methods) 20 | - [Serializing the `Forest`](#serializing-the-forest) 21 | 22 | # Introduction 23 | 24 | ## Motivation 25 | 26 | I am a self-taught software developer who just recently graduated from college and am currently looking for my first full-time job. I do not have a computer science degree, so I have had to teach myself a ton of concepts that I would have learned if I got the degree. A class I wish I was able to take in college is data structures and algorithms because that seems to be all the buzz when it comes to the technical interview, which I unfortunately struggle with greatly due to my lack of experience and practice. 27 | 28 | Recently (March 2021) I have been teaching myself DSA. Implementing simple examples of each topic within DSA was not so bad (I am currently working on a study guide/reference repository containing these implementations in both Python and Rust that I will make public soon), but practicing Leetcode problems was and still is a difficult process for me. I will continue to power through the struggle because my livelihood and future career depends on it, though. 29 | 30 | While it has not been a smooth journey, I have come to realize how useful DSA is and am implementing what I have learned in a real-world use case. I do not think I would have been able to figure out a solution to the structured comments scraper's prior shortcomings if I had not studied this area within computer science. I recently implemented my first [trie][trie] and was fascinated by how abstract data structures worked. I immediately realized I needed to use a tree data structure for the structured comments scraper in order to take it to the next level, which is the purpose of [this pull request][pull request]. 31 | 32 | ## Inspiration 33 | 34 | The `Forest` is named after `PRAW`'s [`CommentForest`][commentforest]. The `CommentForest` does not return comments in structured format, so I wrote my own implementation of it. 35 | 36 | The trie was a huge inspiration for the `Forest`. I will quickly explain my implementation of the trie node. 37 | 38 | ```python 39 | class TrieNode(): 40 | def __init__(self, char, is_word): 41 | self.char = char 42 | self.is_word = is_word 43 | self.children = dict() 44 | ``` 45 | 46 | Each node of the trie contains a character, a boolean flag indicating whether the node denotes the end of a word, and holds a dictionary filled with child nodes as values and their respective characters as keys. I could have used an array and the indices within it to emulate a dictionary, but I figured I could save some access time at the cost of extra space. 47 | 48 | Anyways, the trie implementation is very similar to how the `Forest` works. 
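To make that structure concrete, here is a minimal sketch of inserting a word into a trie built from the `TrieNode` class above. The `insert_word()` helper is purely illustrative and is not part of the URS source:

```python
def insert_word(root, word):
    # Walk the children dictionaries, creating nodes as needed, and mark the
    # final node as the end of a word.
    current = root

    for char in word:
        if char not in current.children:
            current.children[char] = TrieNode(char, False)

        current = current.children[char]

    current.is_word = True

root = TrieNode("", False)
insert_word(root, "urs")

# The characters chain through the children dictionaries: "u" -> "r" -> "s".
print(root.children["u"].children["r"].children["s"].is_word)  # True
```

Each lookup only follows one dictionary key per character, which is the access-time advantage mentioned above.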
49 | 
50 | # How the Forest Works
51 | 
52 | ## The `CommentNode`
53 | 
54 | I created a class `CommentNode` to store each comment's metadata and replies:
55 | 
56 | ```python
57 | class CommentNode():
58 |     def __init__(self, metadata):
59 |         for key, value in metadata.items():
60 |             self.__setattr__(key, value)
61 | 
62 |         self.replies = []
63 | ```
64 | 
65 | I used `__setattr__()` because the root node differs from the standard comment node schema. By using `__setattr__()`, `CommentNode` attributes will be dynamically set based on the `metadata` dictionary that has been passed in. `self.replies` holds additional `CommentNode`s.
66 | 
67 | ## The `Forest`
68 | 
69 | Next, I created a class `Forest` which holds the root node and includes methods for insertion.
70 | 
71 | ### The Root Node
72 | 
73 | First, let's go over the root node.
74 | 
75 | ```python
76 | class Forest():
77 |     def __init__(self):
78 |         self.root = CommentNode({ "id": "abc123" })
79 | ```
80 | 
81 | The only key in the dictionary passed into `CommentNode` is `id`; therefore, the root `CommentNode` will only contain the attributes `self.id` and `self.replies`. A mock submission ID is shown. The actual source code will pull the submission's ID based on the URL that was passed into the `-c` flag and set the `id` value accordingly.
82 | 
83 | Before I get to the insertion methods, I will explain how comments and their replies are linked.
84 | 
85 | ### How `PRAW` Comments Are Linked
86 | 
87 | `PRAW` returns all submission comments by level order. This means all top level comments are returned first, followed by all second-level replies, then third, so on and so forth.
88 | 
89 | I will create some mock comment objects to demonstrate. Here is a top level comment corresponding to the mock submission ID. Note the `parent_id` contains the submission's `id`, which is stored in `self.root.id`:
90 | 
91 | ```json
92 | {
93 |     "author": "u/asdfasdfasdfasdf",
94 |     "body": "A top level comment here.",
95 |     "created_utc": "06-06-2006 06:06:06",
96 |     "distinguished": null,
97 |     "edited": false,
98 |     "id": "qwerty1",
99 |     "is_submitter": false,
100 |     "link_id": "t3_asdfgh",
101 |     "parent_id": "t3_abc123",
102 |     "score": 666,
103 |     "stickied": false
104 | }
105 | ```
106 | 
107 | Here is a second-level reply to the top comment. Note the `parent_id` contains the top comment's `id`:
108 | 
109 | ```json
110 | {
111 |     "author": "u/hjklhjklhjklhjkl",
112 |     "body": "A reply here.",
113 |     "created_utc": "06-06-2006 18:06:06",
114 |     "distinguished": null,
115 |     "edited": false,
116 |     "id": "hjkl234",
117 |     "is_submitter": true,
118 |     "link_id": "t3_1a2b3c",
119 |     "parent_id": "t1_qwerty1",
120 |     "score": 6,
121 |     "stickied": false
122 | }
123 | ```
124 | 
125 | This pattern continues all the way down to the last level of comments. It is now very easy to link the correct comments together. I do this by calling `split("_", 1)` on the `parent_id` and then getting the second item in the split list to compare values. I also specify the `maxsplit` parameter to force one split.
126 | 
127 | ### The Insertion Methods
128 | 
129 | I then defined the methods for `CommentNode` insertion.
130 | 
131 | ```python
132 | def _dfs_insert(self, new_comment):
133 |     stack = []
134 |     stack.append(self.root)
135 | 
136 |     visited = set()
137 |     visited.add(self.root)
138 | 
139 |     found = False
140 |     while not found:
141 |         current_comment = stack.pop(0)
142 | 
143 |         for reply in current_comment.replies:
144 |             if new_comment.parent_id.split("_", 1)[1] == reply.id:
145 |                 reply.replies.append(new_comment)
146 |                 found = True
147 |             else:
148 |                 if reply not in visited:
149 |                     stack.insert(0, reply)
150 |                     visited.add(reply)
151 | 
152 | def seed(self, new_comment):
153 |     parent_id = new_comment.parent_id.split("_", 1)[1]
154 | 
155 |     self.root.replies.append(new_comment) \
156 |         if parent_id == getattr(self.root, "id") \
157 |         else self._dfs_insert(new_comment)
158 | ```
159 | 
160 | I implemented the [depth-first search][depth-first search] algorithm to find a comment's parent node and insert it into the parent node's `replies` array. I defined a separate `visited` set to keep track of visited `CommentNode`s to avoid an infinite loop of inserting `CommentNode`s that were already visited into the `stack`. At first I wrote a recursive version of depth-first search, but then opted for an iterative version because recursion would not scale well for submissions that included large amounts of comments, i.e., it could trigger a stack overflow.
161 | 
162 | Within the `seed` method, I first check if the `CommentNode` is a top level comment by comparing its parent ID to the submission ID. Depth-first search is triggered if the `CommentNode` is not a top level comment.
163 | 
164 | ## Serializing the `Forest`
165 | 
166 | Since Python's built-in JSON module can only handle primitive types that have a direct JSON equivalent, a custom encoder is necessary to convert the `Forest` into JSON format. I defined this in `Export.py`.
167 | 
168 | ```python
169 | from json import JSONEncoder
170 | 
171 | class EncodeNode(JSONEncoder):
172 |     def default(self, object):
173 |         return object.__dict__
174 | ```
175 | 
176 | The `default()` method overrides `JSONEncoder`'s `default()` method and serializes the `CommentNode` by converting it into a dictionary, which is a primitive type that has a direct JSON equivalent:
177 | 
178 | ```python
179 | EncodeNode().encode(CommentNode)
180 | ```
181 | 
182 | This ensures the node is correctly encoded before I call the `seed()` method to insert a new `CommentNode` into the `replies` arrays of its respective parent `CommentNode`.
183 | 
184 | I can then use this custom `JSONEncoder` subclass while exporting by specifying it within `json.dump()` with the `cls` kwarg:
185 | 
186 | ```python
187 | with open(filename, "w", encoding = "utf-8") as results:
188 |     json.dump(data, results, indent = 4, cls = EncodeNode)
189 | ```
190 | 
191 | This was how the structured comments export was implemented. Refer to the source code located in `urs/praw_scrapers/Comments.py` to see more. I hope this was somewhat interesting and/or informative. Thanks for reading!
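One final quick illustration of the linking step described in [How `PRAW` Comments Are Linked](#how-praw-comments-are-linked), using the mock IDs from the earlier examples (this snippet is illustrative only and is not part of the URS source):

```python
# "t3_" prefixes submission fullnames and "t1_" prefixes comment fullnames, so splitting
# once on "_" and taking the second item yields the bare ID used for comparison.
print("t3_abc123".split("_", 1)[1])   # abc123  -> matches the root (submission) ID
print("t1_qwerty1".split("_", 1)[1])  # qwerty1 -> matches the top level comment's ID
```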
192 | 193 | 194 | 195 | [pull request]: https://github.com/JosephLai241/URS/pull/24 196 | [commentforest]: https://praw.readthedocs.io/en/latest/code_overview/other/commentforest.html 197 | [trie]: https://www.interviewcake.com/concept/java/trie 198 | [depth-first search]: https://www.interviewcake.com/concept/java/dfs 199 | -------------------------------------------------------------------------------- /tests/test_praw_scrapers/test_live_scrapers/test_Livestream.py: -------------------------------------------------------------------------------- 1 | """ 2 | Testing `Livestream.py`. 3 | """ 4 | 5 | 6 | import argparse 7 | import os 8 | import types 9 | 10 | import praw 11 | from dotenv import load_dotenv 12 | 13 | from urs.praw_scrapers.live_scrapers import Livestream 14 | from urs.utils.Global import date 15 | 16 | 17 | class MakeArgs: 18 | """ 19 | Making dummy args to test Comments.py methods. 20 | """ 21 | 22 | @staticmethod 23 | def parser_for_testing(): 24 | parser = argparse.ArgumentParser() 25 | return parser 26 | 27 | @staticmethod 28 | def make_scraper_args(): 29 | parser = MakeArgs.parser_for_testing() 30 | parser.add_argument("--live-subreddit") 31 | parser.add_argument("--live-redditor") 32 | parser.add_argument("--stream-submissions", action="store_true") 33 | 34 | return parser 35 | 36 | 37 | class Login: 38 | """ 39 | Create a Reddit object with PRAW API credentials. 40 | """ 41 | 42 | @staticmethod 43 | def create_reddit_object(): 44 | load_dotenv() 45 | 46 | return praw.Reddit( 47 | client_id=os.getenv("CLIENT_ID"), 48 | client_secret=os.getenv("CLIENT_SECRET"), 49 | user_agent=os.getenv("USER_AGENT"), 50 | username=os.getenv("REDDIT_USERNAME"), 51 | password=os.getenv("REDDIT_PASSWORD"), 52 | ) 53 | 54 | 55 | class TestSaveStreamCreateSkeletonMethod: 56 | """ 57 | Testing SaveStream class _create_skeleton() method. 
58 | """ 59 | 60 | def test_create_skeleton_method_live_subreddit_default_streaming_comments_args( 61 | self, 62 | ): 63 | parser = MakeArgs.make_scraper_args() 64 | args = parser.parse_args("--live-subreddit askreddit".split()) 65 | 66 | skeleton = Livestream.SaveStream._create_skeleton(args) 67 | 68 | assert skeleton["livestream_settings"]["subreddit"] == "askreddit" 69 | assert skeleton["livestream_settings"]["included_reddit_objects"] == "comments" 70 | assert skeleton["data"] == [] 71 | 72 | def test_create_skeleton_method_live_subreddit_streaming_submissions_args(self): 73 | parser = MakeArgs.make_scraper_args() 74 | args = parser.parse_args( 75 | "--live-subreddit askreddit --stream-submissions".split() 76 | ) 77 | 78 | skeleton = Livestream.SaveStream._create_skeleton(args) 79 | 80 | assert skeleton["livestream_settings"]["subreddit"] == "askreddit" 81 | assert ( 82 | skeleton["livestream_settings"]["included_reddit_objects"] == "submissions" 83 | ) 84 | assert skeleton["data"] == [] 85 | 86 | def test_create_skeleton_method_live_redditor_default_streaming_comments_args(self): 87 | parser = MakeArgs.make_scraper_args() 88 | args = parser.parse_args("--live-redditor spez".split()) 89 | 90 | skeleton = Livestream.SaveStream._create_skeleton(args) 91 | 92 | assert skeleton["livestream_settings"]["redditor"] == "spez" 93 | assert skeleton["livestream_settings"]["included_reddit_objects"] == "comments" 94 | assert skeleton["data"] == [] 95 | 96 | def test_create_skeleton_method_live_redditor_streaming_submissions_args(self): 97 | parser = MakeArgs.make_scraper_args() 98 | args = parser.parse_args("--live-redditor spez --stream-submissions".split()) 99 | 100 | skeleton = Livestream.SaveStream._create_skeleton(args) 101 | 102 | assert skeleton["livestream_settings"]["redditor"] == "spez" 103 | assert ( 104 | skeleton["livestream_settings"]["included_reddit_objects"] == "submissions" 105 | ) 106 | assert skeleton["data"] == [] 107 | 108 | 109 | class TestSaveStreamMakeLivestreamDirMethod: 110 | """ 111 | Testing SaveStream class _make_livestream_dir() method. 112 | """ 113 | 114 | def test_make_livestream_dir_method_subreddits_subdirectory(self): 115 | test_split_stream_info = ["r"] 116 | 117 | stream_directory = Livestream.SaveStream._make_livestream_dir( 118 | test_split_stream_info 119 | ) 120 | 121 | assert stream_directory == f"../scrapes/{date}/livestream/subreddits" 122 | 123 | def test_make_livestream_dir_method_redditors_subdirectory(self): 124 | test_split_stream_info = ["u"] 125 | 126 | stream_directory = Livestream.SaveStream._make_livestream_dir( 127 | test_split_stream_info 128 | ) 129 | 130 | assert stream_directory == f"../scrapes/{date}/livestream/redditors" 131 | 132 | 133 | class TestSaveStreamGetTempFilenameMethod: 134 | """ 135 | Testing SaveStream class _get_temp_filename() method. 
136 | """ 137 | 138 | def test_get_temp_filename_method_with_subreddit(self): 139 | test_stream_info = "in r/askreddit" 140 | 141 | stream_path = Livestream.SaveStream._get_temp_filename(test_stream_info) 142 | 143 | assert stream_path == f"../scrapes/{date}/livestream/subreddits/askreddit.json" 144 | 145 | def test_get_temp_filename_method_with_redditor(self): 146 | test_stream_info = "by u/spez" 147 | 148 | stream_path = Livestream.SaveStream._get_temp_filename(test_stream_info) 149 | 150 | assert stream_path == f"../scrapes/{date}/livestream/redditors/spez.json" 151 | 152 | 153 | class TestSaveStreamCreateTempFileMethod: 154 | """ 155 | Testing SaveStream class _create_temp_file() method. 156 | """ 157 | 158 | def test_create_temp_file_method(self): 159 | test_skeleton = {"test": 1} 160 | test_stream_path = "../scrapes/livestream/subreddits/askreddit.json" 161 | 162 | if not os.path.isdir("../scrapes/livestream/subreddits"): 163 | os.makedirs("../scrapes/livestream/subreddits") 164 | 165 | Livestream.SaveStream._create_temp_file(test_skeleton, test_stream_path) 166 | 167 | assert os.path.isfile(test_stream_path) 168 | 169 | 170 | class TestSaveStreamRenameMethod: 171 | """ 172 | Testing SaveStream class _rename() method. 173 | """ 174 | 175 | def test_rename_method_with_subreddit(self): 176 | test_duration = "00:00:15" 177 | test_object_info = "comments" 178 | test_start_stream = "18:06:06" 179 | test_stream_path = f"../scrapes/{date}/livestream/subreddits/askreddit.json" 180 | 181 | with open(test_stream_path, "w", encoding="utf-8") as _: 182 | pass 183 | 184 | Livestream.SaveStream._rename( 185 | test_duration, test_object_info, test_start_stream, test_stream_path 186 | ) 187 | 188 | renamed_file = f"../scrapes/{date}/livestream/subreddits/askreddit-comments-18_06_06-00_00_15.json" 189 | 190 | assert os.path.isfile(renamed_file) 191 | 192 | def test_rename_method_with_redditor(self): 193 | test_duration = "00:00:15" 194 | test_object_info = "submissions" 195 | test_start_stream = "18:06:06" 196 | test_stream_path = f"../scrapes/{date}/livestream/redditors/spez.json" 197 | 198 | with open(test_stream_path, "w", encoding="utf-8") as _: 199 | pass 200 | 201 | Livestream.SaveStream._rename( 202 | test_duration, test_object_info, test_start_stream, test_stream_path 203 | ) 204 | 205 | renamed_file = f"../scrapes/{date}/livestream/redditors/spez-submissions-18_06_06-00_00_15.json" 206 | 207 | assert os.path.isfile(renamed_file) 208 | 209 | 210 | class TestSaveStreamWriteMethod: 211 | """ 212 | Testing SaveStream class write() method. 213 | """ 214 | 215 | def test_write_method(self): 216 | pass 217 | 218 | 219 | class TestLivestreamSetInfoAndObjectMethod: 220 | """ 221 | Testing Livestream class _set_info_and_object() method. 
222 | """ 223 | 224 | def test_set_info_and_object_live_subreddit(self): 225 | reddit = Login.create_reddit_object() 226 | 227 | parser = MakeArgs.make_scraper_args() 228 | args = parser.parse_args("--live-subreddit askreddit".split()) 229 | 230 | reddit_object, stream_info = Livestream.Livestream._set_info_and_object( 231 | args, reddit 232 | ) 233 | 234 | assert isinstance(reddit_object, praw.models.Subreddit) 235 | assert stream_info == "in r/askreddit" 236 | 237 | def test_set_info_and_object_live_redditor(self): 238 | reddit = Login.create_reddit_object() 239 | 240 | parser = MakeArgs.make_scraper_args() 241 | args = parser.parse_args("--live-redditor spez".split()) 242 | 243 | reddit_object, stream_info = Livestream.Livestream._set_info_and_object( 244 | args, reddit 245 | ) 246 | 247 | assert isinstance(reddit_object, praw.models.Redditor) 248 | assert stream_info == "by u/spez" 249 | 250 | 251 | class TestLivestreamStreamSwitchMethod: 252 | """ 253 | Testing Livestream class _stream_switch() method. 254 | """ 255 | 256 | def test_stream_switch_method_default_stream_comments(self): 257 | reddit = Login.create_reddit_object() 258 | subreddit = reddit.subreddit("askreddit") 259 | 260 | parser = MakeArgs.make_scraper_args() 261 | args = parser.parse_args("--live-subreddit askreddit".split()) 262 | 263 | generator, object_info = Livestream.Livestream._stream_switch(args, subreddit) 264 | 265 | assert isinstance(generator, types.GeneratorType) 266 | assert object_info == "comments" 267 | 268 | def test_stream_switch_method_stream_submissions(self): 269 | reddit = Login.create_reddit_object() 270 | subreddit = reddit.subreddit("askreddit") 271 | 272 | parser = MakeArgs.make_scraper_args() 273 | args = parser.parse_args( 274 | "--live-subreddit askreddit --stream-submissions".split() 275 | ) 276 | 277 | generator, object_info = Livestream.Livestream._stream_switch(args, subreddit) 278 | 279 | assert isinstance(generator, types.GeneratorType) 280 | assert object_info == "submissions" 281 | 282 | 283 | class TestLivestreamNoSaveStreamMethod: 284 | """ 285 | Testing livestream class _no_save_stream() method. 286 | """ 287 | 288 | def test_no_save_stream_method(self): 289 | pass 290 | 291 | 292 | class TestLivestreamStreamMethod: 293 | """ 294 | Testing Livestream class stream() method. 295 | """ 296 | 297 | def test_stream_method_live_subreddit(self): 298 | pass 299 | 300 | def test_stream_method_live_redditor(self): 301 | pass 302 | -------------------------------------------------------------------------------- /urs/praw_scrapers/static_scrapers/Comments.py: -------------------------------------------------------------------------------- 1 | """ 2 | Submission comments scraper 3 | =========================== 4 | Defining methods for the submission comments scraper. 
5 | """ 6 | 7 | 8 | import json 9 | import logging 10 | from argparse import Namespace 11 | from typing import Any, Dict, List 12 | 13 | from colorama import Fore, Style 14 | from halo import Halo 15 | from praw import Reddit 16 | from praw.models import Submission 17 | from rich.progress import ( 18 | BarColumn, 19 | MofNCompleteColumn, 20 | Progress, 21 | RenderableColumn, 22 | SpinnerColumn, 23 | TextColumn, 24 | TimeRemainingColumn, 25 | ) 26 | from taisun.comments_utils import CommentNode, Forest 27 | 28 | from urs.praw_scrapers.utils.Objectify import Objectify 29 | from urs.praw_scrapers.utils.Validation import Validation 30 | from urs.utils.Cli import GetPRAWScrapeSettings 31 | from urs.utils.Export import Export, NameFile 32 | from urs.utils.Global import Status, convert_time, make_none_dict 33 | from urs.utils.Logger import LogExport, LogPRAWScraper 34 | from urs.utils.Titles import PRAWTitles 35 | 36 | 37 | class SortComments: 38 | """ 39 | Methods for sorting comments depending on which style of comments was 40 | specified (raw or structured). 41 | """ 42 | 43 | @staticmethod 44 | def sort_raw(all_comments: List[Dict[str, Any]], submission: Submission) -> None: 45 | """ 46 | Sort all comments in raw format. 47 | 48 | :param list[dict[str, Any]] all_comments: A `list[dict[str, Any]]` containing 49 | all comments within a submission. 50 | :param Submission submission: PRAW `Submission` object. 51 | """ 52 | 53 | for comment in submission.comments.list(): 54 | all_comments.append(Objectify().make_comment(comment, False)) 55 | 56 | @staticmethod 57 | def sort_structured(submission: Submission, url: str) -> List[Dict[str, Any]]: 58 | """ 59 | Sort all comments in structured format. 60 | 61 | :param Submission submission: PRAW `Submission` object. 62 | :param str url: The submission's URL. 63 | 64 | :returns: A `list[dict[str, Any]]` containing `CommentNode`s in `dict` 65 | form. 66 | :rtype: `list[dict[str, Any]]` 67 | """ 68 | 69 | renderable_column = RenderableColumn(renderable="|") 70 | spinner_column = SpinnerColumn(spinner_name="noise") 71 | text_column = TextColumn("Seeding Forest") 72 | 73 | progress_bar = Progress( 74 | spinner_column, 75 | text_column, 76 | BarColumn(), 77 | MofNCompleteColumn(), 78 | renderable_column, 79 | TimeRemainingColumn(), 80 | ) 81 | 82 | forest = Forest(submission.id_from_url(url)) 83 | 84 | with progress_bar: 85 | for comment in progress_bar.track(submission.comments.list()): 86 | comment_node = CommentNode( 87 | json.dumps((Objectify().make_comment(comment, False))) 88 | ) 89 | 90 | forest.seed_comment(comment_node) 91 | 92 | return forest.root.replies 93 | 94 | 95 | class GetSort: 96 | """ 97 | Methods for getting comments from a Reddit submission. 98 | """ 99 | 100 | def __init__(self, args: Namespace, submission: Submission, url: str) -> None: 101 | """ 102 | Initialize variables used in later methods: 103 | 104 | :param Namespace args: A `Namespace` object containing all arguments used 105 | in the CLI. 106 | :param Submission submission: PRAW `Submission` object. 107 | :param str url: The submission's URL. 108 | """ 109 | 110 | self._args = args 111 | self._url = url 112 | 113 | more_comments_status = Status( 114 | "Finished resolving instances of MoreComments.", 115 | Fore.CYAN 116 | + Style.BRIGHT 117 | + "Resolving instances of MoreComments. This may take a while. 
Please wait.", 118 | "cyan", 119 | ) 120 | 121 | more_comments_status.start() 122 | self._submission = submission 123 | self._submission.comments.replace_more(limit=None) 124 | more_comments_status.succeed() 125 | 126 | def get_sort(self, args: Namespace, limit: str) -> List[Dict[str, Any]]: 127 | """ 128 | Get comments from posts. 129 | 130 | :param Namespace args: A `Namespace` object containing all arguments used 131 | in the CLI. 132 | :param str limit: A `str` indicating the number of results to return. 133 | 134 | :returns: A `list[dict[str, Any]]` containing all comments within a submission. 135 | :rtype: `list[dict[str, Any]]` 136 | """ 137 | 138 | if args.raw: 139 | all_comments = [] 140 | SortComments().sort_raw(all_comments, self._submission) 141 | else: 142 | all_comments = SortComments().sort_structured(self._submission, self._url) 143 | 144 | return all_comments[: int(limit)] if int(limit) != 0 else all_comments 145 | 146 | 147 | class Write: 148 | """ 149 | Methods for writing scraped comments to CSV or JSON. 150 | """ 151 | 152 | @staticmethod 153 | def _make_json_skeleton( 154 | args: Namespace, limit: str, submission: Submission, url: str 155 | ) -> Dict[str, Dict[str, Any]]: 156 | """ 157 | Create a skeleton for JSON export. Include scrape details at the top. 158 | 159 | :param Namespace args: A `Namespace` object containing all arguments used 160 | in the CLI. 161 | :param str limit: A `str` indicating the number of results to return. 162 | :param Submission submission: PRAW `Submission` object. 163 | :param str url: The submission's URL. 164 | 165 | :returns: A `dict[str, dict[str, Any]]` containing scrape settings and 166 | all scrape data. 167 | :rtype: `dict[str, dict[str, Any]]` 168 | """ 169 | 170 | metadata_status = Status( 171 | "Extracted submission metadata.", "Extracting submission metadata.", "white" 172 | ) 173 | 174 | metadata_status.start() 175 | skeleton = { 176 | "scrape_settings": { 177 | "n_results": int(limit) if int(limit) > 0 else "all", 178 | "style": "structured" if not args.raw else "raw", 179 | "url": url, 180 | }, 181 | "data": { 182 | "submission_metadata": { 183 | "author": "u/" + submission.author.name 184 | if hasattr(submission.author, "name") 185 | else "[deleted]", 186 | "created_utc": convert_time(submission.created_utc), 187 | "distinguished": submission.distinguished, 188 | "edited": submission.edited 189 | if submission.edited == False 190 | else convert_time(submission.edited), 191 | "is_original_content": submission.is_original_content, 192 | "is_self": submission.is_self, 193 | "link_flair_text": submission.link_flair_text, 194 | "locked": submission.locked, 195 | "nsfw": submission.over_18, 196 | "num_comments": submission.num_comments, 197 | "permalink": submission.permalink, 198 | "score": submission.score, 199 | "selftext": submission.selftext, 200 | "spoiler": submission.spoiler, 201 | "stickied": submission.stickied, 202 | "subreddit": submission.subreddit.display_name, 203 | "title": submission.title, 204 | "upvote_ratio": submission.upvote_ratio, 205 | }, 206 | "comments": None, 207 | }, 208 | } 209 | 210 | try: 211 | skeleton["data"]["submission_metadata"][ 212 | "gallery_data" 213 | ] = submission.gallery_data 214 | skeleton["data"]["submission_metadata"][ 215 | "media_metadata" 216 | ] = submission.media_metadata 217 | 218 | skeleton["data"]["submission_metadata"] = dict( 219 | sorted(skeleton["data"]["submission_metadata"].items()) 220 | ) 221 | except AttributeError: 222 | pass 223 | 224 | metadata_status.succeed() 
225 | 226 | return skeleton 227 | 228 | @staticmethod 229 | def _determine_export(args: Namespace, data: Dict[str, Any], f_name: str) -> None: 230 | """ 231 | Export either structured or raw comments. 232 | 233 | :param Namespace args: A `Namespace` object containing all arguments used 234 | in the CLI. 235 | :param dict[str, Any] data: A `dict[str, Any]` containing all scraped data. 236 | :param str f_name: The filename. 237 | """ 238 | 239 | if args.raw: 240 | export_status = f"Exporting {data['scrape_settings']['n_results']} comments in raw format." 241 | Halo().info(export_status) 242 | logging.info(export_status) 243 | Export.export(data, f_name, "json", "comments") 244 | else: 245 | export_status = f"Exporting {data['scrape_settings']['n_results']} comments in structured format." 246 | Halo().info(export_status) 247 | logging.info(export_status) 248 | Export.write_structured_comments(data, f_name) 249 | 250 | @staticmethod 251 | def write(args: Namespace, c_master: Dict[str, Any], reddit: Reddit): 252 | """ 253 | Get, sort, then write scraped comments to CSV or JSON. 254 | 255 | :param Namespace args: A `Namespace` object containing all arguments used 256 | in the CLI. 257 | :param dict[str, Any] data: A `dict[str, Any]` containing all scraped data. 258 | :param Reddit reddit: PRAW Reddit instance. 259 | """ 260 | 261 | for url, limit in c_master.items(): 262 | submission = reddit.submission(url=url) 263 | data = Write._make_json_skeleton(args, limit, submission, url) 264 | data["data"]["comments"] = GetSort(args, submission, url).get_sort( 265 | args, limit 266 | ) 267 | 268 | f_name = NameFile().c_fname(args, limit, submission.title) 269 | Write._determine_export(args, data, f_name) 270 | 271 | print() 272 | Halo( 273 | color="green", 274 | text=Style.BRIGHT 275 | + Fore.GREEN 276 | + f"JSON file for '{submission.title}' comments created.", 277 | ).succeed() 278 | print() 279 | 280 | 281 | class RunComments: 282 | """ 283 | Run the comments scraper. 284 | """ 285 | 286 | @staticmethod 287 | @LogExport.log_export 288 | @LogPRAWScraper.scraper_timer("comments") 289 | def run(args: Namespace, reddit: Reddit) -> Dict[str, Any]: 290 | """ 291 | Run comments scraper. 292 | 293 | :param Namespace args: A `Namespace` object containing all arguments used 294 | in the CLI. 295 | :param Reddit reddit: PRAW Reddit instance. 296 | 297 | :returns: A `dict[str, Any]` containing all submission comments scrape 298 | settings. 299 | :rtype: `dict[str, Any]` 300 | """ 301 | 302 | PRAWTitles.c_title() 303 | 304 | post_list = GetPRAWScrapeSettings().create_list(args, "comments") 305 | not_posts, posts = Validation.validate(post_list, reddit, "comments") 306 | c_master = make_none_dict(posts) 307 | GetPRAWScrapeSettings().get_settings(args, not_posts, c_master, "comments") 308 | 309 | Write.write(args, c_master, reddit) 310 | 311 | return c_master 312 | --------------------------------------------------------------------------------