├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── README_PyPI.md ├── images ├── demo.gif ├── imgur_app.png ├── logo.png └── reddit_app.png ├── pyproject.toml ├── requirements.txt ├── setup.cfg └── src └── saveddit ├── __init__.py ├── _version.py ├── configuration.py ├── multireddit_downloader.py ├── multireddit_downloader_config.py ├── saveddit.py ├── search_config.py ├── search_subreddits.py ├── submission_downloader.py ├── subreddit_downloader.py ├── subreddit_downloader_config.py ├── user_downloader.py └── user_downloader_config.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | .DS_Store 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | 135 | # pytype static type analyzer 136 | .pytype/ 137 | 138 | # Cython debug symbols 139 | cython_debug/ 140 | 141 | # Configuration file 142 | **/user_config.yaml -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | Contributions are welcomed. Open a pull-request or an issue. 3 | 4 | ## Code of conduct 5 | This project adheres to the [Open Code of Conduct][code-of-conduct]. By participating, you are expected to honor this code. 6 | 7 | [code-of-conduct]: https://github.com/spotify/code-of-conduct/blob/master/code-of-conduct.md 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Pranav 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include images/* 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | ![](images/logo.png) 3 | 4 | 5 | 6 | 7 | [PyPI version badge] 8 | 9 | 10 | [license badge] 11 | 12 |
13 | 14 | `saveddit` is a bulk media downloader for reddit 15 | 16 | ```console 17 | pip3 install saveddit 18 | ``` 19 | 20 | ## Setting up authorization 21 | 22 | * [Register an application with Reddit](https://ssl.reddit.com/prefs/apps/) 23 | - Write down your client ID and secret from the app 24 | - More about Reddit API access [here](https://ssl.reddit.com/wiki/api) 25 | - Wiki page about Reddit OAuth2 applications [here](https://github.com/reddit-archive/reddit/wiki/OAuth2) 26 | 27 |
28 | ![](images/reddit_app.png) 29 |
30 | 31 | * [Register an application with Imgur](https://api.imgur.com/oauth2/addclient) 32 | - Write down the Imgur client ID from the app 33 | 34 |
35 | ![](images/imgur_app.png) 36 |
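With both apps registered, you can sanity-check the Reddit credentials before wiring them into `saveddit`. Below is a minimal sketch using PRAW (the Reddit client library `saveddit` itself uses); the client ID and secret are placeholders for the values you wrote down:

```python
# Credential sanity check -- a sketch, not part of saveddit itself.
import praw

reddit = praw.Reddit(
    client_id="YOUR_REDDIT_CLIENT_ID",          # placeholder
    client_secret="YOUR_REDDIT_CLIENT_SECRET",  # placeholder
    user_agent="saveddit credential check",
)

# PRAW objects are lazy: reading an attribute forces an authenticated
# API call, so bad credentials raise an exception here instead of
# printing the subreddit title.
print(reddit.subreddit("announcements").title)
```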
37 | 38 | These registrations will authorize you to use the Reddit and Imgur APIs to download publicly available information. 39 | 40 | ## User configuration 41 | 42 | The first time you run `saveddit`, you will see something like this: 43 | 44 | ```console 45 | foo@bar:~$ saveddit 46 | Retrieving configuration from ~/.saveddit/user_config.yaml file 47 | No configuration file found. 48 | Creating one. Would you like to edit it now? 49 | > Choose Y for yes and N for no 50 | ``` 51 | 52 | If you choose 'yes', the program will prompt you to enter these credentials: 53 | - Your imgur client ID 54 | - Your reddit client ID 55 | - Your reddit client secret 56 | - Your reddit username 57 | 58 | If you choose 'no', the program will create a file that you can edit later. To edit it: 59 | 60 | * Open the generated `~/.saveddit/user_config.yaml` 61 | * Update the client IDs and secrets from the previous step 62 | * If you plan on using the `user` API, add your reddit username as well 63 | 64 | ```yaml 65 | imgur_client_id: '' 66 | reddit_client_id: '' 67 | reddit_client_secret: '' 68 | reddit_username: '' 69 | ``` 70 | 71 | ## Download from Subreddit 72 | 73 | ```console 74 | foo@bar:~$ saveddit subreddit -h 75 | Retrieving configuration from /Users/pranav/.saveddit/user_config.yaml file 76 | 77 | usage: saveddit subreddit [-h] [-f categories [categories ...]] [-l post_limit] [--skip-comments] [--skip-meta] [--skip-videos] -o output_path subreddits [subreddits ...] 78 | 79 | positional arguments: 80 | subreddits Names of subreddits to download, e.g., AskReddit 81 | 82 | optional arguments: 83 | -h, --help show this help message and exit 84 | -f categories [categories ...] 85 | Categories of posts to download (default: ['hot', 'new', 'rising', 'controversial', 'top', 'gilded']) 86 | -l post_limit Limit the number of submissions downloaded in each category (default: None, i.e., all submissions) 87 | --skip-comments When true, saveddit will not save comments to a comments.json file 88 | --skip-meta When true, saveddit will not save meta to a submission.json file on submissions 89 | --skip-videos When true, saveddit will not download videos (e.g., gfycat, redgifs, youtube, v.redd.it links) 90 | --all-comments When true, saveddit will download all the comments in a post instead of just downloading the top ones. 91 | -o output_path Directory where saveddit will save downloaded content 92 | ``` 93 | 94 | ```console 95 | foo@bar:~$ saveddit subreddit pics -f hot -l 5 -o ~/Desktop 96 | ``` 97 | 98 | ```console 99 | foo@bar:~$ tree -L 4 ~/Desktop/www.reddit.com 100 | /Users/pranav/Desktop/www.reddit.com 101 | └── r 102 | └── pics 103 | └── hot 104 | ├── 000_Prince_Philip_Duke_of_Edinburgh_... 105 | ├── 001_Day_10_of_Nobody_Noticing_the_Ap... 106 | ├── 002_First_edited_picture 107 | ├── 003_Reorganized_a_few_months_ago_and... 108 | └── 004_Van_Gogh_inspired_rainy_street_I... 109 | ``` 110 | 111 | You can download from multiple subreddits and use multiple filters: 112 | 113 | ```console 114 | foo@bar:~$ saveddit subreddit funny AskReddit -f hot top new rising -l 5 -o ~/Downloads/Reddit/.
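foo@bar:~$ # (illustrative) the category filters compose with the skip flags;
foo@bar:~$ # e.g., download media only, with no comments.json or submission.json files:
foo@bar:~$ saveddit subreddit funny AskReddit -f top -l 25 --skip-comments --skip-meta -o ~/Downloads/Reddit/.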
115 | ``` 116 | 117 | The downloads from each subreddit go to a separate folder, like so: 118 | 119 | ```console 120 | foo@bar:~$ tree -L 3 ~/Downloads/Reddit/www.reddit.com 121 | /Users/pranav/Downloads/Reddit/www.reddit.com 122 | └── r 123 | ├── AskReddit 124 | │ ├── hot 125 | │ ├── new 126 | │ ├── rising 127 | │ └── top 128 | └── funny 129 | ├── hot 130 | ├── new 131 | ├── rising 132 | └── top 133 | ``` 134 | 135 | ## Download from anonymous Multireddit 136 | 137 | To download from an anonymous multireddit, use the `multireddit` option and pass one or more subreddit names: 138 | 139 | ```console 140 | foo@bar:~$ saveddit multireddit -h 141 | usage: saveddit multireddit [-h] [-f categories [categories ...]] [-l post_limit] [--skip-comments] [--skip-meta] [--skip-videos] -o output_path subreddits [subreddits ...] 142 | 143 | positional arguments: 144 | subreddits Names of subreddits to download, e.g., aww, pics. The downloads will be stored in <output_path>/www.reddit.com/m/aww+pics/. 145 | 146 | optional arguments: 147 | -h, --help show this help message and exit 148 | -f categories [categories ...] 149 | Categories of posts to download (default: ['hot', 'new', 'random_rising', 'rising', 'controversial', 'top', 'gilded']) 150 | -l post_limit Limit the number of submissions downloaded in each category (default: None, i.e., all submissions) 151 | --skip-comments When true, saveddit will not save comments to a comments.json file 152 | --skip-meta When true, saveddit will not save meta to a submission.json file on submissions 153 | --skip-videos When true, saveddit will not download videos (e.g., gfycat, redgifs, youtube, v.redd.it links) 154 | -o output_path Directory where saveddit will save downloaded content 155 | ``` 156 | 157 | ```console 158 | foo@bar:~$ saveddit multireddit EarthPorn NaturePics -f hot -l 5 -o ~/Desktop 159 | ``` 160 | 161 | Anonymous multireddits are saved in `<output_path>/www.reddit.com/m/<subreddit_names>/<category>/` like so: 162 | 163 | ```console 164 | tree -L 4 ~/Desktop/www.reddit.com 165 | /Users/pranav/Desktop/www.reddit.com 166 | └── m 167 | └── EarthPorn+NaturePics 168 | └── hot 169 | ├── 000_Banning_State_Park_Minnesota_OC_... 170 | ├── 001_Misty_forest_in_the_mountains_of... 171 | ├── 002_One_of_the_highlights_of_my_last... 172 | ├── 003__OC_Japan_Kyoto_Garden_of_the_Go... 173 | └── 004_Sunset_at_Mt_Rainier_National_Pa... 174 | ``` 175 | 176 | ## Download from User's page 177 | 178 | ```console 179 | foo@bar:~$ saveddit user -h 180 | usage: saveddit user [-h] users [users ...] {saved,gilded,submitted,multireddits,upvoted,comments} ...
181 | 182 | positional arguments: 183 | users Names of users to download, e.g., Poem_for_your_sprog 184 | {saved,gilded,submitted,multireddits,upvoted,comments} 185 | 186 | optional arguments: 187 | -h, --help show this help message and exit 188 | ``` 189 | 190 | Here's a usage example for downloading the top 5 comments made by `Poem_for_your_sprog`: 191 | 192 | ```console 193 | foo@bar:~$ saveddit user "Poem_for_your_sprog" comments -s top -l 5 -o ~/Desktop 194 | ``` 195 | 196 | Here's another example for downloading `kemitche`'s multireddits: 197 | 198 | ```console 199 | foo@bar:~$ saveddit user kemitche multireddits -n reddit -f hot -l 5 -o ~/Desktop 200 | ``` 201 | 202 | User-specific content is downloaded to `<output_path>/www.reddit.com/u/<username>/...` like so: 203 | 204 | ```console 205 | foo@bar:~$ tree ~/Desktop/www.reddit.com 206 | /Users/pranav/Desktop/www.reddit.com 207 | └── u 208 | ├── Poem_for_your_sprog 209 | │ ├── comments 210 | │ │ └── top 211 | │ │ ├── 000_Comment_my_name_is_Cow_and_wen_its_ni....json 212 | │ │ ├── 001_Comment_It_stopped_at_six_and_life....json 213 | │ │ ├── 002_Comment__Perhaps_I_could_listen_to_podca....json 214 | │ │ ├── 003_Comment__I_don_t_have_regret_for_the_thi....json 215 | │ │ └── 004_Comment__So_throw_off_the_chains_of_oppr....json 216 | │ └── user.json 217 | └── kemitche 218 | ├── m 219 | │ └── reddit 220 | │ └── hot 221 | │ ├── 000_When_posting_to_my_u_channel_NSF... 222 | │ │ ├── comments.json 223 | │ │ └── submission.json 224 | │ ├── 001_How_to_remove_popular_near_you 225 | │ │ ├── comments.json 226 | │ │ └── submission.json 227 | │ ├── 002__IOS_2021_13_0_Reddit_is_just_su... 228 | │ │ ├── comments.json 229 | │ │ └── submission.json 230 | │ ├── 003_The_Approve_User_button_should_n... 231 | │ │ ├── comments.json 232 | │ │ └── submission.json 233 | │ └── 004_non_moderators_unable_to_view_su... 234 | │ ├── comments.json 235 | │ └── submission.json 236 | └── user.json 237 | ``` 238 | 239 | ## Search and Download 240 | 241 | `saveddit` supports searching subreddits and downloading the search results: 242 | 243 | ```console 244 | foo@bar:~$ saveddit search -h 245 | usage: saveddit search [-h] -q query [-s sort] [-t time_filter] [--include-nsfw] [--skip-comments] [--skip-meta] [--skip-videos] -o output_path subreddits [subreddits ...]
246 | 247 | positional arguments: 248 | subreddits Names of subreddits to search, e.g., all, aww, pics 249 | 250 | optional arguments: 251 | -h, --help show this help message and exit 252 | -q query Search query string 253 | -s sort Sort to apply on search (default: relevance, choices: [relevance, hot, top, new, comments]) 254 | -t time_filter Time filter to apply on search (default: all, choices: [all, day, hour, month, week, year]) 255 | --include-nsfw When true, saveddit will include NSFW results in search 256 | --skip-comments When true, saveddit will not save comments to a comments.json file 257 | --skip-meta When true, saveddit will not save meta to a submission.json file on submissions 258 | --skip-videos When true, saveddit will not download videos (e.g., gfycat, redgifs, youtube, v.redd.it links) 259 | -o output_path Directory where saveddit will save downloaded content 260 | ``` 261 | 262 | e.g., 263 | 264 | ```console 265 | foo@bar:~$ saveddit search soccer -q "Chelsea" -o ~/Desktop 266 | ``` 267 | 268 | The downloaded search results are stored in `<output_path>/www.reddit.com/q/<query>/<subreddits>/<sort>/...` 269 | 270 | ```console 271 | foo@bar:~$ tree -L 4 ~/Desktop/www.reddit.com/q 272 | /Users/pranav/Desktop/www.reddit.com/q 273 | └── Chelsea 274 | └── soccer 275 | └── relevance 276 | ├── 000__Official_Results_for_UEFA_Champ... 277 | ├── 001_Porto_0_1_Chelsea_Mason_Mount_32... 278 | ├── 002_Crystal_Palace_0_2_Chelsea_Chris... 279 | ├── 003_Post_Match_Thread_Chelsea_2_5_We... 280 | ├── 004_Match_Thread_Porto_vs_Chelsea_UE... 281 | ├── 005_Crystal_Palace_1_4_Chelsea_Chris... 282 | ├── 006_Porto_0_2_Chelsea_Ben_Chilwell_8... 283 | ├── 007_Post_Match_Thread_Porto_0_2_Chel... 284 | ├── 008_UCL_Quaterfinalists_are_Bayern_D... 285 | ├── 009__MD_Mino_Raiola_and_Haaland_s_fa... 286 | ├── 010_Chelsea_2_5_West_Brom_Callum_Rob... 287 | ├── 011_Chelsea_1_2_West_Brom_Matheus_Pe... 288 | ├── 012__Bild_Sport_via_Sport_Witness_Ch... 289 | ├── 013_Match_Thread_Chelsea_vs_West_Bro... 290 | ├── 014_Chelsea_1_3_West_Brom_Callum_Rob... 291 | ├── 015_Match_Thread_Chelsea_vs_Atletico... 292 | ├── 016_Stefan_Savić_Atlético_Madrid_str... 293 | ├── 017_Chelsea_1_0_West_Brom_Christian_... 294 | └── 018_Alvaro_Morata_I_ve_never_had_dep... 295 | ``` 296 | 297 | ## Supported Links 298 | 299 | * Direct links to images or videos, e.g., `.png`, `.jpg`, `.mp4`, `.gif`, etc. 300 | * Reddit galleries `reddit.com/gallery/...` 301 | * Reddit videos `v.redd.it/...` 302 | * Gfycat links `gfycat.com/...` 303 | * Redgif links `redgifs.com/...` 304 | * Imgur images `imgur.com/...` 305 | * Imgur albums `imgur.com/a/...` and `imgur.com/gallery/...` 306 | * Youtube links `youtube.com/...` and `youtu.be/...` 307 | * These [sites](https://ytdl-org.github.io/youtube-dl/supportedsites.html) supported by `youtube-dl` 308 | * Self posts 309 | * For all other cases, `saveddit` will simply fetch the HTML of the URL 310 | 311 | ## Contributing 312 | Contributions are welcome; have a look at the [CONTRIBUTING.md](CONTRIBUTING.md) document for more information. 313 | 314 | ## License 315 | The project is available under the [MIT](https://opensource.org/licenses/MIT) license.
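As an aside: the `saveddit` CLI is a thin wrapper that wires argument values into downloader classes (see `src/saveddit/saveddit.py` in this repository). Below is a minimal sketch of driving the subreddit downloader from Python, mirroring the CLI's own calls; it assumes `~/.saveddit/user_config.yaml` is already populated, since the downloaders read it when loaded:

```python
# Sketch: use saveddit's downloader classes directly from Python.
import os
from saveddit.subreddit_downloader import SubredditDownloader

downloader = SubredditDownloader("pics")
downloader.download(
    os.path.expanduser("~/Desktop"),  # unlike the shell, Python won't expand '~'
    categories=["hot"],               # any subset of the default categories
    post_limit=5,                     # None means "all submissions"
    skip_videos=False,
    skip_meta=False,
    skip_comments=False,
    download_all_comments=False,
)
```

The `multireddit`, `search`, and `user` subcommands map onto `MultiredditDownloader`, `SearchSubreddits`, and `UserDownloader` in the same way.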
316 | -------------------------------------------------------------------------------- /README_PyPI.md: -------------------------------------------------------------------------------- 1 | ![](https://raw.githubusercontent.com/p-ranav/saveddit/master/images/logo.png) 2 | 3 | `saveddit` is a bulk media downloader for reddit 4 | 5 | ```console 6 | pip3 install saveddit 7 | ``` 8 | 9 | ## Setting up authorization 10 | 11 | * [Register an application with Reddit](https://ssl.reddit.com/prefs/apps/) 12 | - Write down your client ID and secret from the app 13 | - More about Reddit API access [here](https://ssl.reddit.com/wiki/api) 14 | - Wiki page about Reddit OAuth2 applications [here](https://github.com/reddit-archive/reddit/wiki/OAuth2) 15 | 16 | ![](https://raw.githubusercontent.com/p-ranav/saveddit/master/images/reddit_app.png) 17 | 18 | * [Register an application with Imgur](https://api.imgur.com/oauth2/addclient) 19 | - Write down the Imgur client ID from the app 20 | 21 | ![](https://raw.githubusercontent.com/p-ranav/saveddit/master/images/imgur_app.png) 22 | 23 | These registrations will authorize you to use the Reddit and Imgur APIs to download publicly available information. 24 | 25 | ## User configuration 26 | 27 | The first time you run `saveddit`, you will see something like this: 28 | 29 | ```console 30 | foo@bar:~$ saveddit 31 | Retrieving configuration from ~/.saveddit/user_config.yaml file 32 | No configuration file found. 33 | Creating one. Would you like to edit it now? 34 | > Choose Y for yes and N for no 35 | ``` 36 | 37 | If you choose 'yes', the program will prompt you to enter these credentials: 38 | - Your imgur client ID 39 | - Your reddit client ID 40 | - Your reddit client secret 41 | - Your reddit username 42 | 43 | If you choose 'no', the program will create a file that you can edit later. To edit it: 44 | 45 | * Open the generated `~/.saveddit/user_config.yaml` 46 | * Update the client IDs and secrets from the previous step 47 | * If you plan on using the `user` API, add your reddit username as well 48 | 49 | ```yaml 50 | imgur_client_id: '' 51 | reddit_client_id: '' 52 | reddit_client_secret: '' 53 | reddit_username: '' 54 | ``` 55 | 56 | ## Download from Subreddit 57 | 58 | ```console 59 | foo@bar:~$ saveddit subreddit -h 60 | Retrieving configuration from /Users/pranav/.saveddit/user_config.yaml file 61 | 62 | usage: saveddit subreddit [-h] [-f categories [categories ...]] [-l post_limit] [--skip-comments] [--skip-meta] [--skip-videos] -o output_path subreddits [subreddits ...] 63 | 64 | positional arguments: 65 | subreddits Names of subreddits to download, e.g., AskReddit 66 | 67 | optional arguments: 68 | -h, --help show this help message and exit 69 | -f categories [categories ...] 70 | Categories of posts to download (default: ['hot', 'new', 'rising', 'controversial', 'top', 'gilded']) 71 | -l post_limit Limit the number of submissions downloaded in each category (default: None, i.e., all submissions) 72 | --skip-comments When true, saveddit will not save comments to a comments.json file 73 | --skip-meta When true, saveddit will not save meta to a submission.json file on submissions 74 | --skip-videos When true, saveddit will not download videos (e.g., gfycat, redgifs, youtube, v.redd.it links) 75 | --all-comments When true, saveddit will download all the comments in a post instead of just downloading the top ones.
76 | -o output_path Directory where saveddit will save downloaded content 77 | ``` 78 | 79 | ```console 80 | foo@bar:~$ saveddit subreddit pics -f hot -l 5 -o ~/Desktop 81 | ``` 82 | 83 | ```console 84 | foo@bar:~$ tree -L 4 ~/Desktop/www.reddit.com 85 | /Users/pranav/Desktop/www.reddit.com 86 | └── r 87 | └── pics 88 | └── hot 89 | ├── 000_Prince_Philip_Duke_of_Edinburgh_... 90 | ├── 001_Day_10_of_Nobody_Noticing_the_Ap... 91 | ├── 002_First_edited_picture 92 | ├── 003_Reorganized_a_few_months_ago_and... 93 | └── 004_Van_Gogh_inspired_rainy_street_I... 94 | ``` 95 | 96 | You can download from multiple subreddits and use multiple filters: 97 | 98 | ```console 99 | foo@bar:~$ saveddit subreddit funny AskReddit -f hot top new rising -l 5 -o ~/Downloads/Reddit/. 100 | ``` 101 | 102 | The downloads from each subreddit go to a separate folder, like so: 103 | 104 | ```console 105 | foo@bar:~$ tree -L 3 ~/Downloads/Reddit/www.reddit.com 106 | /Users/pranav/Downloads/Reddit/www.reddit.com 107 | └── r 108 | ├── AskReddit 109 | │ ├── hot 110 | │ ├── new 111 | │ ├── rising 112 | │ └── top 113 | └── funny 114 | ├── hot 115 | ├── new 116 | ├── rising 117 | └── top 118 | ``` 119 | 120 | ## Download from anonymous Multireddit 121 | 122 | To download from an anonymous multireddit, use the `multireddit` option and pass one or more subreddit names: 123 | 124 | ```console 125 | foo@bar:~$ saveddit multireddit -h 126 | usage: saveddit multireddit [-h] [-f categories [categories ...]] [-l post_limit] [--skip-comments] [--skip-meta] [--skip-videos] -o output_path subreddits [subreddits ...] 127 | 128 | positional arguments: 129 | subreddits Names of subreddits to download, e.g., aww, pics. The downloads will be stored in <output_path>/www.reddit.com/m/aww+pics/. 130 | 131 | optional arguments: 132 | -h, --help show this help message and exit 133 | -f categories [categories ...] 134 | Categories of posts to download (default: ['hot', 'new', 'random_rising', 'rising', 'controversial', 'top', 'gilded']) 135 | -l post_limit Limit the number of submissions downloaded in each category (default: None, i.e., all submissions) 136 | --skip-comments When true, saveddit will not save comments to a comments.json file 137 | --skip-meta When true, saveddit will not save meta to a submission.json file on submissions 138 | --skip-videos When true, saveddit will not download videos (e.g., gfycat, redgifs, youtube, v.redd.it links) 139 | -o output_path Directory where saveddit will save downloaded content 140 | ``` 141 | 142 | ```console 143 | foo@bar:~$ saveddit multireddit EarthPorn NaturePics -f hot -l 5 -o ~/Desktop 144 | ``` 145 | 146 | Anonymous multireddits are saved in `<output_path>/www.reddit.com/m/<subreddit_names>/<category>/` like so: 147 | 148 | ```console 149 | tree -L 4 ~/Desktop/www.reddit.com 150 | /Users/pranav/Desktop/www.reddit.com 151 | └── m 152 | └── EarthPorn+NaturePics 153 | └── hot 154 | ├── 000_Banning_State_Park_Minnesota_OC_... 155 | ├── 001_Misty_forest_in_the_mountains_of... 156 | ├── 002_One_of_the_highlights_of_my_last... 157 | ├── 003__OC_Japan_Kyoto_Garden_of_the_Go... 158 | └── 004_Sunset_at_Mt_Rainier_National_Pa... 159 | ``` 160 | 161 | ## Download from User's page 162 | 163 | ```console 164 | foo@bar:~$ saveddit user -h 165 | usage: saveddit user [-h] users [users ...] {saved,gilded,submitted,multireddits,upvoted,comments} ...
166 | 167 | positional arguments: 168 | users Names of users to download, e.g., Poem_for_your_sprog 169 | {saved,gilded,submitted,multireddits,upvoted,comments} 170 | 171 | optional arguments: 172 | -h, --help show this help message and exit 173 | ``` 174 | 175 | Here's a usage example for downloading the top 5 comments made by `Poem_for_your_sprog`: 176 | 177 | ```console 178 | foo@bar:~$ saveddit user "Poem_for_your_sprog" comments -s top -l 5 -o ~/Desktop 179 | ``` 180 | 181 | Here's another example for downloading `kemitche`'s multireddits: 182 | 183 | ```console 184 | foo@bar:~$ saveddit user kemitche multireddits -n reddit -f hot -l 5 -o ~/Desktop 185 | ``` 186 | 187 | User-specific content is downloaded to `<output_path>/www.reddit.com/u/<username>/...` like so: 188 | 189 | ```console 190 | foo@bar:~$ tree ~/Desktop/www.reddit.com 191 | /Users/pranav/Desktop/www.reddit.com 192 | └── u 193 | ├── Poem_for_your_sprog 194 | │ ├── comments 195 | │ │ └── top 196 | │ │ ├── 000_Comment_my_name_is_Cow_and_wen_its_ni....json 197 | │ │ ├── 001_Comment_It_stopped_at_six_and_life....json 198 | │ │ ├── 002_Comment__Perhaps_I_could_listen_to_podca....json 199 | │ │ ├── 003_Comment__I_don_t_have_regret_for_the_thi....json 200 | │ │ └── 004_Comment__So_throw_off_the_chains_of_oppr....json 201 | │ └── user.json 202 | └── kemitche 203 | ├── m 204 | │ └── reddit 205 | │ └── hot 206 | │ ├── 000_When_posting_to_my_u_channel_NSF... 207 | │ │ ├── comments.json 208 | │ │ └── submission.json 209 | │ ├── 001_How_to_remove_popular_near_you 210 | │ │ ├── comments.json 211 | │ │ └── submission.json 212 | │ ├── 002__IOS_2021_13_0_Reddit_is_just_su... 213 | │ │ ├── comments.json 214 | │ │ └── submission.json 215 | │ ├── 003_The_Approve_User_button_should_n... 216 | │ │ ├── comments.json 217 | │ │ └── submission.json 218 | │ └── 004_non_moderators_unable_to_view_su... 219 | │ ├── comments.json 220 | │ └── submission.json 221 | └── user.json 222 | ``` 223 | 224 | ## Search and Download 225 | 226 | `saveddit` supports searching subreddits and downloading the search results: 227 | 228 | ```console 229 | foo@bar:~$ saveddit search -h 230 | usage: saveddit search [-h] -q query [-s sort] [-t time_filter] [--include-nsfw] [--skip-comments] [--skip-meta] [--skip-videos] -o output_path subreddits [subreddits ...]
231 | 232 | positional arguments: 233 | subreddits Names of subreddits to search, e.g., all, aww, pics 234 | 235 | optional arguments: 236 | -h, --help show this help message and exit 237 | -q query Search query string 238 | -s sort Sort to apply on search (default: relevance, choices: [relevance, hot, top, new, comments]) 239 | -t time_filter Time filter to apply on search (default: all, choices: [all, day, hour, month, week, year]) 240 | --include-nsfw When true, saveddit will include NSFW results in search 241 | --skip-comments When true, saveddit will not save comments to a comments.json file 242 | --skip-meta When true, saveddit will not save meta to a submission.json file on submissions 243 | --skip-videos When true, saveddit will not download videos (e.g., gfycat, redgifs, youtube, v.redd.it links) 244 | -o output_path Directory where saveddit will save downloaded content 245 | ``` 246 | 247 | e.g., 248 | 249 | ```console 250 | foo@bar:~$ saveddit search soccer -q "Chelsea" -o ~/Desktop 251 | ``` 252 | 253 | The downloaded search results are stored in `<output_path>/www.reddit.com/q/<query>/<subreddits>/<sort>/...` 254 | 255 | ```console 256 | foo@bar:~$ tree -L 4 ~/Desktop/www.reddit.com/q 257 | /Users/pranav/Desktop/www.reddit.com/q 258 | └── Chelsea 259 | └── soccer 260 | └── relevance 261 | ├── 000__Official_Results_for_UEFA_Champ... 262 | ├── 001_Porto_0_1_Chelsea_Mason_Mount_32... 263 | ├── 002_Crystal_Palace_0_2_Chelsea_Chris... 264 | ├── 003_Post_Match_Thread_Chelsea_2_5_We... 265 | ├── 004_Match_Thread_Porto_vs_Chelsea_UE... 266 | ├── 005_Crystal_Palace_1_4_Chelsea_Chris... 267 | ├── 006_Porto_0_2_Chelsea_Ben_Chilwell_8... 268 | ├── 007_Post_Match_Thread_Porto_0_2_Chel... 269 | ├── 008_UCL_Quaterfinalists_are_Bayern_D... 270 | ├── 009__MD_Mino_Raiola_and_Haaland_s_fa... 271 | ├── 010_Chelsea_2_5_West_Brom_Callum_Rob... 272 | ├── 011_Chelsea_1_2_West_Brom_Matheus_Pe... 273 | ├── 012__Bild_Sport_via_Sport_Witness_Ch... 274 | ├── 013_Match_Thread_Chelsea_vs_West_Bro... 275 | ├── 014_Chelsea_1_3_West_Brom_Callum_Rob... 276 | ├── 015_Match_Thread_Chelsea_vs_Atletico... 277 | ├── 016_Stefan_Savić_Atlético_Madrid_str... 278 | ├── 017_Chelsea_1_0_West_Brom_Christian_... 279 | └── 018_Alvaro_Morata_I_ve_never_had_dep... 280 | ``` 281 | 282 | ## Supported Links 283 | 284 | * Direct links to images or videos, e.g., `.png`, `.jpg`, `.mp4`, `.gif`, etc. 285 | * Reddit galleries `reddit.com/gallery/...` 286 | * Reddit videos `v.redd.it/...` 287 | * Gfycat links `gfycat.com/...` 288 | * Redgif links `redgifs.com/...` 289 | * Imgur images `imgur.com/...` 290 | * Imgur albums `imgur.com/a/...` and `imgur.com/gallery/...` 291 | * Youtube links `youtube.com/...` and `youtu.be/...` 292 | * These [sites](https://ytdl-org.github.io/youtube-dl/supportedsites.html) supported by `youtube-dl` 293 | * Self posts 294 | * For all other cases, `saveddit` will simply fetch the HTML of the URL 295 | 296 | ## Contributing 297 | Contributions are welcome; have a look at the [CONTRIBUTING.md](CONTRIBUTING.md) document for more information. 298 | 299 | ## License 300 | The project is available under the [MIT](https://opensource.org/licenses/MIT) license.
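One detail worth knowing before reading the sources below: every downloader composes its output directory the same way, by joining `<output_path>/www.reddit.com/` with a kind prefix (`r`, `m`, `u`, or `q`), a name, a category, and a zero-padded, sanitized post title. A condensed sketch of that logic (the helper function is hypothetical; the sanitization rules mirror `submission_downloader.py`):

```python
# Sketch: how saveddit composes a submission's output directory.
import os
import re

def submission_dir(output_path, kind, name, category, index, title):
    # Non-word characters become underscores (as in submission_downloader.py) ...
    title = re.sub(r"\W+", "_", title)
    # ... and long titles are truncated to 32 characters plus "...".
    if len(title) > 32:
        title = title[:32] + "..."
    post_dir = str(index).zfill(3) + "_" + title
    return os.path.join(output_path, "www.reddit.com", kind, name, category, post_dir)

print(submission_dir("/tmp", "r", "pics", "hot", 4, "Van Gogh inspired rainy street"))
# -> /tmp/www.reddit.com/r/pics/hot/004_Van_Gogh_inspired_rainy_street
```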
301 | -------------------------------------------------------------------------------- /images/demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/p-ranav/saveddit/f4aa0749eec1020bb9927c6dd7fd5059a6d989af/images/demo.gif -------------------------------------------------------------------------------- /images/imgur_app.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/p-ranav/saveddit/f4aa0749eec1020bb9927c6dd7fd5059a6d989af/images/imgur_app.png -------------------------------------------------------------------------------- /images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/p-ranav/saveddit/f4aa0749eec1020bb9927c6dd7fd5059a6d989af/images/logo.png -------------------------------------------------------------------------------- /images/reddit_app.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/p-ranav/saveddit/f4aa0749eec1020bb9927c6dd7fd5059a6d989af/images/reddit_app.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=42", 4 | "wheel" 5 | ] 6 | build-backend = "setuptools.build_meta" -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | colorama==0.4.4 2 | coloredlogs==15.0 3 | verboselogs==1.7 4 | praw==7.2.0 5 | tqdm==4.60.0 6 | ffmpeg_python==0.2.0 7 | youtube_dl==2021.4.7 8 | requests==2.25.1 9 | beautifulsoup4==4.9.3 10 | PyYAML==5.4.1 11 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | # replace with your username: 3 | name = saveddit 4 | version = 2.2.1 5 | author = Pranav Srinivas Kumar 6 | author_email = pranav.srinivas.kumar@gmail.com 7 | description = Bulk Downloader for Reddit 8 | long_description = file: README_PyPI.md 9 | long_description_content_type = text/markdown 10 | url = https://github.com/p-ranav/saveddit 11 | project_urls = 12 | Bug Tracker = https://github.com/p-ranav/saveddit/issues 13 | classifiers = 14 | Programming Language :: Python :: 3 15 | License :: OSI Approved :: MIT License 16 | Operating System :: OS Independent 17 | 18 | [options] 19 | package_dir = 20 | = src 21 | packages = find: 22 | python_requires = >=3.8 23 | install_requires = 24 | praw 25 | verboselogs 26 | requests 27 | colorama 28 | coloredlogs 29 | youtube_dl 30 | tqdm 31 | ffmpeg_python 32 | beautifulsoup4 33 | PyYAML 34 | 35 | [options.packages.find] 36 | where = src 37 | 38 | [options.entry_points] 39 | console_scripts = 40 | saveddit = saveddit.saveddit:main -------------------------------------------------------------------------------- /src/saveddit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/p-ranav/saveddit/f4aa0749eec1020bb9927c6dd7fd5059a6d989af/src/saveddit/__init__.py -------------------------------------------------------------------------------- /src/saveddit/_version.py: -------------------------------------------------------------------------------- 1 
| __version__ = "2.2.1" -------------------------------------------------------------------------------- /src/saveddit/configuration.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Union 3 | import yaml 4 | import pathlib 5 | import colorama 6 | import sys 7 | 8 | 9 | class ConfigurationLoader: 10 | PURPLE = colorama.Fore.MAGENTA 11 | WHITE = colorama.Style.RESET_ALL 12 | RED = colorama.Fore.RED 13 | 14 | @staticmethod 15 | def load(path): 16 | """ 17 | Loads Saveddit configuration from a configuration file. 18 | If the file is not found, create one and exit. 19 | 20 | Arguments: 21 | path: path to user_config.yaml file 22 | 23 | Returns: 24 | A Python dictionary with Saveddit configuration info 25 | """ 26 | 27 | def _create_config(_path): 28 | _STD_CONFIG = { 29 | "reddit_client_id": "", 30 | "reddit_client_secret": "", 31 | "reddit_username": "", 32 | "imgur_client_id": "", 33 | } 34 | with open(_path, "x") as _f: 35 | yaml.dump(_STD_CONFIG, _f) 36 | sys.exit(0) 37 | 38 | # Explicitly converting path to POSIX-like path (to avoid '\\' hell) 39 | print( 40 | "{notice}Retrieving configuration from {path} file{white}".format( 41 | path=path, 42 | notice=ConfigurationLoader.PURPLE, 43 | white=ConfigurationLoader.WHITE, 44 | ) 45 | ) 46 | path = pathlib.Path(path).absolute().as_posix() 47 | 48 | # Check if file exists. If not, create one and fill it with std config template 49 | if not os.path.exists(path): 50 | print( 51 | "{red}No configuration file found.\nCreating one. Would you like to edit it now?\n > Choose {purple}Y{red} for yes and {purple}N{red} for no.{white}".format( 52 | red=ConfigurationLoader.RED, 53 | path=path, 54 | white=ConfigurationLoader.WHITE, 55 | purple=ConfigurationLoader.PURPLE, 56 | ) 57 | ) 58 | getchoice = str(input("> ")) 59 | if getchoice == "Y": 60 | reddit_client = str(input("Reddit Client ID: ")) 61 | reddit_client_sec = str(input("Reddit Client Secret: ")) 62 | reddit_user = str(input("Reddit Username: ")) 63 | imgur_client = str(input("Imgur Client ID: ")) 64 | STD_CONFIG = { 65 | "reddit_client_id": "{}".format(reddit_client), 66 | "reddit_client_secret": "{}".format(reddit_client_sec), 67 | "reddit_username": "{}".format(reddit_user), 68 | "imgur_client_id": "{}".format(imgur_client), 69 | } 70 | with open(path, "x") as f: 71 | yaml.dump(STD_CONFIG, f) 72 | sys.exit(0) 73 | elif getchoice == "N": 74 | print( 75 | "{red}Alright.\nPlease edit {path} with valid credentials.\nExiting{white}".format( 76 | red=ConfigurationLoader.RED, 77 | path=path, 78 | white=ConfigurationLoader.WHITE, 79 | ) 80 | ) 81 | _create_config(path) 82 | else: 83 | print("Invalid choice.") 84 | exit() 85 | 86 | with open(path, "r") as _f: 87 | return yaml.safe_load(_f.read()) 88 | -------------------------------------------------------------------------------- /src/saveddit/multireddit_downloader.py: -------------------------------------------------------------------------------- 1 | import coloredlogs 2 | from colorama import Fore, Style 3 | from datetime import datetime, timezone 4 | import logging 5 | import verboselogs 6 | import getpass 7 | import json 8 | import os 9 | import praw 10 | from pprint import pprint 11 | import re 12 | from saveddit.submission_downloader import SubmissionDownloader 13 | from saveddit.subreddit_downloader import SubredditDownloader 14 | from saveddit.multireddit_downloader_config import MultiredditDownloaderConfig 15 | import sys 16 | from tqdm import tqdm 17 | 18 | class
MultiredditDownloader: 19 | config = SubredditDownloader.config 20 | REDDIT_CLIENT_ID = config['reddit_client_id'] 21 | REDDIT_CLIENT_SECRET = config['reddit_client_secret'] 22 | IMGUR_CLIENT_ID = config['imgur_client_id'] 23 | 24 | def __init__(self, multireddit_names): 25 | self.logger = verboselogs.VerboseLogger(__name__) 26 | level_styles = { 27 | 'critical': {'bold': True, 'color': 'red'}, 28 | 'debug': {'color': 'green'}, 29 | 'error': {'color': 'red'}, 30 | 'info': {'color': 'white'}, 31 | 'notice': {'color': 'magenta'}, 32 | 'spam': {'color': 'white', 'faint': True}, 33 | 'success': {'bold': True, 'color': 'green'}, 34 | 'verbose': {'color': 'blue'}, 35 | 'warning': {'color': 'yellow'} 36 | } 37 | coloredlogs.install(level='SPAM', logger=self.logger, 38 | fmt='%(message)s', level_styles=level_styles) 39 | 40 | self.reddit = praw.Reddit( 41 | client_id=MultiredditDownloader.REDDIT_CLIENT_ID, 42 | client_secret=MultiredditDownloader.REDDIT_CLIENT_SECRET, 43 | user_agent="saveddit (by /u/p_ranav)" 44 | ) 45 | 46 | self.multireddit_name = "+".join(multireddit_names) 47 | self.multireddit = self.reddit.subreddit(self.multireddit_name) 48 | 49 | def download(self, output_path, categories=MultiredditDownloaderConfig.DEFAULT_CATEGORIES, post_limit=MultiredditDownloaderConfig.DEFAULT_POST_LIMIT, skip_videos=False, skip_meta=False, skip_comments=False, comment_limit=0): 50 | ''' 51 | categories: List of categories within the multireddit to download (see MultiredditDownloaderConfig.DEFAULT_CATEGORIES) 52 | post_limit: Number of posts to download (default: None, i.e., all posts) 53 | comment_limit: Number of comment levels to download from submission (default: `0`, i.e., only top-level comments) 54 | - to get all comments, set comment_limit to `None` 55 | ''' 56 | 57 | multireddit_dir_name = self.multireddit_name 58 | if len(multireddit_dir_name) > 64: 59 | multireddit_dir_name = multireddit_dir_name[0:63] 60 | multireddit_dir_name += "..." 
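        # NOTE: names longer than 64 characters are cut to 63 and suffixed with
        # "..." so that long combined multireddit names (joined with '+') stay
        # manageable as directory names.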
61 | 62 | root_dir = os.path.join(os.path.join(os.path.join( 63 | output_path, "www.reddit.com"), "m"), multireddit_dir_name) 64 | 65 | 66 | for c in categories: 67 | self.logger.notice("Downloading from /m/" + 68 | self.multireddit_name + "/" + c + "/") 69 | category_dir = os.path.join(root_dir, c) 70 | if not os.path.exists(category_dir): 71 | os.makedirs(category_dir) 72 | category_function = getattr(self.multireddit, c) 73 | 74 | for i, submission in enumerate(category_function(limit=post_limit)): 75 | SubmissionDownloader(submission, i, self.logger, category_dir, 76 | skip_videos, skip_meta, skip_comments, comment_limit, 77 | {'imgur_client_id': MultiredditDownloader.IMGUR_CLIENT_ID}) 78 | -------------------------------------------------------------------------------- /src/saveddit/multireddit_downloader_config.py: -------------------------------------------------------------------------------- 1 | class MultiredditDownloaderConfig: 2 | DEFAULT_CATEGORIES = ["hot", "new", "random_rising", "rising", 3 | "controversial", "top", "gilded"] 4 | DEFAULT_POST_LIMIT = None -------------------------------------------------------------------------------- /src/saveddit/saveddit.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | from saveddit.multireddit_downloader_config import MultiredditDownloaderConfig 4 | from saveddit.search_config import SearchConfig 5 | from saveddit.subreddit_downloader_config import SubredditDownloaderConfig 6 | from saveddit.user_downloader_config import UserDownloaderConfig 7 | from saveddit._version import __version__ 8 | 9 | 10 | def asciiart(): 11 | return r''' .___ .___.__ __ 12 | ___________ ___ __ ____ __| _/__| _/|__|/ |_ 13 | / ___/\__ \\ \/ // __ \ / __ |/ __ | | \ __\ 14 | \___ \ / __ \\ /\ ___// /_/ / /_/ | | || | 15 | /____ >(____ /\_/ \___ >____ \____ | |__||__| 16 | \/ \/ \/ \/ \/ 17 | 18 | Downloader for Reddit 19 | version : ''' + __version__ + ''' 20 | URL : https://github.com/p-ranav/saveddit 21 | ''' 22 | 23 | 24 | def check_positive(value): 25 | ivalue = int(value) 26 | if ivalue <= 0: 27 | raise argparse.ArgumentTypeError( 28 | "%s is an invalid positive int value" % value) 29 | return ivalue 30 | 31 | class UniqueAppendAction(argparse.Action): 32 | ''' 33 | Class used to discard duplicates in list arguments, preserving their original order 34 | https://stackoverflow.com/questions/9376670/python-argparse-force-a-list-item-to-be-unique 35 | ''' 36 | def __call__(self, parser, namespace, values, option_string=None): 37 | unique_values = list(dict.fromkeys(values)) # dict.fromkeys dedupes without reordering 38 | setattr(namespace, self.dest, unique_values) 39 | 40 | def main(): 41 | argv = sys.argv[1:] 42 | 43 | parser = argparse.ArgumentParser(prog="saveddit") 44 | parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + __version__) 45 | 46 | subparsers = parser.add_subparsers(dest="subparser_name") 47 | 48 | subreddit_parser = subparsers.add_parser('subreddit') 49 | subreddit_parser.add_argument('subreddits', 50 | metavar='subreddits', 51 | nargs='+', 52 | action=UniqueAppendAction, 53 | help='Names of subreddits to download, e.g., AskReddit') 54 | subreddit_parser.add_argument('-f', 55 | metavar='categories', 56 | default=SubredditDownloaderConfig.DEFAULT_CATEGORIES, 57 | nargs='+', 58 | action=UniqueAppendAction, 59 | help='Categories of posts to download (default: %(default)s)') 60 | subreddit_parser.add_argument('-l', 61 | default=SubredditDownloaderConfig.DEFAULT_POST_LIMIT, 62 | metavar='post_limit', 63 |
type=check_positive, 64 | help='Limit the number of submissions downloaded in each category (default: %(default)s, i.e., all submissions)') 65 | subreddit_parser.add_argument('--skip-comments', 66 | default=False, 67 | action='store_true', 68 | help='When true, saveddit will not save comments to a comments.json file') 69 | subreddit_parser.add_argument('--skip-meta', 70 | default=False, 71 | action='store_true', 72 | help='When true, saveddit will not save meta to a submission.json file on submissions') 73 | subreddit_parser.add_argument('--skip-videos', 74 | default=False, 75 | action='store_true', 76 | help='When true, saveddit will not download videos (e.g., gfycat, redgifs, youtube, v.redd.it links)') 77 | subreddit_parser.add_argument('--all-comments', 78 | default=False, 79 | action='store_true', 80 | help='When true, saveddit will download all the comments in a post instead of just the top ones.') 81 | subreddit_parser.add_argument('-o', 82 | required=True, 83 | type=str, 84 | metavar='output_path', 85 | help='Directory where saveddit will save downloaded content' 86 | ) 87 | 88 | multireddit_parser = subparsers.add_parser('multireddit') 89 | multireddit_parser.add_argument('subreddits', 90 | metavar='subreddits', 91 | nargs='+', 92 | action=UniqueAppendAction, 93 | help='Names of subreddits to download, e.g., aww, pics. The downloads will be stored in <output_path>/www.reddit.com/m/aww+pics/.') 94 | multireddit_parser.add_argument('-f', 95 | metavar='categories', 96 | default=MultiredditDownloaderConfig.DEFAULT_CATEGORIES, 97 | nargs='+', 98 | action=UniqueAppendAction, 99 | help='Categories of posts to download (default: %(default)s)') 100 | multireddit_parser.add_argument('-l', 101 | default=MultiredditDownloaderConfig.DEFAULT_POST_LIMIT, 102 | metavar='post_limit', 103 | type=check_positive, 104 | help='Limit the number of submissions downloaded in each category (default: %(default)s, i.e., all submissions)') 105 | multireddit_parser.add_argument('--skip-comments', 106 | default=False, 107 | action='store_true', 108 | help='When true, saveddit will not save comments to a comments.json file') 109 | multireddit_parser.add_argument('--skip-meta', 110 | default=False, 111 | action='store_true', 112 | help='When true, saveddit will not save meta to a submission.json file on submissions') 113 | multireddit_parser.add_argument('--skip-videos', 114 | default=False, 115 | action='store_true', 116 | help='When true, saveddit will not download videos (e.g., gfycat, redgifs, youtube, v.redd.it links)') 117 | multireddit_parser.add_argument('-o', 118 | required=True, 119 | type=str, 120 | metavar='output_path', 121 | help='Directory where saveddit will save downloaded content' 122 | ) 123 | 124 | search_parser = subparsers.add_parser('search') 125 | search_parser.add_argument('subreddits', 126 | metavar='subreddits', 127 | nargs='+', 128 | action=UniqueAppendAction, 129 | help='Names of subreddits to search, e.g., all, aww, pics') 130 | search_parser.add_argument('-q', 131 | metavar='query', 132 | required=True, 133 | help='Search query string') 134 | search_parser.add_argument('-s', 135 | metavar='sort', 136 | default=SearchConfig.DEFAULT_SORT, 137 | choices=SearchConfig.DEFAULT_SORT_CATEGORIES, 138 | help='Sort to apply on search (default: %(default)s, choices: [%(choices)s])') 139 | search_parser.add_argument('-t', 140 | metavar='time_filter', 141 | default=SearchConfig.DEFAULT_TIME_FILTER, 142 | choices=SearchConfig.DEFAULT_TIME_FILTER_CATEGORIES, 143 | help='Time filter to apply on search (default:
%(default)s, choices: [%(choices)s])') 144 | search_parser.add_argument('--include-nsfw', 145 | default=False, 146 | action='store_true', 147 | help='When true, saveddit will include NSFW results in search') 148 | search_parser.add_argument('--skip-comments', 149 | default=False, 150 | action='store_true', 151 | help='When true, saveddit will not save comments to a comments.json file') 152 | search_parser.add_argument('--skip-meta', 153 | default=False, 154 | action='store_true', 155 | help='When true, saveddit will not save meta to a submission.json file on submissions') 156 | search_parser.add_argument('--skip-videos', 157 | default=False, 158 | action='store_true', 159 | help='When true, saveddit will not download videos (e.g., gfycat, redgifs, youtube, v.redd.it links)') 160 | search_parser.add_argument('-o', 161 | required=True, 162 | type=str, 163 | metavar='output_path', 164 | help='Directory where saveddit will save downloaded content' 165 | ) 166 | 167 | user_parser = subparsers.add_parser('user') 168 | user_parser.add_argument('users', 169 | metavar='users', 170 | nargs='+', 171 | help='Names of users to download, e.g., Poem_for_your_sprog') 172 | 173 | 174 | user_subparsers = user_parser.add_subparsers(dest="user_subparser_name") 175 | user_subparsers.required = True 176 | 177 | # user.saved subparser 178 | saved_parser = user_subparsers.add_parser('saved') 179 | saved_parser.add_argument('--skip-meta', 180 | default=False, 181 | action='store_true', 182 | help='When true, saveddit will not save meta to a submission.json file on submissions') 183 | saved_parser.add_argument('--skip-comments', 184 | default=False, 185 | action='store_true', 186 | help='When true, saveddit will not save comments to a comments.json file') 187 | saved_parser.add_argument('--skip-videos', 188 | default=False, 189 | action='store_true', 190 | help='When true, saveddit will not download videos (e.g., gfycat, redgifs, youtube, v.redd.it links)') 191 | saved_parser.add_argument('-l', 192 | default=UserDownloaderConfig.DEFAULT_POST_LIMIT, 193 | metavar='post_limit', 194 | type=check_positive, 195 | help='Limit the number of saved submissions downloaded (default: %(default)s, i.e., all submissions)') 196 | saved_parser.add_argument('-o', 197 | required=True, 198 | type=str, 199 | metavar='output_path', 200 | help='Directory where saveddit will save downloaded content' 201 | ) 202 | 203 | # user.gilded subparser 204 | gilded_parser = user_subparsers.add_parser('gilded') 205 | gilded_parser.add_argument('--skip-meta', 206 | default=False, 207 | action='store_true', 208 | help='When true, saveddit will not save meta to a submission.json file on submissions') 209 | gilded_parser.add_argument('--skip-comments', 210 | default=False, 211 | action='store_true', 212 | help='When true, saveddit will not save comments to a comments.json file') 213 | gilded_parser.add_argument('--skip-videos', 214 | default=False, 215 | action='store_true', 216 | help='When true, saveddit will not download videos (e.g., gfycat, redgifs, youtube, v.redd.it links)') 217 | gilded_parser.add_argument('-l', 218 | default=UserDownloaderConfig.DEFAULT_POST_LIMIT, 219 | metavar='post_limit', 220 | type=check_positive, 221 | help='Limit the number of gilded submissions downloaded (default: %(default)s, i.e., all submissions)') 222 | gilded_parser.add_argument('-o', 223 | required=True, 224 | type=str, 225 | metavar='output_path', 226 | help='Directory where saveddit will save downloaded content' 227 | ) 228 | 229 | # user.submitted subparser
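    # (downloads a user's submitted posts; like the other user subcommands it takes -l, -o, and the --skip-* flags)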
230 | submitted_parser = user_subparsers.add_parser('submitted') 231 | submitted_parser.add_argument('-s', 232 | metavar='sort', 233 | default=UserDownloaderConfig.DEFAULT_SORT, 234 | choices=UserDownloaderConfig.DEFAULT_SORT_OPTIONS, 235 | help='Download submissions sorted by this option (default: %(default)s, choices: [%(choices)s])') 236 | submitted_parser.add_argument('--skip-comments', 237 | default=False, 238 | action='store_true', 239 | help='When true, saveddit will not save comments to a comments.json file for the submissions') 240 | submitted_parser.add_argument('--skip-meta', 241 | default=False, 242 | action='store_true', 243 | help='When true, saveddit will not save meta to a submission.json file on submissions') 244 | submitted_parser.add_argument('--skip-videos', 245 | default=False, 246 | action='store_true', 247 | help='When true, saveddit will not download videos (e.g., gfycat, redgifs, youtube, v.redd.it links)') 248 | submitted_parser.add_argument('-l', 249 | default=UserDownloaderConfig.DEFAULT_POST_LIMIT, 250 | metavar='post_limit', 251 | type=check_positive, 252 | help='Limit the number of submissions downloaded (default: %(default)s, i.e., all submissions)') 253 | submitted_parser.add_argument('-o', 254 | required=True, 255 | type=str, 256 | metavar='output_path', 257 | help='Directory where saveddit will save downloaded posts' 258 | ) 259 | 260 | # user.multireddits subparser 261 | multireddits_parser = user_subparsers.add_parser('multireddits') 262 | multireddits_parser.add_argument('-n', 263 | metavar='names', 264 | default=None, 265 | nargs='+', 266 | action=UniqueAppendAction, 267 | help='Names of specific multireddits to download (default: %(default)s, i.e., all multireddits for this user)') 268 | multireddits_parser.add_argument('-f', 269 | metavar='categories', 270 | default=UserDownloaderConfig.DEFAULT_CATEGORIES, 271 | nargs='+', 272 | action=UniqueAppendAction, 273 | help='Categories of posts to download (default: %(default)s)') 274 | multireddits_parser.add_argument('--skip-comments', 275 | default=False, 276 | action='store_true', 277 | help='When true, saveddit will not save comments to a comments.json file for the submissions') 278 | multireddits_parser.add_argument('--skip-meta', 279 | default=False, 280 | action='store_true', 281 | help='When true, saveddit will not save meta to a submission.json file on submissions') 282 | multireddits_parser.add_argument('--skip-videos', 283 | default=False, 284 | action='store_true', 285 | help='When true, saveddit will not download videos (e.g., gfycat, redgifs, youtube, v.redd.it links)') 286 | multireddits_parser.add_argument('-l', 287 | default=UserDownloaderConfig.DEFAULT_POST_LIMIT, 288 | metavar='post_limit', 289 | type=check_positive, 290 | help='Limit the number of submissions downloaded (default: %(default)s, i.e., all submissions)') 291 | multireddits_parser.add_argument('-o', 292 | required=True, 293 | type=str, 294 | metavar='output_path', 295 | help='Directory where saveddit will save downloaded posts' 296 | ) 297 | 298 | # user.upvoted subparser 299 | upvoted_parser = user_subparsers.add_parser('upvoted') 300 | upvoted_parser.add_argument('--skip-comments', 301 | default=False, 302 | action='store_true', 303 | help='When true, saveddit will not save comments to a comments.json file for the upvoted submissions') 304 | upvoted_parser.add_argument('--skip-meta', 305 | default=False, 306 | action='store_true', 307 | help='When true, saveddit will not save meta to a submission.json file on upvoted submissions') 308 |
upvoted_parser.add_argument('--skip-videos', 309 | default=False, 310 | action='store_true', 311 | help='When true, saveddit will not download videos (e.g., gfycat, redgifs, youtube, v.redd.it links)') 312 | upvoted_parser.add_argument('-l', 313 | default=UserDownloaderConfig.DEFAULT_POST_LIMIT, 314 | metavar='post_limit', 315 | type=check_positive, 316 | help='Limit the number of submissions downloaded (default: %(default)s, i.e., all submissions)') 317 | upvoted_parser.add_argument('-o', 318 | required=True, 319 | type=str, 320 | metavar='output_path', 321 | help='Directory where saveddit will save downloaded posts' 322 | ) 323 | 324 | # user.comments subparser 325 | comments_parser = user_subparsers.add_parser('comments') 326 | comments_parser.add_argument('-s', 327 | metavar='sort', 328 | default=UserDownloaderConfig.DEFAULT_SORT, 329 | choices=UserDownloaderConfig.DEFAULT_SORT_OPTIONS, 330 | help='Download comments sorted by this option (default: %(default)s, choices: [%(choices)s])') 331 | comments_parser.add_argument('-l', 332 | default=UserDownloaderConfig.DEFAULT_COMMENT_LIMIT, 333 | metavar='post_limit', 334 | type=check_positive, 335 | help='Limit the number of comments downloaded (default: %(default)s, i.e., all comments)') 336 | comments_parser.add_argument('-o', 337 | required=True, 338 | type=str, 339 | metavar='output_path', 340 | help='Directory where saveddit will save downloaded comments' 341 | ) 342 | 343 | args = parser.parse_args(argv) 344 | print(asciiart()) 345 | 346 | if args.subparser_name == "subreddit": 347 | from saveddit.subreddit_downloader import SubredditDownloader 348 | for subreddit in args.subreddits: 349 | downloader = SubredditDownloader(subreddit) 350 | downloader.download(args.o, 351 | download_all_comments=args.all_comments, categories=args.f, post_limit=args.l, skip_videos=args.skip_videos, skip_meta=args.skip_meta, skip_comments=args.skip_comments) 352 | elif args.subparser_name == "multireddit": 353 | from saveddit.multireddit_downloader import MultiredditDownloader 354 | downloader = MultiredditDownloader(args.subreddits) 355 | downloader.download(args.o, 356 | categories=args.f, post_limit=args.l, skip_videos=args.skip_videos, skip_meta=args.skip_meta, skip_comments=args.skip_comments) 357 | elif args.subparser_name == "search": 358 | from saveddit.search_subreddits import SearchSubreddits 359 | downloader = SearchSubreddits(args.subreddits) 360 | downloader.download(args) 361 | elif args.subparser_name == "user": 362 | from saveddit.user_downloader import UserDownloader 363 | downloader = UserDownloader() 364 | downloader.download_user_meta(args) 365 | if args.user_subparser_name == "comments": 366 | downloader.download_comments(args) 367 | elif args.user_subparser_name == "multireddits": 368 | downloader.download_multireddits(args) 369 | elif args.user_subparser_name == "submitted": 370 | downloader.download_submitted(args) 371 | elif args.user_subparser_name == "saved": 372 | downloader.download_saved(args) 373 | elif args.user_subparser_name == "upvoted": 374 | downloader.download_upvoted(args) 375 | elif args.user_subparser_name == "gilded": 376 | downloader.download_gilded(args) 377 | else: 378 | parser.print_help() 379 | 380 | if __name__ == "__main__": 381 | main() 382 | -------------------------------------------------------------------------------- /src/saveddit/search_config.py: -------------------------------------------------------------------------------- 1 | class SearchConfig: 2 | DEFAULT_SORT = "relevance" 3 | 
DEFAULT_SORT_CATEGORIES = ["relevance", "hot", "top", "new", "comments"] 4 | DEFAULT_SYNTAX = "lucene" 5 | DEFAULT_SYNTAX_CATEGORIES = ["cloudsearch", "lucene", "plain"] 6 | DEFAULT_TIME_FILTER = "all" 7 | DEFAULT_TIME_FILTER_CATEGORIES = ["all", "day", "hour", "month", "week", "year"] -------------------------------------------------------------------------------- /src/saveddit/search_subreddits.py: -------------------------------------------------------------------------------- 1 | import coloredlogs 2 | from colorama import Fore, Style 3 | from datetime import datetime, timezone 4 | import logging 5 | import verboselogs 6 | import getpass 7 | import json 8 | import os 9 | import praw 10 | from pprint import pprint 11 | import re 12 | from saveddit.submission_downloader import SubmissionDownloader 13 | from saveddit.subreddit_downloader import SubredditDownloader 14 | from saveddit.search_config import SearchConfig 15 | import sys 16 | from tqdm import tqdm 17 | 18 | class SearchSubreddits: 19 | config = SubredditDownloader.config 20 | REDDIT_CLIENT_ID = config['reddit_client_id'] 21 | REDDIT_CLIENT_SECRET = config['reddit_client_secret'] 22 | IMGUR_CLIENT_ID = config['imgur_client_id'] 23 | 24 | REDDIT_USERNAME = None 25 | try: 26 | REDDIT_USERNAME = config['reddit_username'] 27 | except Exception as e: 28 | pass 29 | 30 | REDDIT_PASSWORD = None 31 | if REDDIT_USERNAME: 32 | if sys.stdin.isatty(): 33 | print("Username: " + REDDIT_USERNAME) 34 | REDDIT_PASSWORD = getpass.getpass("Password: ") 35 | else: 36 | # echo "foobar" > password 37 | # saveddit user .... < password 38 | REDDIT_PASSWORD = sys.stdin.readline().rstrip() 39 | 40 | def __init__(self, subreddit_names): 41 | self.logger = verboselogs.VerboseLogger(__name__) 42 | level_styles = { 43 | 'critical': {'bold': True, 'color': 'red'}, 44 | 'debug': {'color': 'green'}, 45 | 'error': {'color': 'red'}, 46 | 'info': {'color': 'white'}, 47 | 'notice': {'color': 'magenta'}, 48 | 'spam': {'color': 'white', 'faint': True}, 49 | 'success': {'bold': True, 'color': 'green'}, 50 | 'verbose': {'color': 'blue'}, 51 | 'warning': {'color': 'yellow'} 52 | } 53 | coloredlogs.install(level='SPAM', logger=self.logger, 54 | fmt='%(message)s', level_styles=level_styles) 55 | 56 | if not SearchSubreddits.REDDIT_USERNAME: 57 | self.logger.error("`reddit_username` in user_config.yaml is empty") 58 | self.logger.error("If you plan on using the user API of saveddit, then add your username to user_config.yaml") 59 | print("Exiting now") 60 | exit() 61 | else: 62 | if not len(SearchSubreddits.REDDIT_PASSWORD): 63 | if sys.stdin.isatty(): 64 | print("Username: " + SearchSubreddits.REDDIT_USERNAME) 65 | SearchSubreddits.REDDIT_PASSWORD = getpass.getpass("Password: ") 66 | else: 67 | # echo "foobar" > password 68 | # saveddit user ....
--------------------------------------------------------------------------------
/src/saveddit/search_subreddits.py:
--------------------------------------------------------------------------------
1 | import coloredlogs
2 | from colorama import Fore, Style
3 | from datetime import datetime, timezone
4 | import logging
5 | import verboselogs
6 | import getpass
7 | import json
8 | import os
9 | import praw
10 | from pprint import pprint
11 | import re
12 | from saveddit.submission_downloader import SubmissionDownloader
13 | from saveddit.subreddit_downloader import SubredditDownloader
14 | from saveddit.search_config import SearchConfig
15 | import sys
16 | from tqdm import tqdm
17 | 
18 | class SearchSubreddits:
19 |     config = SubredditDownloader.config
20 |     REDDIT_CLIENT_ID = config['reddit_client_id']
21 |     REDDIT_CLIENT_SECRET = config['reddit_client_secret']
22 |     IMGUR_CLIENT_ID = config['imgur_client_id']
23 | 
24 |     REDDIT_USERNAME = None
25 |     try:
26 |         REDDIT_USERNAME = config['reddit_username']
27 |     except Exception as e:
28 |         pass
29 | 
30 |     REDDIT_PASSWORD = None
31 |     if REDDIT_USERNAME:
32 |         if sys.stdin.isatty():
33 |             print("Username: " + REDDIT_USERNAME)
34 |             REDDIT_PASSWORD = getpass.getpass("Password: ")
35 |         else:
36 |             # echo "foobar" > password
37 |             # saveddit user .... < password
38 |             REDDIT_PASSWORD = sys.stdin.readline().rstrip()
39 | 
40 |     def __init__(self, subreddit_names):
41 |         self.logger = verboselogs.VerboseLogger(__name__)
42 |         level_styles = {
43 |             'critical': {'bold': True, 'color': 'red'},
44 |             'debug': {'color': 'green'},
45 |             'error': {'color': 'red'},
46 |             'info': {'color': 'white'},
47 |             'notice': {'color': 'magenta'},
48 |             'spam': {'color': 'white', 'faint': True},
49 |             'success': {'bold': True, 'color': 'green'},
50 |             'verbose': {'color': 'blue'},
51 |             'warning': {'color': 'yellow'}
52 |         }
53 |         coloredlogs.install(level='SPAM', logger=self.logger,
54 |                             fmt='%(message)s', level_styles=level_styles)
55 | 
56 |         if not SearchSubreddits.REDDIT_USERNAME:
57 |             self.logger.error("`reddit_username` in user_config.yaml is empty")
58 |             self.logger.error("If you plan on using the user API of saveddit, then add your username to user_config.yaml")
59 |             print("Exiting now")
60 |             exit()
61 |         else:
62 |             if not len(SearchSubreddits.REDDIT_PASSWORD):
63 |                 if sys.stdin.isatty():
64 |                     print("Username: " + SearchSubreddits.REDDIT_USERNAME)
65 |                     SearchSubreddits.REDDIT_PASSWORD = getpass.getpass("Password: ")
66 |                 else:
67 |                     # echo "foobar" > password
68 |                     # saveddit user .... < password
69 |                     SearchSubreddits.REDDIT_PASSWORD = sys.stdin.readline().rstrip()
70 | 
71 |         self.reddit = praw.Reddit(
72 |             client_id=SearchSubreddits.REDDIT_CLIENT_ID,
73 |             client_secret=SearchSubreddits.REDDIT_CLIENT_SECRET,
74 |             user_agent="saveddit (by /u/p_ranav)"
75 |         )
76 | 
77 |         self.multireddit_name = "+".join(subreddit_names)
78 |         self.subreddit = self.reddit.subreddit(self.multireddit_name)
79 | 
80 |     def download(self, args):
81 |         output_path = args.o
82 |         query = args.q
83 |         sort = args.s
84 |         syntax = SearchConfig.DEFAULT_SYNTAX
85 |         time_filter = args.t
86 |         include_nsfw = args.include_nsfw
87 |         skip_comments = args.skip_comments
88 |         skip_videos = args.skip_videos
89 |         skip_meta = args.skip_meta
90 |         comment_limit = 0  # top-level comments ONLY
91 | 
92 |         self.logger.verbose("Searching '" + query + "' in " + self.multireddit_name + ", sorted by " + sort)
93 |         if include_nsfw:
94 |             self.logger.spam(" * Including NSFW results")
95 | 
96 |         search_dir = os.path.join(output_path, "www.reddit.com", "q",
97 |                                   query, self.multireddit_name, sort)
98 | 
99 |         if not os.path.exists(search_dir):
100 |             os.makedirs(search_dir)
101 | 
102 |         search_results = None
103 |         if include_nsfw:
104 |             search_params = {"include_over_18": "on"}
105 |             search_results = self.subreddit.search(query, sort, syntax, time_filter, params=search_params)
106 |         else:
107 |             search_results = self.subreddit.search(query, sort, syntax, time_filter)
108 | 
109 |         results_found = False
110 |         for i, submission in enumerate(search_results):
111 |             if not results_found:
112 |                 results_found = True
113 |             SubmissionDownloader(submission, i, self.logger, search_dir,
114 |                                  skip_videos, skip_meta, skip_comments, comment_limit,
115 |                                  {'imgur_client_id': SearchSubreddits.IMGUR_CLIENT_ID})
116 | 
117 |         if not results_found:
118 |             self.logger.spam(" * No results found")
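119 | 
120 | # Minimal illustrative driver (the field names follow the argparse dests read
121 | # by download() above; assumes a populated ~/.saveddit/user_config.yaml):
122 | #
123 | #   from argparse import Namespace
124 | #   searcher = SearchSubreddits(["pics", "aww"])
125 | #   searcher.download(Namespace(o="/tmp/saveddit", q="cats", s="relevance",
126 | #                               t="all", include_nsfw=False, skip_comments=True,
127 | #                               skip_videos=True, skip_meta=False))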
--------------------------------------------------------------------------------
/src/saveddit/submission_downloader.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | import coloredlogs
3 | from colorama import Fore
4 | import contextlib
5 | import logging
6 | import verboselogs
7 | from datetime import datetime
8 | import os
9 | from io import StringIO
10 | import json
11 | import mimetypes
12 | import ffmpeg
13 | import praw
14 | from pprint import pprint
15 | import re
16 | import requests
17 | from tqdm import tqdm
18 | import urllib.request
19 | import youtube_dl
20 | 
21 | 
22 | 
23 | class SubmissionDownloader:
24 |     def __init__(self, submission, submission_index, logger, output_dir, skip_videos, skip_meta, skip_comments, comment_limit, config):
25 |         self.IMGUR_CLIENT_ID = config["imgur_client_id"]
26 | 
27 |         self.logger = logger
28 |         i = submission_index
29 |         prefix_str = '#' + str(i).zfill(3) + ' '
30 |         self.indent_1 = ' ' * len(prefix_str) + "* "
31 |         self.indent_2 = ' ' * len(self.indent_1) + "- "
32 | 
33 |         has_url = getattr(submission, "url", None)
34 |         if has_url:
35 |             title = submission.title
36 |             self.logger.verbose(prefix_str + '"' + title + '"')
37 |             title = re.sub(r'\W+', '_', title)
38 | 
39 |             # Truncate title
40 |             if len(title) > 32:
41 |                 title = title[0:32]
42 |                 if os.name == "nt":
43 |                     pass  # on Windows, skip the "..." suffix to keep paths shorter
44 |                 else:
45 |                     title += "..."
46 | 
47 |             # Prepare directory for the submission
48 |             post_dir = str(i).zfill(3) + "_" + title.replace(" ", "_")
49 |             submission_dir = os.path.join(output_dir, post_dir)
50 |             if not os.path.exists(submission_dir):
51 |                 os.makedirs(submission_dir)
52 |             else:
53 |                 print("Directory already exists, skipping this submission.")
54 |                 return
55 | 
56 |             self.logger.spam(
57 |                 self.indent_1 + "Processing `" + submission.url + "`")
58 | 
59 |             success = False
60 | 
61 |             should_create_files_dir = True
62 |             if skip_comments and skip_meta:
63 |                 should_create_files_dir = False
64 | 
65 |             def create_files_dir(submission_dir):
66 |                 if should_create_files_dir:
67 |                     files_dir = os.path.join(submission_dir, "files")
68 |                     if not os.path.exists(files_dir):
69 |                         os.makedirs(files_dir)
70 |                     return files_dir
71 |                 else:
72 |                     return submission_dir
73 | 
74 |             if self.is_direct_link_to_content(submission.url, [".png", ".jpg", ".jpeg", ".gif"]):
75 |                 files_dir = create_files_dir(submission_dir)
76 | 
77 |                 filename = submission.url.split("/")[-1]
78 |                 self.logger.spam(
79 |                     self.indent_1 + "This is a direct link to a " + filename.split(".")[-1] + " file")
80 |                 save_path = os.path.join(files_dir, filename)
81 |                 self.download_direct_link(submission, save_path)
82 |                 success = True
83 |             elif self.is_direct_link_to_content(submission.url, [".mp4"]):
84 |                 filename = submission.url.split("/")[-1]
85 |                 self.logger.spam(
86 |                     self.indent_1 + "This is a direct link to a " + filename.split(".")[-1] + " file")
87 |                 if not skip_videos:
88 |                     files_dir = create_files_dir(submission_dir)
89 |                     save_path = os.path.join(files_dir, filename)
90 |                     self.download_direct_link(submission, save_path)
91 |                     success = True
92 |                 else:
93 |                     self.logger.spam(self.indent_1 + "Skipping download of video content")
94 |                     success = True
95 |             elif self.is_reddit_gallery(submission.url):
96 |                 files_dir = create_files_dir(submission_dir)
97 | 
98 |                 self.logger.spam(
99 |                     self.indent_1 + "This is a reddit gallery")
100 |                 self.download_reddit_gallery(submission, files_dir, skip_videos)
101 |                 success = True
102 |             elif self.is_reddit_video(submission.url):
103 |                 self.logger.spam(
104 |                     self.indent_1 + "This is a reddit video")
105 | 
106 |                 if not skip_videos:
107 |                     files_dir = create_files_dir(submission_dir)
108 |                     self.download_reddit_video(submission, files_dir)
109 |                     success = True
110 |                 else:
111 |                     self.logger.spam(self.indent_1 + "Skipping download of video content")
112 |                     success = True
113 |             elif self.is_gfycat_link(submission.url) or self.is_redgifs_link(submission.url):
114 |                 if self.is_gfycat_link(submission.url):
115 |                     self.logger.spam(
116 |                         self.indent_1 + "This is a gfycat link")
117 |                 else:
118 |                     self.logger.spam(
119 |                         self.indent_1 + "This is a redgif link")
120 | 
121 |                 if not skip_videos:
122 |                     files_dir = create_files_dir(submission_dir)
123 |                     self.download_gfycat_or_redgif(submission, files_dir)
124 |                     success = True
125 |                 else:
126 |                     self.logger.spam(self.indent_1 + "Skipping download of video content")
127 |                     success = True
128 |             elif self.is_imgur_album(submission.url):
129 |                 files_dir = create_files_dir(submission_dir)
130 | 
131 |                 self.logger.spam(
132 |                     self.indent_1 + "This is an imgur album")
133 |                 self.download_imgur_album(submission, files_dir)
134 |                 success = True
135 |             elif self.is_imgur_image(submission.url):
136 |                 files_dir = create_files_dir(submission_dir)
137 | 
138 |                 self.logger.spam(
139 |                     self.indent_1 + "This is an imgur image or video")
140 |                 self.download_imgur_image(submission, files_dir)
141 |                 success = True
142 |             elif self.is_self_post(submission):
143 |                 self.logger.spam(self.indent_1 + "This is a self-post")
144 |                 success = True
145 |             elif (not skip_videos) and (self.is_youtube_link(submission.url) or self.is_supported_by_youtubedl(submission.url)):
146 |                 if self.is_youtube_link(submission.url):
147 |                     self.logger.spam(
148 |                         self.indent_1 + "This is a youtube link")
149 |                 else:
150 |                     self.logger.spam(
151 |                         self.indent_1 + "This link is supported by a youtube-dl extractor")
152 | 
153 |                 if not skip_videos:
154 |                     files_dir = create_files_dir(submission_dir)
155 |                     self.download_youtube_video(submission.url, files_dir)
156 |                     success = True
157 |                 else:
158 |                     self.logger.spam(self.indent_1 + "Skipping download of video content")
159 |                     success = True
160 |             else:
161 |                 success = True
162 | 
163 |             # Download submission meta
164 |             if not skip_meta:
165 |                 self.logger.spam(self.indent_1 + "Saving submission.json")
166 |                 self.download_submission_meta(submission, submission_dir)
167 |             else:
168 |                 self.logger.spam(
169 |                     self.indent_1 + "Skipping submission meta")
170 | 
171 |             # Download comments if requested
172 |             if not skip_comments:
173 |                 if comment_limit is None:
174 |                     self.logger.spam(
175 |                         self.indent_1 + "Saving all comments to comments.json")
176 |                 else:
177 |                     self.logger.spam(
178 |                         self.indent_1 + "Saving top-level comments to comments.json")
179 |                 self.download_comments(
180 |                     submission, submission_dir, comment_limit)
181 |             else:
182 |                 self.logger.spam(
183 |                     self.indent_1 + "Skipping comments")
184 | 
185 |             if success:
186 |                 self.logger.spam(
187 |                     self.indent_1 + "Saved to " + submission_dir + "\n")
188 |             else:
189 |                 self.logger.warning(
190 |                     self.indent_1 + "Failed to download from link " + submission.url + "\n"
191 |                 )
192 | 
193 |     def print_formatted_error(self, e):
194 |         for line in str(e).split("\n"):
195 |             self.logger.error(self.indent_2 + line)
196 | 
197 |     def is_direct_link_to_content(self, url, supported_file_formats):
198 |         url_leaf = url.split("/")[-1]
199 |         return any([i in url_leaf for i in supported_file_formats]) and ".gifv" not in url_leaf
200 | 
201 |     def download_direct_link(self, submission, output_path):
202 |         try:
203 |             urllib.request.urlretrieve(submission.url, output_path)
204 |         except Exception as e:
205 |             self.print_formatted_error(e)
206 | 
207 |     def is_youtube_link(self, url):
208 |         return "youtube.com" in url or "youtu.be" in url
209 | 
210 |     def is_supported_by_youtubedl(self, url):
211 |         try:
212 |             # Since youtube-dl's quiet mode is anything BUT quiet,
213 |             # use contextlib to redirect stderr to a local StringIO variable
214 |             local_stderr = StringIO()
215 |             with contextlib.redirect_stderr(local_stderr):
216 |                 if "flickr.com/photos" in url:
217 |                     return False
218 | 
219 |                 # Try to extract info from URL
220 |                 try:
221 |                     download_options = {
222 |                         'quiet': True,
223 |                         'warnings': True,
224 |                         'ignoreerrors': True,
225 |                     }
226 |                     ydl = youtube_dl.YoutubeDL(download_options)
227 |                     r = ydl.extract_info(url, download=False)
228 |                 except Exception as e:
229 |                     # No media found through youtube-dl
230 |                     self.logger.spam(self.indent_2 + "No media found in '" + url + "' that could be downloaded with youtube-dl")
231 |                     return False
232 | 
233 |                 extractors = youtube_dl.extractor.gen_extractors()
234 |                 for e in extractors:
235 |                     if e.suitable(url) and e.IE_NAME != 'generic':
236 |                         return True
237 |                 self.logger.spam(self.indent_2 + "Only youtube-dl's generic extractor matches this link")
238 |                 self.logger.spam(self.indent_2 + "No media found in '" + url + "' that could be downloaded with 
youtube-dl") 239 | return False 240 | except Exception as e: 241 | return False 242 | 243 | def download_youtube_video(self, url, output_path): 244 | try: 245 | local_stderr = StringIO() 246 | with contextlib.redirect_stderr(local_stderr): 247 | download_options = { 248 | 'format': "299+bestaudio/298+bestaudio/137+bestaudio/136+bestaudio/best", 249 | 'quiet': True, 250 | 'warnings': True, 251 | 'ignoreerrors': True, 252 | 'nooverwrites': True, 253 | 'continuedl': True, 254 | 'outtmpl': output_path + '/%(id)s.%(ext)s' 255 | } 256 | self.logger.spam(self.indent_2 + "Downloading " + 257 | url + " with youtube-dl") 258 | with youtube_dl.YoutubeDL(download_options) as ydl: 259 | ydl.download([url]) 260 | errors = local_stderr.getvalue() 261 | if not len(errors): 262 | self.logger.spam(self.indent_2 + "Finished downloading video from " + 263 | url) 264 | else: 265 | self.logger.error(self.indent_2 + errors.strip()) 266 | except Exception as e: 267 | self.logger.error(self.indent_2 + "Failed to download with youtube-dl") 268 | self.print_formatted_error(e) 269 | 270 | def is_reddit_gallery(self, url): 271 | return "reddit.com/gallery" in url 272 | 273 | def download_reddit_gallery(self, submission, output_path, skip_videos): 274 | gallery_data = getattr(submission, "gallery_data", None) 275 | media_metadata = getattr(submission, "media_metadata", None) 276 | self.logger.spam( 277 | self.indent_2 + "Looking for submission.gallery_data and submission.media_metadata") 278 | 279 | if gallery_data == None and media_metadata == None: 280 | # gallery_data not in submission 281 | # could be a crosspost 282 | crosspost_parent_list = getattr( 283 | submission, "crosspost_parent_list", None) 284 | if crosspost_parent_list != None: 285 | self.logger.spam( 286 | self.indent_2 + "This is a crosspost to a reddit gallery") 287 | first_parent = crosspost_parent_list[0] 288 | gallery_data = first_parent["gallery_data"] 289 | media_metadata = first_parent["media_metadata"] 290 | 291 | if gallery_data != None and media_metadata != None: 292 | image_count = len(gallery_data["items"]) 293 | self.logger.spam(self.indent_2 + "This reddit gallery has " + 294 | str(image_count) + " images") 295 | for j, item in tqdm(enumerate(gallery_data["items"]), total=image_count, bar_format='%s%s{l_bar}{bar:20}{r_bar}%s' % (self.indent_2, Fore.WHITE + Fore.LIGHTBLACK_EX, Fore.RESET)): 296 | try: 297 | media_id = item["media_id"] 298 | item_metadata = media_metadata[media_id] 299 | item_format = item_metadata['m'] 300 | if "image/" in item_format or "video/" in item_format: 301 | if not os.path.exists(output_path): 302 | os.makedirs(output_path) 303 | if "image/" in item_format: 304 | item_format = item_format.split("image/")[-1] 305 | elif "video/" in item_format: 306 | item_format = item_format.split("video/")[-1] 307 | # Skip video content if requested by user 308 | if skip_videos: 309 | continue 310 | item_filename = media_id + "." 
+ item_format
311 |                             item_url = item_metadata["s"]["u"]
312 |                             save_path = os.path.join(output_path, item_filename)
313 |                             try:
314 |                                 urllib.request.urlretrieve(item_url, save_path)
315 |                             except Exception as e:
316 |                                 self.print_formatted_error(e)
317 |                 except Exception as e:
318 |                     self.print_formatted_error(e)
319 | 
320 |     def is_reddit_video(self, url):
321 |         return "v.redd.it" in url
322 | 
323 |     def download_reddit_video(self, submission, output_path):
324 |         media = getattr(submission, "media", None)
325 |         media_id = submission.url.split("v.redd.it/")[-1]
326 | 
327 |         self.logger.spam(self.indent_2 + "Looking for submission.media")
328 | 
329 |         if media is None:
330 |             # link might be a crosspost
331 |             crosspost_parent_list = getattr(
332 |                 submission, "crosspost_parent_list", None)
333 |             if crosspost_parent_list is not None:
334 |                 self.logger.spam(
335 |                     self.indent_2 + "This is a crosspost to a reddit video")
336 |                 first_parent = crosspost_parent_list[0]
337 |                 media = first_parent["media"]
338 | 
339 |         if media is not None:
340 |             self.logger.spam(self.indent_2 + "Downloading video component")
341 |             url = media["reddit_video"]["fallback_url"]
342 |             video_save_path = os.path.join(
343 |                 output_path, media_id + "_video.mp4")
344 |             try:
345 |                 urllib.request.urlretrieve(url, video_save_path)
346 |             except Exception as e:
347 |                 self.print_formatted_error(e)
348 | 
349 |             # Download the audio
350 |             self.logger.spam(self.indent_2 + "Downloading audio component")
351 |             audio_downloaded = False
352 |             audio_save_path = os.path.join(
353 |                 output_path, media_id + "_audio.mp4")
354 |             try:
355 |                 urllib.request.urlretrieve(
356 |                     submission.url + "/DASH_audio.mp4", audio_save_path)
357 |                 audio_downloaded = True
358 |             except Exception as e:
359 |                 pass
360 | 
361 |             if audio_downloaded:
362 |                 # Merge mp4 files
363 |                 self.logger.spam(
364 |                     self.indent_2 + "Merging video & audio components with ffmpeg")
365 |                 output_save_path = os.path.join(output_path, media_id + ".mp4")
366 |                 input_video = ffmpeg.input(video_save_path)
367 |                 input_audio = ffmpeg.input(audio_save_path)
368 |                 ffmpeg.concat(input_video, input_audio, v=1, a=1)\
369 |                     .output(output_save_path)\
370 |                     .global_args('-loglevel', 'error')\
371 |                     .global_args('-y')\
372 |                     .run()
373 |                 self.logger.spam(self.indent_2 + "Done merging with ffmpeg")
374 |             else:
375 |                 self.logger.spam(
376 |                     self.indent_2 + "This video does not have an audio component")
377 | 
378 |         self.logger.spam(
379 |             self.indent_2 + "Successfully saved video")
380 | 
381 |     def is_gfycat_link(self, url):
382 |         return "gfycat.com/" in url
383 | 
384 |     def is_redgifs_link(self, url):
385 |         return "redgifs.com/" in url
386 | 
387 |     def get_gfycat_embedded_video_url(self, url):
388 |         try:
389 |             response = requests.get(url)
390 |             data = response.text
391 |             soup = BeautifulSoup(data, features="html.parser")
392 | 
393 |             # Cycle through all links
394 |             giant_url_found = False
395 |             giant_url = ""
396 |             thumbs_url_found = False
397 |             thumbs_url = ""
398 |             for link in soup.find_all():
399 |                 link_src = link.get('src')
400 |                 src_url = str(link_src)
401 |                 if ".mp4" in src_url:
402 |                     # Looking for giant.gfycat.com
403 |                     if "giant." in src_url:
404 |                         giant_url_found = True
405 |                         giant_url = src_url
406 |                     elif "thumbs." 
in src_url: 407 | thumbs_url_found = True 408 | thumbs_url = src_url 409 | except Exception as e: 410 | self.print_formatted_error(e) 411 | return "" 412 | 413 | if giant_url_found: 414 | return giant_url 415 | elif thumbs_url_found: 416 | return thumbs_url 417 | else: 418 | return "" 419 | 420 | def guess_extension(self, url): 421 | response = requests.get(url) 422 | content_type = response.headers['content-type'] 423 | return mimetypes.guess_extension(content_type) 424 | 425 | def get_redirect_url(self, url): 426 | r = requests.get(url) 427 | return r.url 428 | 429 | def download_gfycat_or_redgif(self, submission, output_dir): 430 | # Check if gfycat redirects to gifdeliverynetwork 431 | redirect_url = self.get_redirect_url(submission.url) 432 | if "gfycat.com" in submission.url and "gifdeliverynetwork.com" in redirect_url: 433 | self.logger.spam( 434 | self.indent_2 + "This is a gfycat link that redirects to gifdeliverynetwork.com") 435 | try: 436 | # Gfycat link that redirects to gifdeliverynetwork 437 | # True source in this case is hiding in redgifs.com 438 | response = requests.get(redirect_url) 439 | html = BeautifulSoup(response.content, features="html.parser") 440 | links = html.find_all() 441 | for i in links: 442 | if "src" in str(i): 443 | attrs = i.attrs 444 | if "src" in attrs: 445 | src = attrs["src"] 446 | if "redgifs.com/" in src: 447 | self.logger.spam( 448 | self.indent_2 + "Found embedded media at " + src) 449 | filename = src.split("/")[-1] 450 | save_path = os.path.join(output_dir, filename) 451 | try: 452 | r = requests.get(src) 453 | with open(save_path, 'wb') as outfile: 454 | outfile.write(r.content) 455 | except Exception as e: 456 | self.print_formatted_error(e) 457 | except Exception as e: 458 | self.print_formatted_error(e) 459 | 460 | self.logger.spam( 461 | self.indent_2 + "Looking for submission.preview.reddit_video_preview.fallback_url") 462 | 463 | preview = None 464 | try: 465 | preview = getattr(submission, "preview") 466 | if preview: 467 | if "reddit_video_preview" in preview: 468 | if "fallback_url" in preview["reddit_video_preview"]: 469 | self.logger.spam(self.indent_2 + "Found submission.preview.reddit_video_preview.fallback_url") 470 | fallback_url = preview["reddit_video_preview"]["fallback_url"] 471 | if "." in fallback_url.split("/")[-1]: 472 | file_format = fallback_url.split(".")[-1] 473 | filename = submission.url.split("/")[-1] + "." 
+ file_format 474 | else: 475 | filename = submission.url.split("/")[-1] + ".mp4" 476 | save_path = os.path.join(output_dir, filename) 477 | try: 478 | urllib.request.urlretrieve(fallback_url, save_path) 479 | return 480 | except Exception as e: 481 | self.print_formatted_error(e) 482 | elif "images" in preview: 483 | if "source" in preview["images"][0]: 484 | self.logger.spam(self.indent_2 + "Found submission.preview.images instead") 485 | source_url = preview["images"][0]["source"]["url"] 486 | try: 487 | extension = self.guess_extension(source_url) 488 | filename = submission.url.split("/")[-1] + extension 489 | save_path = os.path.join(output_dir, filename) 490 | try: 491 | urllib.request.urlretrieve(source_url, save_path) 492 | except Exception as e: 493 | self.print_formatted_error(e) 494 | except Exception as e: 495 | self.print_formatted_error(e) 496 | 497 | except Exception as e: 498 | self.print_formatted_error(e) 499 | 500 | try: 501 | self.logger.spam( 502 | self.indent_2 + "Looking for submission.media_embed") 503 | media_embed = getattr(submission, "media_embed") 504 | if media_embed: 505 | content = media_embed["content"] 506 | self.logger.spam( 507 | self.indent_2 + "Found submission.media_embed") 508 | if "iframe" in content: 509 | if "gfycat.com" in submission.url: 510 | self.logger.spam( 511 | self.indent_2 + "This is an embedded video in gfycat.com") 512 | # This is likely an embedded video in gfycat 513 | video_url = self.get_gfycat_embedded_video_url(submission.url) 514 | if video_url: 515 | filename = video_url.split("/")[-1] 516 | save_path = os.path.join(output_dir, filename) 517 | 518 | self.logger.spam( 519 | self.indent_2 + "Embedded video URL: " + video_url) 520 | try: 521 | r = requests.get(video_url) 522 | with open(save_path, 'wb') as outfile: 523 | outfile.write(r.content) 524 | except Exception as e: 525 | self.print_formatted_error(e) 526 | except Exception as e: 527 | self.print_formatted_error(e) 528 | 529 | def is_imgur_album(self, url): 530 | return "imgur.com/a/" in url or "imgur.com/gallery/" in url 531 | 532 | def get_imgur_album_images_count(self, album_id): 533 | request = "https://api.imgur.com/3/album/" + album_id 534 | res = requests.get(request, headers={ 535 | "Authorization": "Client-ID " + self.IMGUR_CLIENT_ID}) 536 | if res.status_code == 200: 537 | return res.json()["data"]["images_count"] 538 | else: 539 | self.logger.spam(self.indent_2 + "This imgur album is empty") 540 | return 0 541 | 542 | def get_imgur_image_meta(self, image_id): 543 | request = "https://api.imgur.com/3/image/" + image_id 544 | res = requests.get(request, headers={ 545 | "Authorization": "Client-ID " + self.IMGUR_CLIENT_ID}) 546 | return res.json()["data"] 547 | 548 | def download_imgur_album(self, submission, output_dir): 549 | # Imgur album 550 | album_id = "" 551 | if "imgur.com/a/" in submission.url: 552 | album_id = submission.url.split("imgur.com/a/")[-1] 553 | elif "imgur.com/gallery/" in submission.url: 554 | album_id = submission.url.split("imgur.com/gallery/")[-1] 555 | 556 | self.logger.spam(self.indent_2 + "Album ID " + album_id) 557 | 558 | images_count = self.get_imgur_album_images_count(album_id) 559 | if images_count > 0: 560 | request = "https://api.imgur.com/3/album/" + album_id 561 | res = requests.get(request, headers={ 562 | "Authorization": "Client-ID " + self.IMGUR_CLIENT_ID}) 563 | self.logger.spam(self.indent_2 + "This imgur album has " + 564 | str(images_count) + " images") 565 | for i, image in 
tqdm(enumerate(res.json()["data"]["images"]), total=images_count, bar_format='%s%s{l_bar}{bar:20}{r_bar}%s' % (self.indent_2, Fore.WHITE + Fore.LIGHTBLACK_EX, Fore.RESET)):
566 |                 url = image["link"]
567 |                 filename = str(i).zfill(3) + "_" + url.split("/")[-1]
568 |                 save_path = os.path.join(output_dir, filename)
569 |                 try:
570 |                     if not os.path.exists(output_dir):
571 |                         os.makedirs(output_dir)
572 |                     urllib.request.urlretrieve(url, save_path)
573 |                 except Exception as e:
574 |                     self.print_formatted_error(e)
575 | 
576 |     def is_imgur_image(self, url):
577 |         return "imgur.com" in url
578 | 
579 |     def download_imgur_image(self, submission, output_dir):
580 |         # Other imgur content, e.g., .gifv, '.mp4', '.jpg', etc.
581 |         url_leaf = submission.url.split("/")[-1]
582 |         if "." in url_leaf:
583 |             image_id = url_leaf.split(".")[0]
584 |         else:
585 |             image_id = url_leaf
586 | 
587 |         try:
588 |             data = self.get_imgur_image_meta(image_id)
589 |             url = data["link"]
590 |             image_type = data["type"]
591 |             if "video/" in image_type:
592 |                 self.logger.spam(
593 |                     self.indent_2 + "This is an imgur link to a video file")
594 |                 image_type = image_type.split("video/")[-1]
595 |             elif "image/" in image_type:
596 |                 self.logger.spam(
597 |                     self.indent_2 + "This is an imgur link to an image file")
598 |                 image_type = image_type.split("image/")[-1]
599 | 
600 |             filename = image_id + "." + image_type
601 |             save_path = os.path.join(output_dir, filename)
602 | 
603 |             urllib.request.urlretrieve(url, save_path)
604 |         except Exception as e:
605 |             self.print_formatted_error(e)
606 | 
607 |     def download_comments(self, submission, output_dir, comment_limit):
608 |         # Save comments - breadth-first unwrap of the comment forest
609 |         comments_list = []
610 |         with open(os.path.join(output_dir, 'comments.json'), 'w') as file:
611 |             submission.comments.replace_more(limit=comment_limit)
612 |             limited_comments = submission.comments.list()
613 |             if not len(limited_comments):
614 |                 # No comments
615 |                 self.logger.spam(self.indent_2 + "No comments found")
616 |                 return
617 | 
618 |             for comment in tqdm(limited_comments, total=len(limited_comments), bar_format='%s%s{l_bar}{bar:20}{r_bar}%s' % (self.indent_2, Fore.WHITE + Fore.LIGHTBLACK_EX, Fore.RESET)):
619 |                 comment_dict = {}
620 |                 try:
621 |                     if comment.author:
622 |                         comment_dict["author"] = comment.author.name
623 |                     else:
624 |                         comment_dict["author"] = None
625 |                     comment_dict["body"] = comment.body
626 |                     comment_dict["created_utc"] = int(comment.created_utc)
627 |                     comment_dict["distinguished"] = comment.distinguished
628 |                     comment_dict["downs"] = comment.downs
629 |                     comment_dict["edited"] = comment.edited
630 |                     comment_dict["id"] = comment.id
631 |                     comment_dict["is_submitter"] = comment.is_submitter
632 |                     comment_dict["link_id"] = comment.link_id
633 |                     comment_dict["parent_id"] = comment.parent_id
634 |                     comment_dict["permalink"] = comment.permalink
635 |                     comment_dict["score"] = comment.score
636 |                     comment_dict["stickied"] = comment.stickied
637 |                     comment_dict["subreddit_name_prefixed"] = comment.subreddit_name_prefixed
638 |                     comment_dict["subreddit_id"] = comment.subreddit_id
639 |                     comment_dict["total_awards_received"] = comment.total_awards_received
640 |                     comment_dict["ups"] = comment.ups
641 |                 except Exception as e:
642 |                     self.print_formatted_error(e)
643 |                 comments_list.append(comment_dict)
644 |             file.write(json.dumps(comments_list, indent=2))
645 | 
646 |     def is_self_post(self, submission):
647 |         return submission.is_self
648 | 
649 |     def download_submission_meta(self, submission, submission_dir):
650 |         submission_dict = {}
651 |         if submission.author:
652 |             submission_dict["author"] = submission.author.name
653 |         else:
654 |             submission_dict["author"] = None
655 |         submission_dict["created_utc"] = int(submission.created_utc)
656 |         submission_dict["distinguished"] = submission.distinguished
657 |         submission_dict["downs"] = submission.downs
658 |         submission_dict["edited"] = submission.edited
659 |         submission_dict["id"] = submission.id
660 |         submission_dict["link_flair_text"] = submission.link_flair_text
661 |         submission_dict["locked"] = submission.locked
662 |         submission_dict["num_comments"] = submission.num_comments
663 |         submission_dict["num_crossposts"] = submission.num_crossposts
664 |         submission_dict["permalink"] = submission.permalink
665 |         submission_dict["selftext"] = submission.selftext
666 |         submission_dict["selftext_html"] = submission.selftext_html
667 |         submission_dict["send_replies"] = submission.send_replies
668 |         submission_dict["spoiler"] = submission.spoiler
669 |         submission_dict["stickied"] = submission.stickied
670 |         submission_dict["subreddit_name_prefixed"] = submission.subreddit_name_prefixed
671 |         submission_dict["subreddit_id"] = submission.subreddit_id
672 |         submission_dict["subreddit_subscribers"] = submission.subreddit_subscribers
673 |         submission_dict["subreddit_type"] = submission.subreddit_type
674 |         submission_dict["title"] = submission.title
675 |         submission_dict["total_awards_received"] = submission.total_awards_received
676 |         submission_dict["ups"] = submission.ups
677 |         submission_dict["upvote_ratio"] = submission.upvote_ratio
678 |         submission_dict["url"] = submission.url
679 | 
680 |         with open(os.path.join(submission_dir, "submission.json"), 'w') as file:
681 |             file.write(json.dumps(submission_dict, indent=2))
682 | 
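683 | # Illustrative standalone use (normally these objects are constructed by the
684 | # downloader classes; the submission id and credentials below are placeholders):
685 | #
686 | #   import praw, verboselogs
687 | #   reddit = praw.Reddit(client_id="...", client_secret="...",
688 | #                        user_agent="saveddit (by /u/p_ranav)")
689 | #   logger = verboselogs.VerboseLogger(__name__)
690 | #   SubmissionDownloader(reddit.submission(id="abc123"), 0, logger, "/tmp/out",
691 | #                        False, False, False, 0, {"imgur_client_id": "..."})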
--------------------------------------------------------------------------------
/src/saveddit/subreddit_downloader.py:
--------------------------------------------------------------------------------
1 | import coloredlogs
2 | from colorama import Fore
3 | import logging
4 | import verboselogs
5 | import os
6 | import praw
7 | from saveddit.configuration import ConfigurationLoader
8 | from saveddit.submission_downloader import SubmissionDownloader
9 | from saveddit.subreddit_downloader_config import SubredditDownloaderConfig
10 | 
11 | class SubredditDownloader:
12 |     app_config_dir = os.path.expanduser("~/.saveddit")
13 |     if not os.path.exists(app_config_dir):
14 |         os.makedirs(app_config_dir)
15 | 
16 |     config_file_location = os.path.expanduser("~/.saveddit/user_config.yaml")
17 |     config = ConfigurationLoader.load(config_file_location)
18 | 
19 |     REDDIT_CLIENT_ID = config['reddit_client_id']
20 |     REDDIT_CLIENT_SECRET = config['reddit_client_secret']
21 |     IMGUR_CLIENT_ID = config['imgur_client_id']
22 | 
23 |     def __init__(self, subreddit_name):
24 |         self.subreddit_name = subreddit_name
25 |         reddit = praw.Reddit(
26 |             client_id=SubredditDownloader.REDDIT_CLIENT_ID,
27 |             client_secret=SubredditDownloader.REDDIT_CLIENT_SECRET,
28 |             user_agent="saveddit (by /u/p_ranav)",
29 |         )
30 |         self.subreddit = reddit.subreddit(subreddit_name)
31 | 
32 |         self.logger = verboselogs.VerboseLogger(__name__)
33 |         level_styles = {
34 |             'critical': {'bold': True, 'color': 'red'},
35 |             'debug': {'color': 'green'},
36 |             'error': {'color': 'red'},
37 |             'info': {'color': 'white'},
38 |             'notice': {'color': 'magenta'},
39 |             'spam': {'color': 'white', 'faint': True},
40 |             'success': {'bold': True, 'color': 'green'},
41 |             'verbose': {'color': 'blue'},
42 |             'warning': {'color': 'yellow'}
43 |         }
44 |         coloredlogs.install(level='SPAM', logger=self.logger,
45 |                             fmt='%(message)s', level_styles=level_styles)
46 | 
47 |     def download(self, output_path, download_all_comments, categories=SubredditDownloaderConfig.DEFAULT_CATEGORIES, post_limit=SubredditDownloaderConfig.DEFAULT_POST_LIMIT, skip_videos=False, skip_meta=False, skip_comments=False):
48 |         '''
49 |         categories: List of categories within the subreddit to download (see SubredditDownloaderConfig.DEFAULT_CATEGORIES)
50 |         post_limit: Number of posts to download (default: None, i.e., all posts)
51 |         download_all_comments: False saves only top-level comments of each submission;
52 |                                True fetches the entire comment forest
53 |         '''
54 |         root_dir = os.path.join(output_path, "www.reddit.com", "r",
55 |                                 self.subreddit_name)
56 | 
57 |         if not download_all_comments:
58 |             comment_limit = 0
59 |         else:
60 |             comment_limit = None
61 | 
62 |         for c in categories:
63 |             self.logger.notice("Downloading from /r/" +
64 |                                self.subreddit_name + "/" + c + "/")
65 |             category_dir = os.path.join(root_dir, c)
66 |             if not os.path.exists(category_dir):
67 |                 os.makedirs(category_dir)
68 |             category_function = getattr(self.subreddit, c)
69 | 
70 |             for i, submission in enumerate(category_function(limit=post_limit)):
71 |                 SubmissionDownloader(submission, i, self.logger, category_dir,
72 |                                      skip_videos, skip_meta, skip_comments, comment_limit,
73 |                                      {'imgur_client_id': SubredditDownloader.IMGUR_CLIENT_ID})
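74 | 
75 | # Illustrative use (assumes a populated ~/.saveddit/user_config.yaml):
76 | #
77 | #   downloader = SubredditDownloader("pics")
78 | #   downloader.download("/tmp/saveddit", download_all_comments=False,
79 | #                       categories=["top"], post_limit=5, skip_videos=True)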
--------------------------------------------------------------------------------
/src/saveddit/subreddit_downloader_config.py:
--------------------------------------------------------------------------------
1 | class SubredditDownloaderConfig:
2 |     DEFAULT_CATEGORIES = ["hot", "new", "random_rising", "rising",
3 |                           "controversial", "top", "gilded"]
4 |     DEFAULT_POST_LIMIT = None
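5 | 
6 | # Each category name must match a listing method on praw's Subreddit model
7 | # (subreddit.hot(), subreddit.top(), ...), since SubredditDownloader resolves
8 | # categories with getattr(self.subreddit, c).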
--------------------------------------------------------------------------------
/src/saveddit/user_downloader.py:
--------------------------------------------------------------------------------
1 | import coloredlogs
2 | from colorama import Fore, Style
3 | from datetime import datetime, timezone
4 | import logging
5 | import verboselogs
6 | import getpass
7 | import json
8 | import os
9 | import praw
10 | from pprint import pprint
11 | import re
12 | from saveddit.submission_downloader import SubmissionDownloader
13 | from saveddit.subreddit_downloader import SubredditDownloader
14 | import sys
15 | from tqdm import tqdm
16 | 
17 | class UserDownloader:
18 |     config = SubredditDownloader.config
19 | 
20 |     REDDIT_CLIENT_ID = config['reddit_client_id']
21 |     REDDIT_CLIENT_SECRET = config['reddit_client_secret']
22 |     REDDIT_USERNAME = None
23 |     try:
24 |         REDDIT_USERNAME = config['reddit_username']
25 |     except Exception as e:
26 |         pass
27 | 
28 |     REDDIT_PASSWORD = None
29 |     if REDDIT_USERNAME:
30 |         if sys.stdin.isatty():
31 |             print("Username: " + REDDIT_USERNAME)
32 |             REDDIT_PASSWORD = getpass.getpass("Password: ")
33 |         else:
34 |             # echo "foobar" > password
35 |             # saveddit user .... < password
36 |             REDDIT_PASSWORD = sys.stdin.readline().rstrip()
37 | 
38 |     IMGUR_CLIENT_ID = config['imgur_client_id']
39 | 
40 |     def __init__(self):
41 |         self.logger = verboselogs.VerboseLogger(__name__)
42 |         level_styles = {
43 |             'critical': {'bold': True, 'color': 'red'},
44 |             'debug': {'color': 'green'},
45 |             'error': {'color': 'red'},
46 |             'info': {'color': 'white'},
47 |             'notice': {'color': 'magenta'},
48 |             'spam': {'color': 'white', 'faint': True},
49 |             'success': {'bold': True, 'color': 'green'},
50 |             'verbose': {'color': 'blue'},
51 |             'warning': {'color': 'yellow'}
52 |         }
53 |         coloredlogs.install(level='SPAM', logger=self.logger,
54 |                             fmt='%(message)s', level_styles=level_styles)
55 | 
56 |         if not UserDownloader.REDDIT_USERNAME:
57 |             self.logger.error("`reddit_username` in user_config.yaml is empty")
58 |             self.logger.error("If you plan on using the user API of saveddit, then add your username to user_config.yaml")
59 |             print("Exiting now")
60 |             exit()
61 |         else:
62 |             if not len(UserDownloader.REDDIT_PASSWORD):
63 |                 if sys.stdin.isatty():
64 |                     print("Username: " + UserDownloader.REDDIT_USERNAME)
65 |                     UserDownloader.REDDIT_PASSWORD = getpass.getpass("Password: ")
66 |                 else:
67 |                     # echo "foobar" > password
68 |                     # saveddit user .... < password
69 |                     UserDownloader.REDDIT_PASSWORD = sys.stdin.readline().rstrip()
70 | 
71 |         self.reddit = praw.Reddit(
72 |             client_id=UserDownloader.REDDIT_CLIENT_ID,
73 |             client_secret=UserDownloader.REDDIT_CLIENT_SECRET,
74 |             user_agent="saveddit (by /u/p_ranav)",
75 |             username=UserDownloader.REDDIT_USERNAME,
76 |             password=UserDownloader.REDDIT_PASSWORD
77 |         )
78 | 
79 |     def download_user_meta(self, args):
80 |         output_path = args.o
81 | 
82 |         for username in args.users:
83 |             user = self.reddit.redditor(name=username)
84 | 
85 |             root_dir = os.path.join(output_path, "www.reddit.com",
86 |                                     "u", username)
87 | 
88 |             if not os.path.exists(root_dir):
89 |                 os.makedirs(root_dir)
90 | 
91 |             with open(os.path.join(root_dir, 'user.json'), 'w') as file:
92 |                 user_dict = {}
93 |                 user_dict["comment_karma"] = user.comment_karma
94 |                 user_dict["created_utc"] = int(user.created_utc)
95 |                 user_dict["has_verified_email"] = user.has_verified_email
96 |                 user_dict["icon_img"] = user.icon_img
97 |                 user_dict["id"] = user.id
98 |                 user_dict["is_employee"] = user.is_employee
99 |                 user_dict["is_friend"] = user.is_friend
100 |                 user_dict["is_mod"] = user.is_mod
101 |                 user_dict["is_gold"] = user.is_gold
102 |                 try:
103 |                     user_dict["is_suspended"] = user.is_suspended
104 |                 except Exception as e:
105 |                     user_dict["is_suspended"] = None
106 |                 user_dict["link_karma"] = user.link_karma
107 |                 user_dict["name"] = user.name
108 | 
109 |                 file.write(json.dumps(user_dict, indent=2))
110 | 
111 |     def download_comments(self, args):
112 |         output_path = args.o
113 | 
114 |         for username in args.users:
115 |             user = self.reddit.redditor(name=username)
116 | 
117 |             self.logger.notice("Downloading from /u/" + username + "/comments")
118 | 
119 |             root_dir = os.path.join(output_path, "www.reddit.com",
120 |                                     "u", username)
121 | 
122 |             try:
123 |                 sort = args.s
124 |                 limit = args.l
125 | 
126 |                 comments_dir = os.path.join(root_dir, "comments")
127 |                 if not os.path.exists(comments_dir):
128 |                     os.makedirs(comments_dir)
129 | 
130 |                 self.logger.verbose("Downloading comments sorted by " + sort)
131 |                 category_function = getattr(user.comments, sort)
132 | 
133 |                 category_dir = os.path.join(comments_dir, sort)
134 | 
135 |                 if category_function:
136 |                     if not os.path.exists(category_dir):
137 | 
os.makedirs(category_dir) 138 | for i, comment in enumerate(category_function(limit=limit)): 139 | prefix_str = '#' + str(i).zfill(3) + ' ' 140 | self.indent_1 = ' ' * len(prefix_str) + "* " 141 | self.indent_2 = ' ' * len(self.indent_1) + "- " 142 | 143 | comment_body = comment.body 144 | comment_body = comment_body[0:32] 145 | comment_body = re.sub(r'\W+', '_', comment_body) 146 | comment_filename = str(i).zfill(3) + "_Comment_" + \ 147 | comment_body + "..." + ".json" 148 | self.logger.spam(self.indent_1 + comment.id + ' - "' + comment.body[0:64].replace("\n", "").replace("\r", "") + '..."') 149 | 150 | with open(os.path.join(category_dir, comment_filename), 'w') as file: 151 | comment_dict = {} 152 | try: 153 | if comment.author: 154 | comment_dict["author"] = comment.author.name 155 | else: 156 | comment_dict["author"] = None 157 | comment_dict["body"] = comment.body 158 | comment_dict["created_utc"] = int(comment.created_utc) 159 | comment_dict["distinguished"] = comment.distinguished 160 | comment_dict["downs"] = comment.downs 161 | comment_dict["edited"] = comment.edited 162 | comment_dict["id"] = comment.id 163 | comment_dict["is_submitter"] = comment.is_submitter 164 | comment_dict["link_id"] = comment.link_id 165 | comment_dict["parent_id"] = comment.parent_id 166 | comment_dict["permalink"] = comment.permalink 167 | comment_dict["score"] = comment.score 168 | comment_dict["stickied"] = comment.stickied 169 | comment_dict["subreddit_name_prefixed"] = comment.subreddit_name_prefixed 170 | comment_dict["subreddit_id"] = comment.subreddit_id 171 | comment_dict["total_awards_received"] = comment.total_awards_received 172 | comment_dict["ups"] = comment.ups 173 | file.write(json.dumps(comment_dict, indent=2)) 174 | except Exception as e: 175 | self.print_formatted_error(e) 176 | except Exception as e: 177 | self.logger.error("Unable to download comments for user `" + username + "` - " + str(e)) 178 | 179 | def download_multireddits(self, args): 180 | output_path = args.o 181 | 182 | for username in args.users: 183 | user = self.reddit.redditor(name=username) 184 | 185 | root_dir = os.path.join(os.path.join(os.path.join(os.path.join( 186 | output_path, "www.reddit.com"), "u"), username), "m") 187 | 188 | try: 189 | post_limit = args.l 190 | names = args.n 191 | categories = args.f 192 | skip_meta = args.skip_meta 193 | skip_videos = args.skip_videos 194 | skip_comments = args.skip_comments 195 | comment_limit = 0 # top-level comments ONLY 196 | 197 | # If names is None, download all multireddits from user's page 198 | if not names: 199 | names = [m.name.lower() for m in user.multireddits()] 200 | else: 201 | names = [n.lower() for n in names] 202 | 203 | for multireddit in user.multireddits(): 204 | if multireddit.name.lower() in names: 205 | name = multireddit.name 206 | self.logger.notice("Downloading from /u/" + username + "/m/" + name) 207 | multireddit_dir = os.path.join(root_dir, name) 208 | if not os.path.exists(multireddit_dir): 209 | os.makedirs(multireddit_dir) 210 | 211 | for category in categories: 212 | 213 | self.logger.verbose("Downloading submissions sorted by " + category) 214 | category_function = getattr(multireddit, category) 215 | 216 | category_dir = os.path.join(multireddit_dir, category) 217 | 218 | if category_function: 219 | for i, s in enumerate(category_function(limit=post_limit)): 220 | try: 221 | prefix_str = '#' + str(i).zfill(3) + ' ' 222 | self.indent_1 = ' ' * len(prefix_str) + "* " 223 | self.indent_2 = ' ' * len(self.indent_1) + "- " 224 | 
SubmissionDownloader(s, i, self.logger, category_dir, skip_videos, skip_meta, skip_comments, comment_limit, 225 | {'imgur_client_id': UserDownloader.IMGUR_CLIENT_ID}) 226 | except Exception as e: 227 | self.logger.error(self.indent_2 + "Unable to download post #" + str(i) + " for user `" + username + "` from multireddit " + name + " - " + str(e)) 228 | except Exception as e: 229 | self.logger.error(self.indent_1 + "Unable to download multireddit posts for user `" + username + "` - " + str(e)) 230 | 231 | def download_submitted(self, args): 232 | output_path = args.o 233 | 234 | for username in args.users: 235 | user = self.reddit.redditor(name=username) 236 | 237 | self.logger.notice("Downloading from /u/" + username + "/submitted") 238 | 239 | root_dir = os.path.join(os.path.join(os.path.join( 240 | output_path, "www.reddit.com"), "u"), username) 241 | 242 | try: 243 | post_limit = args.l 244 | sort = args.s 245 | skip_meta = args.skip_meta 246 | skip_videos = args.skip_videos 247 | skip_comments = args.skip_comments 248 | comment_limit = 0 # top-level comments ONLY 249 | 250 | submitted_dir = os.path.join(root_dir, "submitted") 251 | if not os.path.exists(submitted_dir): 252 | os.makedirs(submitted_dir) 253 | 254 | self.logger.verbose("Downloading submissions sorted by " + sort) 255 | category_function = getattr(user.submissions, sort) 256 | 257 | category_dir = os.path.join(submitted_dir, sort) 258 | 259 | if category_function: 260 | for i, s in enumerate(category_function(limit=post_limit)): 261 | try: 262 | prefix_str = '#' + str(i).zfill(3) + ' ' 263 | self.indent_1 = ' ' * len(prefix_str) + "* " 264 | self.indent_2 = ' ' * len(self.indent_1) + "- " 265 | SubmissionDownloader(s, i, self.logger, category_dir, skip_videos, skip_meta, skip_comments, comment_limit, 266 | {'imgur_client_id': UserDownloader.IMGUR_CLIENT_ID}) 267 | except Exception as e: 268 | self.logger.error(self.indent_2 + "Unable to download post #" + str(i) + " for user `" + username + "` - " + str(e)) 269 | except Exception as e: 270 | self.logger.error(self.indent_1 + "Unable to download submitted posts for user `" + username + "` - " + str(e)) 271 | 272 | def download_upvoted(self, args): 273 | output_path = args.o 274 | 275 | for username in args.users: 276 | user = self.reddit.redditor(name=username) 277 | 278 | self.logger.notice("Downloading from /u/" + username + "/upvoted") 279 | 280 | root_dir = os.path.join(os.path.join(os.path.join( 281 | output_path, "www.reddit.com"), "u"), username) 282 | 283 | try: 284 | post_limit = args.l 285 | skip_meta = args.skip_meta 286 | skip_videos = args.skip_videos 287 | skip_comments = args.skip_comments 288 | comment_limit = 0 # top-level comments ONLY 289 | 290 | upvoted_dir = os.path.join(root_dir, "upvoted") 291 | if not os.path.exists(upvoted_dir): 292 | os.makedirs(upvoted_dir) 293 | 294 | for i, s in enumerate(user.upvoted(limit=post_limit)): 295 | try: 296 | prefix_str = '#' + str(i).zfill(3) + ' ' 297 | self.indent_1 = ' ' * len(prefix_str) + "* " 298 | self.indent_2 = ' ' * len(self.indent_1) + "- " 299 | SubmissionDownloader(s, i, self.logger, upvoted_dir, skip_videos, skip_meta, skip_comments, comment_limit, 300 | {'imgur_client_id': UserDownloader.IMGUR_CLIENT_ID}) 301 | except Exception as e: 302 | self.logger.error(self.indent_2 + "Unable to download post #" + str(i) + " for user `" + username + "` - " + str(e)) 303 | except Exception as e: 304 | self.logger.error("Unable to download upvoted posts for user `" + username + "` - " + str(e)) 305 | 306 | def 
download_saved(self, args): 307 | output_path = args.o 308 | 309 | for username in args.users: 310 | user = self.reddit.redditor(name=username) 311 | 312 | self.logger.notice("Downloading from /u/" + username + "/saved") 313 | 314 | root_dir = os.path.join(os.path.join(os.path.join( 315 | output_path, "www.reddit.com"), "u"), username) 316 | 317 | try: 318 | post_limit = args.l 319 | skip_meta = args.skip_meta 320 | skip_videos = args.skip_videos 321 | skip_comments = args.skip_comments 322 | comment_limit = 0 # top-level comments ONLY 323 | 324 | saved_dir = os.path.join(root_dir, "saved") 325 | if not os.path.exists(saved_dir): 326 | os.makedirs(saved_dir) 327 | 328 | for i, s in enumerate(user.saved(limit=post_limit)): 329 | try: 330 | prefix_str = '#' + str(i).zfill(3) + ' ' 331 | self.indent_1 = ' ' * len(prefix_str) + "* " 332 | self.indent_2 = ' ' * len(self.indent_1) + "- " 333 | if isinstance(s, praw.models.Comment) and not skip_comments: 334 | self.logger.verbose( 335 | prefix_str + "Comment `" + str(s.id) + "` by " + str(s.author) + " \"" + s.body[0:32].replace("\n", "").replace("\r", "") + "...\"") 336 | 337 | comment_body = s.body 338 | comment_body = comment_body[0:32] 339 | comment_body = re.sub(r'\W+', '_', comment_body) 340 | post_dir = str(i).zfill(3) + "_Comment_" + \ 341 | comment_body + "..." 342 | submission_dir = os.path.join(saved_dir, post_dir) 343 | self.download_saved_comment(s, submission_dir) 344 | elif isinstance(s, praw.models.Comment): 345 | self.logger.verbose( 346 | prefix_str + "Comment `" + str(s.id) + "` by " + str(s.author)) 347 | self.logger.spam(self.indent_2 + "Skipping comment") 348 | elif isinstance(s, praw.models.Submission): 349 | SubmissionDownloader(s, i, self.logger, saved_dir, skip_videos, skip_meta, skip_comments, comment_limit, 350 | {'imgur_client_id': UserDownloader.IMGUR_CLIENT_ID}) 351 | else: 352 | pass 353 | except Exception as e: 354 | self.logger.error(self.indent_2 + "Unable to download #" + str(i) + " for user `" + username + "` - " + str(e)) 355 | except Exception as e: 356 | self.logger.error("Unable to download saved for user `" + username + "` - " + str(e)) 357 | 358 | def download_gilded(self, args): 359 | output_path = args.o 360 | 361 | for username in args.users: 362 | user = self.reddit.redditor(name=username) 363 | 364 | self.logger.notice("Downloading from /u/" + username + "/gilded") 365 | 366 | root_dir = os.path.join(os.path.join(os.path.join( 367 | output_path, "www.reddit.com"), "u"), username) 368 | 369 | try: 370 | post_limit = args.l 371 | skip_meta = args.skip_meta 372 | skip_videos = args.skip_videos 373 | skip_comments = args.skip_comments 374 | comment_limit = 0 # top-level comments ONLY 375 | 376 | saved_dir = os.path.join(root_dir, "gilded") 377 | if not os.path.exists(saved_dir): 378 | os.makedirs(saved_dir) 379 | 380 | for i, s in enumerate(user.gilded(limit=post_limit)): 381 | try: 382 | prefix_str = '#' + str(i).zfill(3) + ' ' 383 | self.indent_1 = ' ' * len(prefix_str) + "* " 384 | self.indent_2 = ' ' * len(self.indent_1) + "- " 385 | if isinstance(s, praw.models.Comment) and not skip_comments: 386 | self.logger.verbose( 387 | prefix_str + "Comment `" + str(s.id) + "` by " + str(s.author) + " \"" + s.body[0:32].replace("\n", "").replace("\r", "") + "...\"") 388 | 389 | comment_body = s.body 390 | comment_body = comment_body[0:32] 391 | comment_body = re.sub(r'\W+', '_', comment_body) 392 | post_dir = str(i).zfill(3) + "_Comment_" + \ 393 | comment_body + "..." 
394 | submission_dir = os.path.join(saved_dir, post_dir) 395 | self.download_saved_comment(s, submission_dir) 396 | elif isinstance(s, praw.models.Comment): 397 | self.logger.verbose( 398 | prefix_str + "Comment `" + str(s.id) + "` by " + str(s.author)) 399 | self.logger.spam(self.indent_2 + "Skipping comment") 400 | elif isinstance(s, praw.models.Submission): 401 | SubmissionDownloader(s, i, self.logger, saved_dir, skip_videos, skip_meta, skip_comments, comment_limit, 402 | {'imgur_client_id': UserDownloader.IMGUR_CLIENT_ID}) 403 | else: 404 | pass 405 | except Exception as e: 406 | self.logger.error(self.indent_2 + "Unable to download #" + str(i) + " for user `" + username + "` - " + str(e)) 407 | except Exception as e: 408 | self.logger.error("Unable to download gilded for user `" + username + "` - " + str(e)) 409 | 410 | def print_formatted_error(self, e): 411 | for line in str(e).split("\n"): 412 | self.logger.error(self.indent_2 + line) 413 | 414 | def download_saved_comment(self, comment, output_dir): 415 | if not os.path.exists(output_dir): 416 | os.makedirs(output_dir) 417 | self.logger.spam( 418 | self.indent_2 + "Saving comment.json to " + output_dir) 419 | with open(os.path.join(output_dir, 'comments.json'), 'w') as file: 420 | comment_dict = {} 421 | try: 422 | if comment.author: 423 | comment_dict["author"] = comment.author.name 424 | else: 425 | comment_dict["author"] = None 426 | comment_dict["body"] = comment.body 427 | comment_dict["created_utc"] = int(comment.created_utc) 428 | comment_dict["distinguished"] = comment.distinguished 429 | comment_dict["downs"] = comment.downs 430 | comment_dict["edited"] = comment.edited 431 | comment_dict["id"] = comment.id 432 | comment_dict["is_submitter"] = comment.is_submitter 433 | comment_dict["link_id"] = comment.link_id 434 | comment_dict["parent_id"] = comment.parent_id 435 | comment_dict["permalink"] = comment.permalink 436 | comment_dict["score"] = comment.score 437 | comment_dict["stickied"] = comment.stickied 438 | comment_dict["subreddit_name_prefixed"] = comment.subreddit_name_prefixed 439 | comment_dict["subreddit_id"] = comment.subreddit_id 440 | comment_dict["total_awards_received"] = comment.total_awards_received 441 | comment_dict["ups"] = comment.ups 442 | file.write(json.dumps(comment_dict, indent=2)) 443 | self.logger.spam( 444 | self.indent_2 + "Successfully saved comment.json") 445 | except Exception as e: 446 | self.print_formatted_error(e) -------------------------------------------------------------------------------- /src/saveddit/user_downloader_config.py: -------------------------------------------------------------------------------- 1 | class UserDownloaderConfig: 2 | DEFAULT_CATEGORIES = ["hot", "new", "random_rising", "rising", 3 | "controversial", "top", "gilded"] 4 | DEFAULT_SORT = "hot" 5 | DEFAULT_SORT_OPTIONS = ["hot", "new", "top", "controversial"] 6 | DEFAULT_POST_LIMIT = None 7 | DEFAULT_COMMENT_LIMIT = None --------------------------------------------------------------------------------