├── .gitignore ├── .idea ├── inspectionProfiles │ ├── Project_Default.xml │ └── profiles_settings.xml └── misc.xml ├── LICENSE ├── Procfile ├── README.md ├── requirements.txt ├── runtime.txt └── snapshill.py /.gitignore: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # This .gitignore file was automatically created by Microsoft(R) Visual Studio. 3 | ################################################################################ 4 | 5 | /SnapshillBot.pyproj 6 | /.idea 7 | /.idea 8 | /.idea/scopes 9 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 12 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 7 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Procfile: -------------------------------------------------------------------------------- 1 | worker: python snapshill.py 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | SnapshillBot is a bot that archives posts in subreddits that the mods have requested the bot to run in. It uses a variety of different archiving services such as [archive.is](https://www.archive.is), [archive.org](https://www.archive.org), [megalodon.jp](http://megalodon.jp), and for reddit links, [Ceddit](http://www.ceddit.com). 2 | 3 | The bot only posts on subreddits where the moderators have requested its presence. Have more questions? Check out [the faq](https://www.reddit.com/r/SnapshillBot/wiki/faq) and if that doesn't answer your question, make a post. 4 | 5 | **Requests for adding subreddits are currently being suspended. 
We'll try to update you on the status of requests soon.** 6 | 7 | #More Information 8 | 9 | * [Wiki](https://www.reddit.com/r/SnapshillBot/wiki/) 10 | * [IRC Chatroom](https://kiwiirc.com/client/irc.snoonet.org/snapshillbot) 11 | * [Source Code](https://github.com/justcool393/SnapshillBot) 12 | 13 | #Subreddit Rules 14 | 15 | * Keep conversation civil and topical. Remember to follow [reddiquette](https://www.reddit.com/wiki/reddiquette). 16 | * Hate speech and personal attacks aren't tolerated and will be removed. 17 | 18 | Don't hesitate to contact us using the 'message the moderators' link below if you have any questions. 19 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.7.1 2 | praw==6.2.0 3 | -------------------------------------------------------------------------------- /runtime.txt: -------------------------------------------------------------------------------- 1 | python-3.4.0 -------------------------------------------------------------------------------- /snapshill.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import praw 4 | import re 5 | import random 6 | import requests 7 | import sqlite3 8 | import time 9 | import traceback 10 | import warnings 11 | 12 | from bs4 import BeautifulSoup 13 | from html.parser import unescape 14 | from urllib.parse import urlencode 15 | 16 | from praw.exceptions import APIException, ClientException, PRAWException 17 | from prawcore.exceptions import PrawcoreException 18 | from requests.exceptions import ConnectionError 19 | 20 | USER_AGENT = "Archives to archive.is and archive.org (/r/SnapshillBot) v1.4" 21 | INFO = "/r/SnapshillBot" 22 | CONTACT = "/message/compose?to=\/r\/SnapshillBot" 23 | ARCHIVE_ORG_FORMAT = "%Y%m%d%H%M%S" 24 | MEGALODON_JP_FORMAT = "%Y-%m%d-%H%M-%S" 25 | DB_FILE = 
# Matches the scheme and host of a reddit.com URL served from a known
# subdomain: a two-letter language code with an optional two-letter suffix
# ("de", "en-us"), or one of the named hosts (www, old, m, ...).
# Raw strings keep "\." a regex escape instead of an invalid Python string
# escape, and the classes are spelled [A-Za-z] because the range [A-z] also
# covers the punctuation that sits between "Z" and "a" ([ \ ] ^ _ `).
# NOTE(review): the subdomain group is mandatory, so a bare
# "https://reddit.com/..." URL does not match -- confirm that is intended.
REDDIT_PATTERN = re.compile(
    r"https?://(([A-Za-z]{2})(-[A-Za-z]{2})"
    r"?|beta|i|m|pay|ssl|www|old|new|alpha)\.?reddit\.com"
)
# Matches URLs whose path ends at a bare user or subreddit page
# (".../u/name", ".../user/name/", ".../r/name").
SUBREDDIT_OR_USER = re.compile(r"/(u|user|r)/[^/]+/?$")
class NameMixin:
    """
    Mixin that supplies the display name used for an archive's link.
    Classes using it provide ``site_name`` and an ``archived`` attribute.
    """

    site_name = None

    @property
    def name(self):
        """
        Label to show for this archive.
        :return: ``site_name`` when ``archived`` is truthy, otherwise the
                 name wrapped in underscores with a trailing backslash-escaped
                 asterisk.
        """
        if self.archived:
            return self.site_name
        # "\\*" spells the same backslash + asterisk the old "\*" produced,
        # without relying on an invalid string escape (which newer Python
        # versions warn about).
        return "_{}\\*_".format(self.site_name)
class ArchiveOrgArchive(NameMixin):
    """
    Archive of a URL requested from web.archive.org. ``archived`` holds the
    result of archive(); ``error_link`` is the save URL used when the
    automatic attempt failed.
    """

    site_name = "archive.org"

    def __init__(self, url):
        """
        Requests the archive as soon as the object is created.
        :param url: URL to archive.
        """
        self.url = url
        self.archived = self.archive()
        self.error_link = "https://web.archive.org/save/" + self.url

    def archive(self):
        """
        Archives to archive.org. The website gives a 403 Forbidden when the
        archive cannot be generated (because it follows robots.txt rules)
        :return: URL of the archive, False if an error occurred, or None if
                 we cannot archive this page.
        """
        try:
            res = requests.get("https://web.archive.org/save/" + self.url)
        except RECOVERABLE_EXC:
            # The previous handler tested the exception against HTTPError, a
            # name that is never imported, so every connection failure turned
            # into a NameError instead of returning False.
            return False

        # requests.get() does not raise for HTTP error statuses, so the
        # documented 403 case has to be read from the response itself.
        if res.status_code == 403:
            return None

        date = time.strftime(ARCHIVE_ORG_FORMAT, time.gmtime())
        return "https://web.archive.org/" + date + "/" + self.url
class Notification:
    """
    Builds the comment listing the archives for one post and publishes it.
    """

    def __init__(self, reddit, post, header, links):
        """
        :param reddit: Reddit client used when an overflow submission is needed.
        :param post: Submission being replied to.
        :param header: Object whose get() returns the text placed above the
                       list of snapshots.
        :param links: Iterable of containers exposing ``text`` and ``archives``.
        """
        self.reddit = reddit
        self.post = post
        self.header = header
        self.links = links

    def notify(self):
        """
        Replies with a comment containing the archives or, if there are too
        many links to fit in a comment, posts a submission to
        /r/SnapshillBotEx and then makes a comment linking to it. The reply is
        recorded in the links table; when TESTING is set the comment is only
        printed and nothing is posted or recorded.
        :return: Nothing
        """
        try:
            comment = self._build()
            if TESTING:
                print(comment)
                return
            if len(comment) > 9999:
                link = self.post.permalink
                submission = self.reddit.subreddit("SnapshillBotEx").submit(
                    "Archives for " + link, selftext=comment[:39999]
                )
                submission.reply(
                    "The original submission can be found " "here:\n\n" + link
                )
                comment = self.post.reply(
                    "Wow, that's a lot of links! The "
                    "snapshots can be [found here.]("
                    + submission.url
                    + ")\n\n"
                    + get_footer()
                )
                log.info("Posted a comment and new submission")
            else:
                comment = self.post.reply(comment)
        except RECOVERABLE_EXC as e:
            log_error(e)
            return
        cur.execute(
            "INSERT INTO links (id, reply) VALUES (?, ?)",
            (self.post.name, comment.name),
        )

    def _build(self):
        """
        Assembles the markdown body: header, one numbered line per link with
        its archive links, then the footer.
        :return: The comment text.
        """
        parts = [self.header.get(), "Snapshots:"]
        link_format = "[{name}]({archive})"

        # The old test was `subreddit is not "TheseFuckingAccounts"`: an
        # identity check against a string literal, which is always true, so
        # the "u/" escaping below never ran. Compare the subreddit's name by
        # value instead (case-insensitively, via its string form).
        escape_mentions = (
            str(self.post.subreddit).lower() == "thesefuckingaccounts"
        )

        for i, link in enumerate(self.links, 1):
            subparts = []
            log.debug("Found link")

            for archive in link.archives:
                # None means the service cannot archive this page; leave it out.
                if archive.archived is None:
                    continue

                archive_link = archive.archived

                if not archive_link:
                    log.debug("Not found, using error link")
                    archive_link = (
                        archive.error_link
                        + ' "could not auto-archive; click to resubmit it!"'
                    )
                else:
                    log.debug("Found archive")

                subparts.append(
                    link_format.format(name=archive.name, archive=archive_link)
                )

            if escape_mentions:
                link_text = str(link.text).replace("u/", "u\\/")
            else:
                link_text = link.text
            parts.append("{}. {} - {}".format(i, link_text, ", ".join(subparts)))

        parts.append(get_footer())

        return "\n\n".join(parts)
363 | """ 364 | if not self._setup: 365 | raise Exception("Snapshill not ready yet!") 366 | 367 | submissions = self.reddit.front.new(limit=self.limit) 368 | 369 | for submission in submissions: 370 | debugTime = time.time() 371 | warned = False 372 | 373 | log.debug("Found submission.\n" + submission.permalink) 374 | 375 | if not should_notify(submission): 376 | log.debug("Skipping.") 377 | continue 378 | 379 | archives = [ArchiveContainer(fix_url(submission.url), submission.title)] 380 | 381 | if submission.is_self and submission.selftext_html is not None: 382 | log.debug("Found text post...") 383 | 384 | links = BeautifulSoup(unescape(submission.selftext_html)).find_all("a") 385 | 386 | finishedURLs = [] 387 | 388 | for anchor in links: 389 | if time.time() > debugTime + WARN_TIME and not warned: 390 | log.warn( 391 | "Spent over {} seconds on post (ID: {})".format( 392 | WARN_TIME, submission.name 393 | ) 394 | ) 395 | 396 | warned = True 397 | 398 | log.debug("Found link in text post...") 399 | 400 | url = fix_url(anchor["href"]) 401 | 402 | if skip_url(url): 403 | continue 404 | 405 | if url in finishedURLs: 406 | continue # skip for sanity 407 | 408 | archives.append(ArchiveContainer(url, anchor.contents[0])) 409 | finishedURLs.append(url) 410 | ratelimit(url) 411 | 412 | Notification( 413 | self.reddit, 414 | submission, 415 | self._get_header(submission.subreddit), 416 | archives, 417 | ).notify() 418 | db.commit() 419 | 420 | def setup(self): 421 | """ 422 | Logs into reddit and refreshs the header text. 423 | """ 424 | self._login() 425 | self.refresh_headers() 426 | self._setup = True 427 | 428 | def quit(self): 429 | self.headers = {} 430 | self._setup = False 431 | 432 | def refresh_headers(self): 433 | """ 434 | Refreshes the header text for all subreddits. 
db = sqlite3.connect(DB_FILE)
cur = db.cursor()
# Nothing else in this module creates the table that the SELECT/INSERT
# statements use, so a fresh database file would fail on the first query.
# IF NOT EXISTS leaves an already-populated database untouched.
cur.execute("CREATE TABLE IF NOT EXISTS links (id TEXT, reply TEXT)")
db.commit()

if __name__ == "__main__":
    # Credentials and tuning knobs all come from the environment.
    username = os.environ.get("REDDIT_USER")
    password = os.environ.get("REDDIT_PASS")

    client_id = os.environ.get("REDDIT_CLIENT_ID")
    client_secret = os.environ.get("REDDIT_CLIENT_SECRET")

    limit = int(os.environ.get("LIMIT", 25))
    # Clamp to one second: WAIT=0 used to raise ZeroDivisionError in the
    # refresh threshold below and kill the bot after its first pass.
    wait = max(1, int(os.environ.get("WAIT", 5)))
    refresh = int(os.environ.get("REFRESH", 1800))
    # Number of cycles between header reloads; with the defaults this
    # refreshes around ~30 minutes (depending on delays).
    refresh_after = (refresh / wait) / 2

    log.info("Starting...")
    snapshill = Snapshill(
        username,
        password,
        client_id,
        client_secret,
        settings_wiki="SnapshillBot",
        limit=limit,
    )
    snapshill.setup()

    log.info("Started.")
    try:
        cycles = 0
        while True:
            try:
                cycles += 1
                log.info("Running")
                snapshill.run()
                log.info("Done")
                if cycles > refresh_after:
                    log.info("Reloading header text and ignore list...")
                    snapshill.refresh_headers()
                    cycles = 0
            except RECOVERABLE_EXC as e:
                log_error(e)

            time.sleep(wait)
    except KeyboardInterrupt:
        pass
    finally:
        # Runs on Ctrl-C and on unexpected errors alike, so the database
        # connection is always closed.
        snapshill.quit()
        db.close()
    # SystemExit does not depend on the site-provided exit() helper.
    raise SystemExit(0)