├── .gitignore
├── Dockerfile
├── LICENSE
├── README.md
├── app.py
├── config.py
├── database.py
├── requirements.txt
└── setInterval.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
/.env

--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
FROM ubuntu:18.04

ADD app.py /
ADD config.py /
ADD database.py /
ADD setInterval.py /
ADD requirements.txt /

RUN \
  apt-get update && \
  apt-get -y upgrade

RUN apt-get install -y \
  python3 python3-pip python3-dev pkg-config \
  libavformat-dev libavcodec-dev libavdevice-dev \
  libavutil-dev libswscale-dev libswresample-dev libavfilter-dev

RUN pip3 install -r ./requirements.txt

ARG BOT_CLIENT_ID
ARG BOT_CLIENT_SECRET
ARG BOT_USER_AGENT
ARG BOT_USERNAME
ARG BOT_PASSWORD
ARG BOT_SUB_COUNT
ARG BOT_SUBREDDIT0

ENV BOT_CLIENT_ID=$BOT_CLIENT_ID
ENV BOT_CLIENT_SECRET=$BOT_CLIENT_SECRET
ENV BOT_USER_AGENT=$BOT_USER_AGENT
ENV BOT_USERNAME=$BOT_USERNAME
ENV BOT_PASSWORD=$BOT_PASSWORD
ENV BOT_SUB_COUNT=$BOT_SUB_COUNT
ENV BOT_SUBREDDIT0=$BOT_SUBREDDIT0

CMD [ "python3", "./app.py" ]
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 Ali Abdoli

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Repost Checker

# Overview
This bot detects reposts of any kind (link, text, or picture) where the original post has not been deleted, reports the repost, and leaves a comment with details about the original post. It can be configured to work with any subreddit's rules.

# Setup
1. Download Python 3.5+.
2. Download or clone the repository.
3. Run `pip3 install -r requirements.txt` to install the dependencies.
4. Set the environment variables read by the config file with your bot [credentials](https://github.com/reddit-archive/reddit/wiki/OAuth2) and subreddit, as shown in the example below.
5. Run the bot with `python3 app.py`.
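A minimal example of those variables, e.g. in a `.env` file (which `.gitignore` already excludes from version control). All values below are hypothetical placeholders; the names follow the `BOT_*` scheme used by the Dockerfile and config.py, where the trailing index (`0` here) selects the first configured subreddit:

```
BOT_CLIENT_ID=your_app_client_id
BOT_CLIENT_SECRET=your_app_client_secret
BOT_USER_AGENT=RepostCheckerBot/1.0 by u/your_username
BOT_USERNAME=your_bot_account
BOT_PASSWORD=your_bot_password
BOT_SUB_COUNT=1
BOT_SUBREDDIT0=your_test_subreddit
BOT_THRESH0=5
BOT_TEXT_IN_IMAGE0=true
```

Optional per-subreddit variables (see config.py) include `BOT_TOP_DAYS0`, `BOT_HOT_DAYS0`, `BOT_NEW_DAYS0`, `BOT_TOP_NUM_POSTS0`, `BOT_HOT_NUM_POSTS0` and `BOT_NEW_NUM_POSTS0`.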
# Dependencies
See requirements.txt.

# Contribution
Feel free to fork the repository and tackle any issues. You may also open new issues.

# Testing the Bot on Reddit
1) Create a new subreddit and add whatever account the bot will post on behalf of as a moderator.
2) Head to reddit.com/prefs/apps, create a new app, and choose "script" from the radio buttons.
3) Fill out the config file with the data shown on that page after submission.

# Subreddits using the bot
[r/ihadastroke](https://www.reddit.com/r/ihadastroke/)
[r/ProgrammerHumor](https://www.reddit.com/r/ProgrammerHumor)
Keep in mind the bot is not running at the moment because of ongoing changes.

If you are using the bot and your subreddit is not listed above, please open an issue.

--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
# packages that need to be pip installed
import praw
from psaw import PushshiftAPI

# packages that come with python
import traceback
from multiprocessing import Process, Value
from time import sleep

# other files
import config
import database
from setInterval import setInterval

rows = []
reddit = praw.Reddit(client_id=config.client_id,
                     client_secret=config.client_secret,
                     username=config.username,
                     password=config.password,
                     user_agent=config.user_agent)
api = PushshiftAPI(reddit)


@setInterval(1800)
def delete_comment():
    try:
        for comment in reddit.redditor('RepostCheckerBot').comments.new(limit=50):
            if comment.score < -1:
                with open('fails.txt', 'a') as f:
                    f.write(str(comment.body))
                comment.delete()

    except Exception as e:
        print(e)
        print(repr(e))
        if '503' in str(e):
            print('503 from server')
        elif '504' in str(e):
            print('504 from server')
        elif '401' in str(e):
            print('401 from server')
        else:
            with open('errs.txt', 'a') as f:
                f.write('{}\n'.format(traceback.format_exc()))


# the main worker; one process per configured subreddit
class FindPosts(Process):
    def __init__(self, sub_settings):
        # Constructor.
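        # sub_settings layout (built in config.py):
        #   [0] subreddit name, [1]-[3] max post age in days for top/hot/new,
        #   [4]-[6] post limits for top/hot/new, [7] match threshold,
        #   [8] whether to OCR text out of images
        # self.v is a shared turn-taking token between the two scans:
        #   1 = the "new" scan may handle a post, 2 = the "top" scan may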
        Process.__init__(self)
        self.sub_settings = sub_settings
        self.v = Value('i', 0)

    def run(self):
        Process(target=self.find_top_posts).start()
        self.find_new_posts()

    def find_top_posts(self):
        subreddit = reddit.subreddit(self.sub_settings[0])
        print(self.sub_settings)
        new = False
        first_time = True
        print('Starting searching...')
        while True:
            try:
                post = 0
                # walk the subreddit's submission history via Pushshift
                for submission in api.search_submissions(subreddit=subreddit):
                    while True:
                        if (self.v.value != 0) or first_time:
                            try:
                                x = self.v.value
                            except IndexError as e:
                                if 'deque index out of range' not in str(e):
                                    raise IndexError(e)
                            if first_time or (x is not None and x == 2):
                                first_time = False
                                top = True
                                hot = False
                                post += 1
                                result = database.is_logged(
                                    submission.url,
                                    submission.media,
                                    submission.selftext,
                                    submission.permalink,
                                    submission.created_utc,
                                    top,
                                    hot,
                                    new,
                                    self.sub_settings,
                                    reddit,
                                )

                                if result != [['delete', -1, -1, -1, -1, -1]] and (result == [] or submission.created_utc != result[0][2]):
                                    rows.append(database.add_post(
                                        submission.created_utc,
                                        submission.url,
                                        submission.media,
                                        submission.permalink,
                                        submission.selftext,
                                        submission.author,
                                        submission.title,
                                        top,
                                        hot,
                                        new,
                                        self.sub_settings[0],
                                        self.sub_settings[8],
                                    ))
                                    print('{} --> Added {}'.format(
                                        post,
                                        submission.permalink,
                                    ))
                                self.v.value = 1
                                break

            except Exception as e:
                print(traceback.format_exc())
                if '503' in str(e):
                    print('503 from server')
                elif '401' in str(e):
                    print('401 from server')
                else:
                    with open('errs.txt', 'a') as f:
                        f.write(traceback.format_exc())

    def find_new_posts(self):
        subreddit = reddit.subreddit(self.sub_settings[0])
        top = False
        hot = False
        new = True
        limit_val = self.sub_settings[6]
        while True:
            try:
                post = 0
                # fetch submissions from the subreddit's new feed
                # (sub_settings[6] posts on the first pass, 10 afterwards)
                for submission in api.search_submissions(subreddit=subreddit, limit=limit_val):
                    while True:
                        if self.v.value != 0:
                            try:
                                x = self.v.value
                            except IndexError as e:
                                if 'deque index out of range' not in str(e):
                                    raise IndexError(e)
                            if x is not None and x == 1:
                                post += 1
                                result = database.is_logged(
                                    submission.url,
                                    submission.media,
                                    submission.selftext,
                                    submission.permalink,
                                    submission.created_utc,
                                    top,
                                    hot,
                                    new,
                                    self.sub_settings,
                                    reddit,
                                )
                                if result != [['delete', -1, -1, -1, -1, -1]] and (result == [] or submission.created_utc != result[0][2]):
                                    rows.append(database.add_post(
                                        submission.created_utc,
                                        submission.url,
                                        submission.media,
                                        submission.permalink,
                                        submission.selftext,
                                        submission.author,
                                        submission.title,
                                        top,
                                        hot,
                                        new,
                                        self.sub_settings[0],
                                        self.sub_settings[8],
                                    ))
                                    print('{} --> Added {}'.format(
                                        post,
                                        submission.permalink,
                                    ))

                                if result != [] and result != [['delete', -1, -1, -1, -1, -1]]:
                                    print('reported')
                                    # report and make a comment
                                    submission.report('REPOST ALERT')
                                    cntr = 0
                                    table = ''
                                    for i in result:
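                                        # each row of result is:
                                        # [permalink, age string, created_utc,
                                        #  match %, author, title]
                                        # (the return_result layout built in
                                        #  database.is_logged)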
                                        table = '{}{}|[{}](https://reddit.com{})|{}|{}%|{}\n'.format(
                                            table,
                                            str(cntr),
                                            i[5],
                                            i[0],
                                            i[1],
                                            str(i[3]),
                                            i[4],
                                        )
                                        cntr += 1
                                    full_text = 'I have detected that this may be a repost: \n' + \
                                        '\nNum|Post|Date|Match|Author\n:--:|:--:|:--:|:--:|:--:\n{}'.format(table) + \
                                        '\n*Beep Boop* I am a bot | [Source](https://github.com/xXAligatorXx/repostChecker)' + \
                                        '| Contact u/XXAligatorXx for inquiries | The bot will delete its message at -2 score'
                                    do_this = True
                                    while do_this:
                                        try:
                                            submission.reply(full_text)
                                            do_this = False
                                        except:
                                            # back off briefly instead of retrying in a tight loop
                                            sleep(5)
                                self.v.value = 2
                                break

                limit_val = 10
            except Exception as e:
                print(traceback.format_exc())
                if '503' in str(e):
                    print('503 from server')
                elif '401' in str(e):
                    print('401 from server')
                else:
                    with open('errs.txt', 'a') as f:
                        f.write(traceback.format_exc())


thread_count = 0
threads = []
for i in config.sub_settings:
    if i is not None:
        database.init_database(i[0], i[8])
        threads.append(FindPosts(i))
        if i[1] is not None or i[2] is not None or i[3] is not None:
            database.delete_old_loop(i)
        threads[thread_count].start()
        thread_count += 1

delete_comment()
for t in threads:
    t.join()
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
import os

client_id = os.environ['BOT_CLIENT_ID']
client_secret = os.environ['BOT_CLIENT_SECRET']
user_agent = os.environ['BOT_USER_AGENT']
username = os.environ['BOT_USERNAME']
password = os.environ['BOT_PASSWORD']
num_subs = int(os.environ['BOT_SUB_COUNT'])
sub_settings = [[
    os.environ['BOT_SUBREDDIT' + i],
    int(os.environ['BOT_TOP_DAYS' + i]) if 'BOT_TOP_DAYS' + i in os.environ else None,
    int(os.environ['BOT_HOT_DAYS' + i]) if 'BOT_HOT_DAYS' + i in os.environ else None,
    int(os.environ['BOT_NEW_DAYS' + i]) if 'BOT_NEW_DAYS' + i in os.environ else None,
    int(os.environ['BOT_TOP_NUM_POSTS' + i]) if 'BOT_TOP_NUM_POSTS' + i in os.environ else 1000,
    int(os.environ['BOT_HOT_NUM_POSTS' + i]) if 'BOT_HOT_NUM_POSTS' + i in os.environ else 1000,
    int(os.environ['BOT_NEW_NUM_POSTS' + i]) if 'BOT_NEW_NUM_POSTS' + i in os.environ else 1000,
    int(os.environ['BOT_THRESH' + i]) if 'BOT_THRESH' + i in os.environ else 5,
    # bool() of any non-empty string is True, so parse the flag explicitly
    os.environ['BOT_TEXT_IN_IMAGE' + i].lower() in ('1', 'true', 'yes') if 'BOT_TEXT_IN_IMAGE' + i in os.environ else False,
] for i in [str(x) for x in range(num_subs)]]
--------------------------------------------------------------------------------
/database.py:
--------------------------------------------------------------------------------
# packages that come with python
from datetime import timedelta, datetime
from calendar import monthrange
from urllib.request import Request, urlopen
from urllib.error import HTTPError
from io import BytesIO
import ssl
import sqlite3
from re import sub
from difflib import SequenceMatcher
import traceback

# packages that need to be pip installed
from PIL import Image
import dhash
from pytesseract import image_to_string
import av

from setInterval import setInterval

context = ssl._create_unverified_context()
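# How matching works, in brief: images, GIF frames and video frames are
# reduced to perceptual difference hashes with dhash.dhash_int(); two hashes
# are compared with dhash.get_num_bits_different(a, b), and a distance below
# the per-subreddit threshold (sub_settings[7], default 5) counts as a repost
# match. Text is compared with difflib.SequenceMatcher instead.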
user_agent = 'Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46'


def init_database(subreddit, is_text_in_image):
    conn = sqlite3.connect(
        'Posts{}.db'.format(
            sub(
                '([a-zA-Z])',
                lambda x: x.groups()[0].upper(),
                subreddit,
                1,
            )
        )
    )
    c = conn.cursor()
    c.execute(
        'CREATE TABLE IF NOT EXISTS Posts (Date INT, Content TEXT, ImageText TEXT, Url TEXT, Location TEXT, Author TEXT, Title TEXT);',
    )
    conn.commit()
    c.close()
    print('Created table.')


def canonical(s):
    return ''.join([c for c in s if not c.isspace()])


def is_int(s):
    try:
        int(s)
        return True
    except (TypeError, ValueError):
        return False


def month_delta(d1, d2):
    delta = 0
    while True:
        mdays = monthrange(d1.year, d1.month)[1]
        d1 += timedelta(days=mdays)
        if d1 <= d2:
            delta += 1
        else:
            break
    return delta


def hash_img(conn, img_url, url):
    img_hash = 'invalid'
    try:
        f = BytesIO(
            urlopen(
                Request(
                    str(img_url),
                    headers={
                        'User-Agent': user_agent
                    },
                ),
                context=context,
            ).read(),
        )
        img = Image.open(f)
        img_hash = dhash.dhash_int(img)
    except HTTPError:
        c = conn.cursor()
        c.execute(
            'DELETE FROM Posts WHERE Url = ?;',
            (
                str(url),
            ),
        )
        conn.commit()
        c.close()
    except:
        with open('dedLink.txt', 'a') as f:
            f.write('{}\n{}\n'.format(traceback.format_exc(), img_url))
        c = conn.cursor()
        c.execute(
            'DELETE FROM Posts WHERE Url = ?;',
            (
                str(url),
            ),
        )
        conn.commit()
        c.close()
    return img_hash


def extract_text(img_url, url):
    img_text = 'invalid'
    try:
        f = BytesIO(
            urlopen(
                Request(
                    str(img_url),
                    headers={
                        'User-Agent': user_agent
                    },
                ),
                context=context,
            ).read(),
        )
        img = Image.open(f)
        img_text = image_to_string(img).replace('\n', '').replace('\r', '').replace(' ', '')
    except Exception as e:
        if not isinstance(e, HTTPError):
            with open('tesseractErrs.txt', 'a') as f:
                f.write('{}\n{}\n'.format(traceback.format_exc(), img_url))
    return img_text


def hash_vid(conn, vid_url, url):
    vid_hash = ''
    try:
        container = av.open(vid_url['reddit_video']['fallback_url'])
        for frame in container.decode(video=0):
            vid_hash = '{}{} '.format(vid_hash, str(dhash.dhash_int(frame.to_image())))
    except Exception as e:
        if '403' in str(e):
            c = conn.cursor()
            c.execute(
                'DELETE FROM Posts WHERE Url = ?;',
                (
                    str(url),
                ),
            )
            conn.commit()
            c.close()
        else:
            with open('dedLink.txt', 'a') as f:
                f.write('{}\n{}\n'.format(traceback.format_exc(), vid_url))
            c = conn.cursor()
            c.execute(
                'DELETE FROM Posts WHERE Url = ?;',
                (
                    str(url),
                ),
            )
            conn.commit()
            c.close()
            vid_hash = 'invalid'
    return vid_hash


def hash_gif(conn, gif_url, url):
    gif_hash = ''
    nframes = 0
    try:
        f = BytesIO(
            urlopen(
                Request(
                    str(gif_url),
                    headers={'User-Agent': user_agent},
                ),
                context=context,
            ).read(),
        )
        frame = Image.open(f)
        while frame:
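            # hash every frame; Image.seek() advances through the GIF and
            # raises EOFError once the animation is exhausted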
            gif_hash = '{}{} '.format(gif_hash, str(dhash.dhash_int(frame)))
            nframes += 1
            try:
                frame.seek(nframes)
            except EOFError:
                break
    except HTTPError:
        c = conn.cursor()
        c.execute(
            'DELETE FROM Posts WHERE Url = ?;',
            (
                str(url),
            ),
        )
        conn.commit()
        c.close()
    except:
        with open('dedLink.txt', 'a') as f:
            f.write('{}\n{}\n'.format(traceback.format_exc(), url))
        c = conn.cursor()
        c.execute(
            'DELETE FROM Posts WHERE Url = ?;',
            (
                str(url),
            ),
        )
        conn.commit()
        c.close()
        gif_hash = 'invalid'
    return gif_hash


def hash_vid_difference(original_hash, new_hash):
    original_hash_list = original_hash.split()
    new_hash_list = new_hash.split()
    frame_differences = []
    min_differences = []
    for i in original_hash_list:
        for j in new_hash_list:
            frame_differences.append(dhash.get_num_bits_different(int(i), int(j)))
        min_differences.append(min(frame_differences))
        frame_differences = []
    return sum(min_differences)/len(min_differences)


def add_to_found(post, percentage, result, original_post_date, percentage_matched, author, title):
    result.append(post[0])
    original_post_date.append(post[1])
    author.append(post[2])
    title.append(post[3])
    percentage_matched.append(percentage)


def update_database(conn, url, update_val):
    c = conn.cursor()
    c.execute(
        'UPDATE Posts SET Location = ? WHERE Url = ?;',
        (
            str(update_val),
            str(url),
        ),
    )
    conn.commit()
    c.close()


@setInterval(86400)
def delete_old_loop(sub_settings):
    # app.py calls this with only the settings list
    conn = sqlite3.connect(
        'Posts{}.db'.format(
            sub(
                '([a-zA-Z])',
                lambda x: x.groups()[0].upper(),
                sub_settings[0],
                1,
            )
        )
    )
    c = conn.cursor()
    args = c.execute(
        'SELECT Date, Location FROM Posts;'
    )
    now = datetime.utcnow()
    for x in args.fetchall():
        then = datetime.fromtimestamp(x[0])
        time_passed = (now - then).days
        if ((sub_settings[1] is not None and time_passed > sub_settings[1] and x[1] == 'top')
                or (sub_settings[2] is not None and time_passed > sub_settings[2] and x[1] == 'hot')
                or (sub_settings[3] is not None and time_passed > sub_settings[3] and x[1] == 'new')):
            c.execute(
                'DELETE FROM Posts WHERE Date = ?;',
                (
                    int(x[0]),
                ),
            )
            conn.commit()
            print('deleted an old post')


def is_logged(content_url, media, text, url, date, top, hot, new, sub_settings, reddit):
    result = []
    original_post_date = []
    final_time_passed = []
    percentage_matched = []
    author = []
    title = []
    args = None
    posts_to_remove = []
    cntr = 0
    return_result = []

    conn = sqlite3.connect(
        'Posts{}.db'.format(
            sub(
                '([a-zA-Z])',
                lambda x: x.groups()[0].upper(),
                sub_settings[0],
                1,
            )
        )
    )
    c = conn.cursor()

    now = datetime.utcnow()
    then = datetime.fromtimestamp(date)
    time_passed = (now-then).days

    # ignore post if too old
    if ((sub_settings[1] is not None and time_passed > sub_settings[1] and top)
            or (sub_settings[2] is not None and time_passed > sub_settings[2] and hot)
            or (sub_settings[3] is not None and time_passed > sub_settings[3] and new)):
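        # ['delete', -1, ...] is a sentinel meaning "skip this submission";
        # app.py compares against [['delete', -1, -1, -1, -1, -1]]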
        result = ['delete']
        original_post_date = [-1]
        final_time_passed = [-1]
        percentage_matched = [-1]
        author = [-1]
        title = [-1]

    else:

        # check if post is already in database
        args = c.execute(
            'SELECT COUNT(1) FROM Posts WHERE Url = ?;',
            (
                str(url),
            ),
        )
        if list(args.fetchone())[0] != 0:
            args = c.execute(
                'SELECT Location FROM Posts WHERE Url = ?;',
                (
                    str(url),
                ),
            )
            full_result = list(args.fetchall())

            # make sure the post is in the right category
            for i in full_result:
                if i[0] != 'top' and top and (sub_settings[1] is None or (time_passed < sub_settings[1] and (sub_settings[2] is None or sub_settings[1] > sub_settings[2]) and (sub_settings[3] is None or sub_settings[1] > sub_settings[3]))):
                    update_database(conn, url, 'top')
                if i[0] != 'hot' and hot and (sub_settings[2] is None or (time_passed < sub_settings[2] and (sub_settings[1] is None or sub_settings[2] > sub_settings[1]) and (sub_settings[3] is None or sub_settings[2] > sub_settings[3]))):
                    update_database(conn, url, 'hot')
                if i[0] != 'new' and new and (sub_settings[3] is None or (time_passed < sub_settings[3] and (sub_settings[2] is None or sub_settings[3] > sub_settings[2]) and (sub_settings[1] is None or sub_settings[3] > sub_settings[1]))):
                    update_database(conn, url, 'new')

            # ignore post
            result = ['delete']
            original_post_date = [-1]
            final_time_passed = [-1]
            percentage_matched = [-1]
            author = [-1]
            title = [-1]

        # check if post is a repost
        else:

            # check for text
            if text != '​' and text != '' and text != '[removed]' and text != '[deleted]':
                args = c.execute(
                    'SELECT COUNT(1) FROM Posts WHERE Content = ?;',
                    (
                        str(text),
                    ),
                )
                if list(args.fetchone())[0] != 0:
                    args = c.execute(
                        'SELECT Url, Date, Author, Title FROM Posts WHERE Content = ?;',
                        (
                            str(text),
                        ),
                    )
                    full_result = list(args.fetchall())
                    for i in full_result:
                        add_to_found(
                            i,
                            100,
                            result,
                            original_post_date,
                            percentage_matched,
                            author,
                            title,
                        )
                args = c.execute(
                    'SELECT Url, Date, Author, Title, Content FROM Posts;',
                )
                for texts in args.fetchall():
                    if texts[0] not in result:
                        text_var = texts[4]
                        difference = SequenceMatcher(None, text_var, text).ratio()
                        if 10 - (difference * 10) < sub_settings[7]:
                            add_to_found(
                                texts,
                                difference * 100,
                                result,
                                original_post_date,
                                percentage_matched,
                                author,
                                title,
                            )

            # check for v.reddit
            elif media is not None and ('oembed' not in media or 'provider_name' not in media['oembed'] or (media['oembed']['provider_name'] != 'gfycat' and media['oembed']['provider_name'] != 'YouTube' and media['oembed']['provider_name'] != 'Imgur')):
                vid_hash = hash_vid(conn, media, url)
                if vid_hash == 'invalid':
                    result = ['delete']
                    original_post_date = [-1]
                    final_time_passed = [-1]
                    percentage_matched = [-1]
                    author = [-1]
                    title = [-1]
                if is_int(vid_hash.replace(' ', '')):
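                    # a video hash is a space-separated string of per-frame
                    # dhash values; hash_vid_difference() averages the best
                    # per-frame match distances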
                    args = c.execute(
                        'SELECT COUNT(1) FROM Posts WHERE Content = ?;',
                        (
                            str(vid_hash),
                        ),
                    )
                    if list(args.fetchone())[0] != 0:
                        args = c.execute(
                            'SELECT Url, Date, Author, Title FROM Posts WHERE Content = ?;',
                            (
                                str(vid_hash),
                            ),
                        )
                        full_result = list(args.fetchall())
                        for i in full_result:
                            add_to_found(
                                i,
                                100,
                                result,
                                original_post_date,
                                percentage_matched,
                                author,
                                title,
                            )
                    args = c.execute(
                        'SELECT Url, Date, Author, Title, Content FROM Posts;',
                    )
                    for hashed in args.fetchall():
                        if hashed[0] not in result:
                            # Content is the fifth selected column
                            hashed_readable = hashed[4]
                            if is_int(hashed_readable.replace(' ', '')):
                                hashed_difference = hash_vid_difference(
                                    hashed_readable, vid_hash)
                                if hashed_difference < sub_settings[7]:
                                    add_to_found(
                                        hashed,
                                        ((sub_settings[7] - hashed_difference)/sub_settings[7])*100,
                                        result,
                                        original_post_date,
                                        percentage_matched,
                                        author,
                                        title,
                                    )

            # check for image or gif
            elif content_url != '':
                args = c.execute(
                    'SELECT COUNT(1) FROM Posts WHERE Content = ?;',
                    (
                        str(content_url).replace(
                            '&feature=youtu.be',
                            '',
                        ),
                    ),
                )
                if list(args.fetchone())[0] != 0:
                    args = c.execute(
                        'SELECT Url, Date, Author, Title FROM Posts WHERE Content = ?;',
                        (
                            str(content_url).replace(
                                '&feature=youtu.be',
                                '',
                            ),
                        ),
                    )
                    full_result = list(args.fetchall())
                    for i in full_result:
                        add_to_found(
                            i,
                            100,
                            result,
                            original_post_date,
                            percentage_matched,
                            author,
                            title,
                        )

                # check for gif
                if 'gif' in content_url and not (content_url.endswith('gifv') or 'gifs' in content_url):
                    gif_hash = hash_gif(conn, content_url, url)
                    if gif_hash == 'invalid':
                        result = ['delete']
                        original_post_date = [-1]
                        final_time_passed = [-1]
                        percentage_matched = [-1]
                        author = [-1]
                        title = [-1]
                    if is_int(gif_hash.replace(' ', '')):
                        args = c.execute(
                            'SELECT COUNT(1) FROM Posts WHERE Content = ?;',
                            (
                                str(gif_hash),
                            ),
                        )
                        if list(args.fetchone())[0] != 0:
                            args = c.execute(
                                'SELECT Url, Date, Author, Title FROM Posts WHERE Content = ?;',
                                (
                                    str(gif_hash),
                                ),
                            )
                            full_result = list(args.fetchall())
                            for i in full_result:
                                add_to_found(
                                    i,
                                    100,
                                    result,
                                    original_post_date,
                                    percentage_matched,
                                    author,
                                    title,
                                )
                        args = c.execute(
                            'SELECT Url, Date, Author, Title, Content FROM Posts;'
                        )
                        for hashed in args.fetchall():
                            if hashed[0] not in result:
                                # Content is the fifth selected column
                                hashed_readable = hashed[4]
                                if is_int(hashed_readable.replace(' ', '')):
                                    hashed_difference = hash_vid_difference(
                                        hashed_readable, gif_hash)
                                    if hashed_difference < sub_settings[7]:
                                        add_to_found(
                                            hashed,
                                            ((sub_settings[7] - hashed_difference)/sub_settings[7])*100,
                                            result,
                                            original_post_date,
                                            percentage_matched,
                                            author,
                                            title,
                                        )

                # check for image
                elif 'png' in content_url or 'jpg' in content_url:
                    img_hash = hash_img(conn, content_url, url)
                    if img_hash == 'invalid':
                        result = ['delete']
                        original_post_date = [-1]
                        final_time_passed = [-1]
                        percentage_matched = [-1]
                        author = [-1]
                        title = [-1]
                    elif is_int(img_hash):
                        args = c.execute(
                            'SELECT COUNT(1) FROM Posts WHERE Content = ?;',
                            (
                                str(img_hash),
                            ),
                        )
                        if list(args.fetchone())[0] != 0:
                            args = c.execute(
                                'SELECT Url, Date, Author, Title FROM Posts WHERE Content = ?;',
                                (
                                    str(img_hash),
                                ),
                            )
                            full_result = list(args.fetchall())
                            for i in full_result:
                                add_to_found(
                                    i,
                                    100,
                                    result,
                                    original_post_date,
                                    percentage_matched,
                                    author,
                                    title,
                                )
                        args = c.execute(
                            'SELECT Url, Date, Author, Title, Content FROM Posts;'
                        )
                        for hashed in args.fetchall():
                            if hashed[0] not in result:
                                # Content is the fifth selected column
                                hashed_readable = hashed[4]
                                if is_int(hashed_readable):
                                    hashed_difference = dhash.get_num_bits_different(
                                        img_hash, int(hashed_readable))
                                    if hashed_difference < sub_settings[7]:
                                        add_to_found(
                                            hashed,
                                            ((sub_settings[7] - hashed_difference)/sub_settings[7])*100,
                                            result,
                                            original_post_date,
                                            percentage_matched,
                                            author,
                                            title,
                                        )
                    if sub_settings[8]:
                        img_text = extract_text(content_url, url)
                        if img_text != 'invalid' and img_text != '':
                            # OCR text is stored in the ImageText column, not Content
                            args = c.execute(
                                'SELECT COUNT(1) FROM Posts WHERE ImageText = ?;',
                                (
                                    str(img_text),
                                ),
                            )
                            if list(args.fetchone())[0] != 0:
                                args = c.execute(
                                    'SELECT Url, Date, Author, Title FROM Posts WHERE ImageText = ?;',
                                    (
                                        str(img_text),
                                    ),
                                )
                                full_result = list(args.fetchall())
                                for i in full_result:
                                    add_to_found(
                                        i,
                                        100,
                                        result,
                                        original_post_date,
                                        percentage_matched,
                                        author,
                                        title,
                                    )
                            args = c.execute(
                                'SELECT Url, Date, Author, Title, ImageText FROM Posts;'
                            )
                            for texts in args.fetchall():
                                if texts[0] not in result and texts[4] != '':
                                    text_var = texts[4]
                                    difference = SequenceMatcher(None, text_var, img_text).ratio()
                                    if 10 - (difference * 10) < sub_settings[7]:
                                        add_to_found(
                                            texts,
                                            difference * 100,
                                            result,
                                            original_post_date,
                                            percentage_matched,
                                            author,
                                            title,
                                        )

    # delete post if it has been deleted
    for i in result:
        if i != '' and i != 'delete':
            if reddit.submission(url='https://reddit.com{}'.format(i)).selftext == '[deleted]':
                c.execute(
                    'DELETE FROM Posts WHERE Url = ?;',
                    (
                        str(i),
                    ),
                )
                posts_to_remove.append([
                    i,
                    original_post_date[cntr],
                    percentage_matched[cntr],
                    author[cntr],
                    title[cntr],
                ])
                print('deleted {}'.format(i))
        cntr += 1

    # commit any pending deletions before closing the cursor
    conn.commit()
    c.close()

    for i in posts_to_remove:
        result.remove(i[0])
        original_post_date.remove(i[1])
        percentage_matched.remove(i[2])
        author.remove(i[3])
        title.remove(i[4])

    for i in original_post_date:
        then = datetime.fromtimestamp(i)
        time_passed = month_delta(then, now)
        full_text = '{} months ago'.format(str(time_passed))
        if time_passed < 1:
            time_passed = (now-then).days
            full_text = '{} days ago'.format(str(time_passed))
        if time_passed < 1:
            time_passed = int((now-then).total_seconds() // 3600)
            full_text = '{} hours ago'.format(str(time_passed))
        if time_passed < 1:
            time_passed = int((now-then).total_seconds() // 60)
            full_text = '{} minutes ago'.format(str(time_passed))
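        # fall through one more level, to seconds, if still under a minute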
        if time_passed < 1:
            time_passed = int((now-then).total_seconds())
            full_text = '{} seconds ago'.format(str(time_passed))
        final_time_passed.append(full_text)

    cntr = 0
    for i in result:
        return_result.append([
            i,
            final_time_passed[cntr],
            original_post_date[cntr],
            percentage_matched[cntr],
            author[cntr],
            title[cntr],
        ])
        cntr += 1

    if return_result != [['delete', -1, -1, -1, -1, -1]]:
        print('Found? {}'.format(return_result))

    return return_result


def add_post(date, content_url, media, url, text, author, title, top, hot, new, subreddit, is_text_in_image):
    conn = sqlite3.connect(
        'Posts{}.db'.format(
            sub(
                '([a-zA-Z])',
                lambda x: x.groups()[0].upper(),
                subreddit,
                1,
            )
        )
    )
    c = conn.cursor()
    img_text = ''
    if text != '​' and text != '' and text != '[removed]' and text != '[deleted]':
        content = text
    else:
        if media is not None and ('oembed' not in media or 'provider_name' not in media['oembed'] or (media['oembed']['provider_name'] != 'gfycat' and media['oembed']['provider_name'] != 'YouTube' and media['oembed']['provider_name'] != 'Imgur')):
            vid_hash = hash_vid(conn, media, url)
            if is_int(vid_hash.replace(' ', '')):
                content = vid_hash
            else:
                content = content_url
        elif 'gif' in content_url and not (content_url.endswith('gifv') or 'gifs' in content_url):
            gif_hash = hash_gif(conn, content_url, url)
            if is_int(gif_hash.replace(' ', '')):
                content = gif_hash
            else:
                content = content_url
        elif 'png' in content_url or 'jpg' in content_url:
            img_hash = hash_img(conn, content_url, url)
            if is_int(img_hash):
                content = img_hash
            else:
                content = content_url
            if is_text_in_image:
                img_text = extract_text(content_url, url)
                if img_text == 'invalid':
                    img_text = ''
        else:
            content = content_url
    if top:
        location_var = 'top'
    elif hot:
        location_var = 'hot'
    else:
        location_var = 'new'
    c.execute(
        'INSERT INTO Posts (Date, Content, ImageText, Url, Location, Author, Title) VALUES (?, ?, ?, ?, ?, ?, ?);',
        (
            int(date),
            str(content),
            str(img_text),
            str(url),
            str(location_var),
            str(author),
            str(title),
        ),
    )
    conn.commit()
    c.close()
    print('Added new post - {}'.format(str(url)))
    return int(date), str(content), str(url), str(location_var), str(author), str(title)

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
av>=0.4.1
dhash>=1.3
Pillow>=5.3.0
praw>=6.0.0
psaw>=0.0.7
pytesseract>=0.2.5

--------------------------------------------------------------------------------
/setInterval.py:
--------------------------------------------------------------------------------
import threading

def setInterval(interval):
    def decorator(function):
        def wrapper(*args, **kwargs):
            stopped = threading.Event()

            def loop():  # executed in another thread
                while not stopped.wait(interval):  # until stopped
                    function(*args, **kwargs)

            t = threading.Thread(target=loop)
            t.daemon = True  # stop if the program exits
            t.start()
            return stopped
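            # the caller receives the Event; set() it to stop the loop, e.g.:
            #   stopper = delete_comment()  # starts the 1800 s timer (app.py)
            #   stopper.set()               # no further runs after this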
        return wrapper
    return decorator

--------------------------------------------------------------------------------