├── .gitignore
├── Dockerfile
├── LICENSE
├── README.md
├── app.py
├── config.py
├── database.py
├── requirements.txt
└── setInterval.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
/.env

--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
FROM ubuntu:18.04

ADD app.py /
ADD config.py /
ADD database.py /
ADD setInterval.py /
ADD requirements.txt /

RUN \
  apt-get update && \
  apt-get -y upgrade

RUN apt-get install -y \
  python3 python3-pip python3-dev pkg-config \
  libavformat-dev libavcodec-dev libavdevice-dev \
  libavutil-dev libswscale-dev libswresample-dev libavfilter-dev

RUN pip3 install -r ./requirements.txt

ARG BOT_CLIENT_ID
ARG BOT_CLIENT_SECRET
ARG BOT_USER_AGENT
ARG BOT_USERNAME
ARG BOT_PASSWORD
ARG BOT_SUB_COUNT
ARG BOT_SUBREDDIT0

ENV BOT_CLIENT_ID=$BOT_CLIENT_ID
ENV BOT_CLIENT_SECRET=$BOT_CLIENT_SECRET
ENV BOT_USER_AGENT=$BOT_USER_AGENT
ENV BOT_USERNAME=$BOT_USERNAME
ENV BOT_PASSWORD=$BOT_PASSWORD
ENV BOT_SUB_COUNT=$BOT_SUB_COUNT
ENV BOT_SUBREDDIT0=$BOT_SUBREDDIT0

CMD [ "python3", "./app.py" ]
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 Ali Abdoli

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Repost Checker

# Overview
This bot detects reposts of any kind (link, text, or picture) where the original post has not been deleted, reports the repost, and leaves a comment with details about the original post. It can be configured to work with any subreddit's rules.

# Setup
1. Download Python 3.5+.
2. Download or clone the repository.
3. Run `pip3 install -r requirements.txt` to install the dependencies.
4. Set the environment variables read by the config file with your bot [credentials](https://github.com/reddit-archive/reddit/wiki/OAuth2) and subreddit, as shown in the example below.
5. Run the bot with `python3 app.py`.
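A minimal example of those variables, e.g. in a `.env` file (which `.gitignore` already excludes from version control). All values below are hypothetical placeholders; the names follow the `BOT_*` scheme used by the Dockerfile and config.py, where the trailing index (`0` here) selects the first configured subreddit:

```
BOT_CLIENT_ID=your_app_client_id
BOT_CLIENT_SECRET=your_app_client_secret
BOT_USER_AGENT=RepostCheckerBot/1.0 by u/your_username
BOT_USERNAME=your_bot_account
BOT_PASSWORD=your_bot_password
BOT_SUB_COUNT=1
BOT_SUBREDDIT0=your_test_subreddit
BOT_THRESH0=5
BOT_TEXT_IN_IMAGE0=true
```

Optional per-subreddit variables (see config.py) include `BOT_TOP_DAYS0`, `BOT_HOT_DAYS0`, `BOT_NEW_DAYS0`, `BOT_TOP_NUM_POSTS0`, `BOT_HOT_NUM_POSTS0` and `BOT_NEW_NUM_POSTS0`.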
# Dependencies
See requirements.txt.

# Contribution
Feel free to fork the repository and tackle any issues. You may also open new issues.

# Testing the Bot on Reddit
1) Create a new subreddit and add whatever account the bot will post on behalf of as a moderator.
2) Head to reddit.com/prefs/apps, create a new app, and choose "script" from the radio buttons.
3) Fill out the config file with the data shown on that page after submission.

# Subreddits using the bot
[r/ihadastroke](https://www.reddit.com/r/ihadastroke/)
[r/ProgrammerHumor](https://www.reddit.com/r/ProgrammerHumor)
Keep in mind the bot is not running at the moment because of ongoing changes.

If you are using the bot and your subreddit is not listed above, please open an issue.

--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
# packages that need to be pip installed
import praw
from psaw import PushshiftAPI

# packages that come with python
import traceback
from multiprocessing import Process, Value
from time import sleep

# other files
import config
import database
from setInterval import setInterval

rows = []
reddit = praw.Reddit(client_id=config.client_id,
                     client_secret=config.client_secret,
                     username=config.username,
                     password=config.password,
                     user_agent=config.user_agent)
api = PushshiftAPI(reddit)


@setInterval(1800)
def delete_comment():
    try:
        for comment in reddit.redditor('RepostCheckerBot').comments.new(limit=50):
            if comment.score < -1:
                with open('fails.txt', 'a') as f:
                    f.write(str(comment.body))
                comment.delete()

    except Exception as e:
        print(e)
        print(repr(e))
        if '503' in str(e):
            print('503 from server')
        elif '504' in str(e):
            print('504 from server')
        elif '401' in str(e):
            print('401 from server')
        else:
            with open('errs.txt', 'a') as f:
                f.write('{}\n'.format(traceback.format_exc()))


# the main worker; one process per configured subreddit
class FindPosts(Process):
    def __init__(self, sub_settings):
        # Constructor.
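        # sub_settings layout (built in config.py):
        #   [0] subreddit name, [1]-[3] max post age in days for top/hot/new,
        #   [4]-[6] post limits for top/hot/new, [7] match threshold,
        #   [8] whether to OCR text out of images
        # self.v is a shared turn-taking token between the two scans:
        #   1 = the "new" scan may handle a post, 2 = the "top" scan may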
        Process.__init__(self)
        self.sub_settings = sub_settings
        self.v = Value('i', 0)

    def run(self):
        Process(target=self.find_top_posts).start()
        self.find_new_posts()

    def find_top_posts(self):
        subreddit = reddit.subreddit(self.sub_settings[0])
        print(self.sub_settings)
        new = False
        first_time = True
        print('Starting searching...')
        while True:
            try:
                post = 0
                # walk the subreddit's submission history via Pushshift
                for submission in api.search_submissions(subreddit=subreddit):
                    while True:
                        if (self.v.value != 0) or first_time:
                            try:
                                x = self.v.value
                            except IndexError as e:
                                if 'deque index out of range' not in str(e):
                                    raise IndexError(e)
                            if first_time or (x is not None and x == 2):
                                first_time = False
                                top = True
                                hot = False
                                post += 1
                                result = database.is_logged(
                                    submission.url,
                                    submission.media,
                                    submission.selftext,
                                    submission.permalink,
                                    submission.created_utc,
                                    top,
                                    hot,
                                    new,
                                    self.sub_settings,
                                    reddit,
                                )

                                if result != [['delete', -1, -1, -1, -1, -1]] and (result == [] or submission.created_utc != result[0][2]):
                                    rows.append(database.add_post(
                                        submission.created_utc,
                                        submission.url,
                                        submission.media,
                                        submission.permalink,
                                        submission.selftext,
                                        submission.author,
                                        submission.title,
                                        top,
                                        hot,
                                        new,
                                        self.sub_settings[0],
                                        self.sub_settings[8],
                                    ))
                                    print('{} --> Added {}'.format(
                                        post,
                                        submission.permalink,
                                    ))
                                self.v.value = 1
                                break

            except Exception as e:
                print(traceback.format_exc())
                if '503' in str(e):
                    print('503 from server')
                elif '401' in str(e):
                    print('401 from server')
                else:
                    with open('errs.txt', 'a') as f:
                        f.write(traceback.format_exc())

    def find_new_posts(self):
        subreddit = reddit.subreddit(self.sub_settings[0])
        top = False
        hot = False
        new = True
        limit_val = self.sub_settings[6]
        while True:
            try:
                post = 0
                # fetch submissions from the subreddit's new feed
                # (sub_settings[6] posts on the first pass, 10 afterwards)
                for submission in api.search_submissions(subreddit=subreddit, limit=limit_val):
                    while True:
                        if self.v.value != 0:
                            try:
                                x = self.v.value
                            except IndexError as e:
                                if 'deque index out of range' not in str(e):
                                    raise IndexError(e)
                            if x is not None and x == 1:
                                post += 1
                                result = database.is_logged(
                                    submission.url,
                                    submission.media,
                                    submission.selftext,
                                    submission.permalink,
                                    submission.created_utc,
                                    top,
                                    hot,
                                    new,
                                    self.sub_settings,
                                    reddit,
                                )
                                if result != [['delete', -1, -1, -1, -1, -1]] and (result == [] or submission.created_utc != result[0][2]):
                                    rows.append(database.add_post(
                                        submission.created_utc,
                                        submission.url,
                                        submission.media,
                                        submission.permalink,
                                        submission.selftext,
                                        submission.author,
                                        submission.title,
                                        top,
                                        hot,
                                        new,
                                        self.sub_settings[0],
                                        self.sub_settings[8],
                                    ))
                                    print('{} --> Added {}'.format(
                                        post,
                                        submission.permalink,
                                    ))

                                if result != [] and result != [['delete', -1, -1, -1, -1, -1]]:
                                    print('reported')
                                    # report and make a comment
                                    submission.report('REPOST ALERT')
                                    cntr = 0
                                    table = ''
                                    for i in result:
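                                        # each row of result is:
                                        # [permalink, age string, created_utc,
                                        #  match %, author, title]
                                        # (the return_result layout built in
                                        #  database.is_logged)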
                                        table = '{}{}|[{}](https://reddit.com{})|{}|{}%|{}\n'.format(
                                            table,
                                            str(cntr),
                                            i[5],
                                            i[0],
                                            i[1],
                                            str(i[3]),
                                            i[4],
                                        )
                                        cntr += 1
                                    full_text = 'I have detected that this may be a repost: \n' + \
                                        '\nNum|Post|Date|Match|Author\n:--:|:--:|:--:|:--:|:--:\n{}'.format(table) + \
                                        '\n*Beep Boop* I am a bot | [Source](https://github.com/xXAligatorXx/repostChecker)' + \
                                        '| Contact u/XXAligatorXx for inquiries | The bot will delete its message at -2 score'
                                    do_this = True
                                    while do_this:
                                        try:
                                            submission.reply(full_text)
                                            do_this = False
                                        except:
                                            # back off briefly instead of retrying in a tight loop
                                            sleep(5)
                                self.v.value = 2
                                break

                limit_val = 10
            except Exception as e:
                print(traceback.format_exc())
                if '503' in str(e):
                    print('503 from server')
                elif '401' in str(e):
                    print('401 from server')
                else:
                    with open('errs.txt', 'a') as f:
                        f.write(traceback.format_exc())


thread_count = 0
threads = []
for i in config.sub_settings:
    if i is not None:
        database.init_database(i[0], i[8])
        threads.append(FindPosts(i))
        if i[1] is not None or i[2] is not None or i[3] is not None:
            database.delete_old_loop(i)
        threads[thread_count].start()
        thread_count += 1

delete_comment()
for t in threads:
    t.join()
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
import os

client_id = os.environ['BOT_CLIENT_ID']
client_secret = os.environ['BOT_CLIENT_SECRET']
user_agent = os.environ['BOT_USER_AGENT']
username = os.environ['BOT_USERNAME']
password = os.environ['BOT_PASSWORD']
num_subs = int(os.environ['BOT_SUB_COUNT'])
sub_settings = [[
    os.environ['BOT_SUBREDDIT' + i],
    int(os.environ['BOT_TOP_DAYS' + i]) if 'BOT_TOP_DAYS' + i in os.environ else None,
    int(os.environ['BOT_HOT_DAYS' + i]) if 'BOT_HOT_DAYS' + i in os.environ else None,
    int(os.environ['BOT_NEW_DAYS' + i]) if 'BOT_NEW_DAYS' + i in os.environ else None,
    int(os.environ['BOT_TOP_NUM_POSTS' + i]) if 'BOT_TOP_NUM_POSTS' + i in os.environ else 1000,
    int(os.environ['BOT_HOT_NUM_POSTS' + i]) if 'BOT_HOT_NUM_POSTS' + i in os.environ else 1000,
    int(os.environ['BOT_NEW_NUM_POSTS' + i]) if 'BOT_NEW_NUM_POSTS' + i in os.environ else 1000,
    int(os.environ['BOT_THRESH' + i]) if 'BOT_THRESH' + i in os.environ else 5,
    # bool() of any non-empty string is True, so parse the flag explicitly
    os.environ['BOT_TEXT_IN_IMAGE' + i].lower() in ('1', 'true', 'yes') if 'BOT_TEXT_IN_IMAGE' + i in os.environ else False,
] for i in [str(x) for x in range(num_subs)]]
--------------------------------------------------------------------------------
/database.py:
--------------------------------------------------------------------------------
# packages that come with python
from datetime import timedelta, datetime
from calendar import monthrange
from urllib.request import Request, urlopen
from urllib.error import HTTPError
from io import BytesIO
import ssl
import sqlite3
from re import sub
from difflib import SequenceMatcher
import traceback

# packages that need to be pip installed
from PIL import Image
import dhash
from pytesseract import image_to_string
import av

from setInterval import setInterval

context = ssl._create_unverified_context()
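# How matching works, in brief: images, GIF frames and video frames are
# reduced to perceptual difference hashes with dhash.dhash_int(); two hashes
# are compared with dhash.get_num_bits_different(a, b), and a distance below
# the per-subreddit threshold (sub_settings[7], default 5) counts as a repost
# match. Text is compared with difflib.SequenceMatcher instead.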
user_agent = 'Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46'


def init_database(subreddit, is_text_in_image):
    conn = sqlite3.connect(
        'Posts{}.db'.format(
            sub(
                '([a-zA-Z])',
                lambda x: x.groups()[0].upper(),
                subreddit,
                1,
            )
        )
    )
    c = conn.cursor()
    c.execute(
        'CREATE TABLE IF NOT EXISTS Posts (Date INT, Content TEXT, ImageText TEXT, Url TEXT, Location TEXT, Author TEXT, Title TEXT);',
    )
    conn.commit()
    c.close()
    print('Created table.')


def canonical(s):
    return ''.join([c for c in s if not c.isspace()])


def is_int(s):
    try:
        int(s)
        return True
    except (TypeError, ValueError):
        return False


def month_delta(d1, d2):
    delta = 0
    while True:
        mdays = monthrange(d1.year, d1.month)[1]
        d1 += timedelta(days=mdays)
        if d1 <= d2:
            delta += 1
        else:
            break
    return delta


def hash_img(conn, img_url, url):
    img_hash = 'invalid'
    try:
        f = BytesIO(
            urlopen(
                Request(
                    str(img_url),
                    headers={
                        'User-Agent': user_agent
                    },
                ),
                context=context,
            ).read(),
        )
        img = Image.open(f)
        img_hash = dhash.dhash_int(img)
    except HTTPError:
        c = conn.cursor()
        c.execute(
            'DELETE FROM Posts WHERE Url = ?;',
            (
                str(url),
            ),
        )
        conn.commit()
        c.close()
    except:
        with open('dedLink.txt', 'a') as f:
            f.write('{}\n{}\n'.format(traceback.format_exc(), img_url))
        c = conn.cursor()
        c.execute(
            'DELETE FROM Posts WHERE Url = ?;',
            (
                str(url),
            ),
        )
        conn.commit()
        c.close()
    return img_hash


def extract_text(img_url, url):
    img_text = 'invalid'
    try:
        f = BytesIO(
            urlopen(
                Request(
                    str(img_url),
                    headers={
                        'User-Agent': user_agent
                    },
                ),
                context=context,
            ).read(),
        )
        img = Image.open(f)
        img_text = image_to_string(img).replace('\n', '').replace('\r', '').replace(' ', '')
    except Exception as e:
        if not isinstance(e, HTTPError):
            with open('tesseractErrs.txt', 'a') as f:
                f.write('{}\n{}\n'.format(traceback.format_exc(), img_url))
    return img_text


def hash_vid(conn, vid_url, url):
    vid_hash = ''
    try:
        container = av.open(vid_url['reddit_video']['fallback_url'])
        for frame in container.decode(video=0):
            vid_hash = '{}{} '.format(vid_hash, str(dhash.dhash_int(frame.to_image())))
    except Exception as e:
        if '403' in str(e):
            c = conn.cursor()
            c.execute(
                'DELETE FROM Posts WHERE Url = ?;',
                (
                    str(url),
                ),
            )
            conn.commit()
            c.close()
        else:
            with open('dedLink.txt', 'a') as f:
                f.write('{}\n{}\n'.format(traceback.format_exc(), vid_url))
            c = conn.cursor()
            c.execute(
                'DELETE FROM Posts WHERE Url = ?;',
                (
                    str(url),
                ),
            )
            conn.commit()
            c.close()
            vid_hash = 'invalid'
    return vid_hash


def hash_gif(conn, gif_url, url):
    gif_hash = ''
    nframes = 0
    try:
        f = BytesIO(
            urlopen(
                Request(
                    str(gif_url),
                    headers={'User-Agent': user_agent},
                ),
                context=context,
            ).read(),
        )
        frame = Image.open(f)
        while frame:
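            # hash every frame; Image.seek() advances through the GIF and
            # raises EOFError once the animation is exhausted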
            gif_hash = '{}{} '.format(gif_hash, str(dhash.dhash_int(frame)))
            nframes += 1
            try:
                frame.seek(nframes)
            except EOFError:
                break
    except HTTPError:
        c = conn.cursor()
        c.execute(
            'DELETE FROM Posts WHERE Url = ?;',
            (
                str(url),
            ),
        )
        conn.commit()
        c.close()
    except:
        with open('dedLink.txt', 'a') as f:
            f.write('{}\n{}\n'.format(traceback.format_exc(), url))
        c = conn.cursor()
        c.execute(
            'DELETE FROM Posts WHERE Url = ?;',
            (
                str(url),
            ),
        )
        conn.commit()
        c.close()
        gif_hash = 'invalid'
    return gif_hash


def hash_vid_difference(original_hash, new_hash):
    original_hash_list = original_hash.split()
    new_hash_list = new_hash.split()
    frame_differences = []
    min_differences = []
    for i in original_hash_list:
        for j in new_hash_list:
            frame_differences.append(dhash.get_num_bits_different(int(i), int(j)))
        min_differences.append(min(frame_differences))
        frame_differences = []
    return sum(min_differences)/len(min_differences)


def add_to_found(post, percentage, result, original_post_date, percentage_matched, author, title):
    result.append(post[0])
    original_post_date.append(post[1])
    author.append(post[2])
    title.append(post[3])
    percentage_matched.append(percentage)


def update_database(conn, url, update_val):
    c = conn.cursor()
    c.execute(
        'UPDATE Posts SET Location = ? WHERE Url = ?;',
        (
            str(update_val),
            str(url),
        ),
    )
    conn.commit()
    c.close()


@setInterval(86400)
def delete_old_loop(sub_settings):
    # app.py calls this with only the settings list
    conn = sqlite3.connect(
        'Posts{}.db'.format(
            sub(
                '([a-zA-Z])',
                lambda x: x.groups()[0].upper(),
                sub_settings[0],
                1,
            )
        )
    )
    c = conn.cursor()
    args = c.execute(
        'SELECT Date, Location FROM Posts;'
    )
    now = datetime.utcnow()
    for x in args.fetchall():
        then = datetime.fromtimestamp(x[0])
        time_passed = (now - then).days
        if ((sub_settings[1] is not None and time_passed > sub_settings[1] and x[1] == 'top')
                or (sub_settings[2] is not None and time_passed > sub_settings[2] and x[1] == 'hot')
                or (sub_settings[3] is not None and time_passed > sub_settings[3] and x[1] == 'new')):
            c.execute(
                'DELETE FROM Posts WHERE Date = ?;',
                (
                    int(x[0]),
                ),
            )
            conn.commit()
            print('deleted an old post')


def is_logged(content_url, media, text, url, date, top, hot, new, sub_settings, reddit):
    result = []
    original_post_date = []
    final_time_passed = []
    percentage_matched = []
    author = []
    title = []
    args = None
    posts_to_remove = []
    cntr = 0
    return_result = []

    conn = sqlite3.connect(
        'Posts{}.db'.format(
            sub(
                '([a-zA-Z])',
                lambda x: x.groups()[0].upper(),
                sub_settings[0],
                1,
            )
        )
    )
    c = conn.cursor()

    now = datetime.utcnow()
    then = datetime.fromtimestamp(date)
    time_passed = (now-then).days

    # ignore post if too old
    if ((sub_settings[1] is not None and time_passed > sub_settings[1] and top)
            or (sub_settings[2] is not None and time_passed > sub_settings[2] and hot)
            or (sub_settings[3] is not None and time_passed > sub_settings[3] and new)):
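        # ['delete', -1, ...] is a sentinel meaning "skip this submission";
        # app.py compares against [['delete', -1, -1, -1, -1, -1]]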
        result = ['delete']
        original_post_date = [-1]
        final_time_passed = [-1]
        percentage_matched = [-1]
        author = [-1]
        title = [-1]

    else:

        # check if post is already in database
        args = c.execute(
            'SELECT COUNT(1) FROM Posts WHERE Url = ?;',
            (
                str(url),
            ),
        )
        if list(args.fetchone())[0] != 0:
            args = c.execute(
                'SELECT Location FROM Posts WHERE Url = ?;',
                (
                    str(url),
                ),
            )
            full_result = list(args.fetchall())

            # make sure the post is in the right category
            for i in full_result:
                if i[0] != 'top' and top and (sub_settings[1] is None or (time_passed < sub_settings[1] and (sub_settings[2] is None or sub_settings[1] > sub_settings[2]) and (sub_settings[3] is None or sub_settings[1] > sub_settings[3]))):
                    update_database(conn, url, 'top')
                if i[0] != 'hot' and hot and (sub_settings[2] is None or (time_passed < sub_settings[2] and (sub_settings[1] is None or sub_settings[2] > sub_settings[1]) and (sub_settings[3] is None or sub_settings[2] > sub_settings[3]))):
                    update_database(conn, url, 'hot')
                if i[0] != 'new' and new and (sub_settings[3] is None or (time_passed < sub_settings[3] and (sub_settings[2] is None or sub_settings[3] > sub_settings[2]) and (sub_settings[1] is None or sub_settings[3] > sub_settings[1]))):
                    update_database(conn, url, 'new')

            # ignore post
            result = ['delete']
            original_post_date = [-1]
            final_time_passed = [-1]
            percentage_matched = [-1]
            author = [-1]
            title = [-1]

        # check if post is a repost
        else:

            # check for text
            if text != '​' and text != '' and text != '[removed]' and text != '[deleted]':
                args = c.execute(
                    'SELECT COUNT(1) FROM Posts WHERE Content = ?;',
                    (
                        str(text),
                    ),
                )
                if list(args.fetchone())[0] != 0:
                    args = c.execute(
                        'SELECT Url, Date, Author, Title FROM Posts WHERE Content = ?;',
                        (
                            str(text),
                        ),
                    )
                    full_result = list(args.fetchall())
                    for i in full_result:
                        add_to_found(
                            i,
                            100,
                            result,
                            original_post_date,
                            percentage_matched,
                            author,
                            title,
                        )
                args = c.execute(
                    'SELECT Url, Date, Author, Title, Content FROM Posts;',
                )
                for texts in args.fetchall():
                    if texts[0] not in result:
                        text_var = texts[4]
                        difference = SequenceMatcher(None, text_var, text).ratio()
                        if 10 - (difference * 10) < sub_settings[7]:
                            add_to_found(
                                texts,
                                difference * 100,
                                result,
                                original_post_date,
                                percentage_matched,
                                author,
                                title,
                            )

            # check for v.reddit
            elif media is not None and ('oembed' not in media or 'provider_name' not in media['oembed'] or (media['oembed']['provider_name'] != 'gfycat' and media['oembed']['provider_name'] != 'YouTube' and media['oembed']['provider_name'] != 'Imgur')):
                vid_hash = hash_vid(conn, media, url)
                if vid_hash == 'invalid':
                    result = ['delete']
                    original_post_date = [-1]
                    final_time_passed = [-1]
                    percentage_matched = [-1]
                    author = [-1]
                    title = [-1]
                if is_int(vid_hash.replace(' ', '')):
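                    # a video hash is a space-separated string of per-frame
                    # dhash values; hash_vid_difference() averages the best
                    # per-frame match distances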
                    args = c.execute(
                        'SELECT COUNT(1) FROM Posts WHERE Content = ?;',
                        (
                            str(vid_hash),
                        ),
                    )
                    if list(args.fetchone())[0] != 0:
                        args = c.execute(
                            'SELECT Url, Date, Author, Title FROM Posts WHERE Content = ?;',
                            (
                                str(vid_hash),
                            ),
                        )
                        full_result = list(args.fetchall())
                        for i in full_result:
                            add_to_found(
                                i,
                                100,
                                result,
                                original_post_date,
                                percentage_matched,
                                author,
                                title,
                            )
                    args = c.execute(
                        'SELECT Url, Date, Author, Title, Content FROM Posts;',
                    )
                    for hashed in args.fetchall():
                        if hashed[0] not in result:
                            # Content is the fifth selected column
                            hashed_readable = hashed[4]
                            if is_int(hashed_readable.replace(' ', '')):
                                hashed_difference = hash_vid_difference(
                                    hashed_readable, vid_hash)
                                if hashed_difference < sub_settings[7]:
                                    add_to_found(
                                        hashed,
                                        ((sub_settings[7] - hashed_difference)/sub_settings[7])*100,
                                        result,
                                        original_post_date,
                                        percentage_matched,
                                        author,
                                        title,
                                    )

            # check for image or gif
            elif content_url != '':
                args = c.execute(
                    'SELECT COUNT(1) FROM Posts WHERE Content = ?;',
                    (
                        str(content_url).replace(
                            '&feature=youtu.be',
                            '',
                        ),
                    ),
                )
                if list(args.fetchone())[0] != 0:
                    args = c.execute(
                        'SELECT Url, Date, Author, Title FROM Posts WHERE Content = ?;',
                        (
                            str(content_url).replace(
                                '&feature=youtu.be',
                                '',
                            ),
                        ),
                    )
                    full_result = list(args.fetchall())
                    for i in full_result:
                        add_to_found(
                            i,
                            100,
                            result,
                            original_post_date,
                            percentage_matched,
                            author,
                            title,
                        )

                # check for gif
                if 'gif' in content_url and not (content_url.endswith('gifv') or 'gifs' in content_url):
                    gif_hash = hash_gif(conn, content_url, url)
                    if gif_hash == 'invalid':
                        result = ['delete']
                        original_post_date = [-1]
                        final_time_passed = [-1]
                        percentage_matched = [-1]
                        author = [-1]
                        title = [-1]
                    if is_int(gif_hash.replace(' ', '')):
                        args = c.execute(
                            'SELECT COUNT(1) FROM Posts WHERE Content = ?;',
                            (
                                str(gif_hash),
                            ),
                        )
                        if list(args.fetchone())[0] != 0:
                            args = c.execute(
                                'SELECT Url, Date, Author, Title FROM Posts WHERE Content = ?;',
                                (
                                    str(gif_hash),
                                ),
                            )
                            full_result = list(args.fetchall())
                            for i in full_result:
                                add_to_found(
                                    i,
                                    100,
                                    result,
                                    original_post_date,
                                    percentage_matched,
                                    author,
                                    title,
                                )
                        args = c.execute(
                            'SELECT Url, Date, Author, Title, Content FROM Posts;'
                        )
                        for hashed in args.fetchall():
                            if hashed[0] not in result:
                                # Content is the fifth selected column
                                hashed_readable = hashed[4]
                                if is_int(hashed_readable.replace(' ', '')):
                                    hashed_difference = hash_vid_difference(
                                        hashed_readable, gif_hash)
                                    if hashed_difference < sub_settings[7]:
                                        add_to_found(
                                            hashed,
                                            ((sub_settings[7] - hashed_difference)/sub_settings[7])*100,
                                            result,
                                            original_post_date,
                                            percentage_matched,
                                            author,
                                            title,
                                        )

                # check for image
                elif 'png' in content_url or 'jpg' in content_url:
                    img_hash = hash_img(conn, content_url, url)
                    if img_hash == 'invalid':
                        result = ['delete']
                        original_post_date = [-1]
                        final_time_passed = [-1]
                        percentage_matched = [-1]
                        author = [-1]
                        title = [-1]
                    elif is_int(img_hash):
                        args = c.execute(
                            'SELECT COUNT(1) FROM Posts WHERE Content = ?;',
                            (
                                str(img_hash),
                            ),
                        )
                        if list(args.fetchone())[0] != 0:
                            args = c.execute(
                                'SELECT Url, Date, Author, Title FROM Posts WHERE Content = ?;',
                                (
                                    str(img_hash),
                                ),
                            )
                            full_result = list(args.fetchall())
                            for i in full_result:
                                add_to_found(
                                    i,
                                    100,
                                    result,
                                    original_post_date,
                                    percentage_matched,
                                    author,
                                    title,
                                )
                        args = c.execute(
                            'SELECT Url, Date, Author, Title, Content FROM Posts;'
                        )
                        for hashed in args.fetchall():
                            if hashed[0] not in result:
                                # Content is the fifth selected column
                                hashed_readable = hashed[4]
                                if is_int(hashed_readable):
                                    hashed_difference = dhash.get_num_bits_different(
                                        img_hash, int(hashed_readable))
                                    if hashed_difference < sub_settings[7]:
                                        add_to_found(
                                            hashed,
                                            ((sub_settings[7] - hashed_difference)/sub_settings[7])*100,
                                            result,
                                            original_post_date,
                                            percentage_matched,
                                            author,
                                            title,
                                        )
                    if sub_settings[8]:
                        img_text = extract_text(content_url, url)
                        if img_text != 'invalid' and img_text != '':
                            # OCR text is stored in the ImageText column, not Content
                            args = c.execute(
                                'SELECT COUNT(1) FROM Posts WHERE ImageText = ?;',
                                (
                                    str(img_text),
                                ),
                            )
                            if list(args.fetchone())[0] != 0:
                                args = c.execute(
                                    'SELECT Url, Date, Author, Title FROM Posts WHERE ImageText = ?;',
                                    (
                                        str(img_text),
                                    ),
                                )
                                full_result = list(args.fetchall())
                                for i in full_result:
                                    add_to_found(
                                        i,
                                        100,
                                        result,
                                        original_post_date,
                                        percentage_matched,
                                        author,
                                        title,
                                    )
                            args = c.execute(
                                'SELECT Url, Date, Author, Title, ImageText FROM Posts;'
                            )
                            for texts in args.fetchall():
                                if texts[0] not in result and texts[4] != '':
                                    text_var = texts[4]
                                    difference = SequenceMatcher(None, text_var, img_text).ratio()
                                    if 10 - (difference * 10) < sub_settings[7]:
                                        add_to_found(
                                            texts,
                                            difference * 100,
                                            result,
                                            original_post_date,
                                            percentage_matched,
                                            author,
                                            title,
                                        )

    # delete post if it has been deleted
    for i in result:
        if i != '' and i != 'delete':
            if reddit.submission(url='https://reddit.com{}'.format(i)).selftext == '[deleted]':
                c.execute(
                    'DELETE FROM Posts WHERE Url = ?;',
                    (
                        str(i),
                    ),
                )
                posts_to_remove.append([
                    i,
                    original_post_date[cntr],
                    percentage_matched[cntr],
                    author[cntr],
                    title[cntr],
                ])
                print('deleted {}'.format(i))
        cntr += 1

    # commit any pending deletions before closing the cursor
    conn.commit()
    c.close()

    for i in posts_to_remove:
        result.remove(i[0])
        original_post_date.remove(i[1])
        percentage_matched.remove(i[2])
        author.remove(i[3])
        title.remove(i[4])

    for i in original_post_date:
        then = datetime.fromtimestamp(i)
        time_passed = month_delta(then, now)
        full_text = '{} months ago'.format(str(time_passed))
        if time_passed < 1:
            time_passed = (now-then).days
            full_text = '{} days ago'.format(str(time_passed))
        if time_passed < 1:
            time_passed = int((now-then).total_seconds() // 3600)
            full_text = '{} hours ago'.format(str(time_passed))
        if time_passed < 1:
            time_passed = int((now-then).total_seconds() // 60)
            full_text = '{} minutes ago'.format(str(time_passed))
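        # fall through one more level, to seconds, if still under a minute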
        if time_passed < 1:
            time_passed = int((now-then).total_seconds())
            full_text = '{} seconds ago'.format(str(time_passed))
        final_time_passed.append(full_text)

    cntr = 0
    for i in result:
        return_result.append([
            i,
            final_time_passed[cntr],
            original_post_date[cntr],
            percentage_matched[cntr],
            author[cntr],
            title[cntr],
        ])
        cntr += 1

    if return_result != [['delete', -1, -1, -1, -1, -1]]:
        print('Found? {}'.format(return_result))

    return return_result


def add_post(date, content_url, media, url, text, author, title, top, hot, new, subreddit, is_text_in_image):
    conn = sqlite3.connect(
        'Posts{}.db'.format(
            sub(
                '([a-zA-Z])',
                lambda x: x.groups()[0].upper(),
                subreddit,
                1,
            )
        )
    )
    c = conn.cursor()
    img_text = ''
    if text != '​' and text != '' and text != '[removed]' and text != '[deleted]':
        content = text
    else:
        if media is not None and ('oembed' not in media or 'provider_name' not in media['oembed'] or (media['oembed']['provider_name'] != 'gfycat' and media['oembed']['provider_name'] != 'YouTube' and media['oembed']['provider_name'] != 'Imgur')):
            vid_hash = hash_vid(conn, media, url)
            if is_int(vid_hash.replace(' ', '')):
                content = vid_hash
            else:
                content = content_url
        elif 'gif' in content_url and not (content_url.endswith('gifv') or 'gifs' in content_url):
            gif_hash = hash_gif(conn, content_url, url)
            if is_int(gif_hash.replace(' ', '')):
                content = gif_hash
            else:
                content = content_url
        elif 'png' in content_url or 'jpg' in content_url:
            img_hash = hash_img(conn, content_url, url)
            if is_int(img_hash):
                content = img_hash
            else:
                content = content_url
            if is_text_in_image:
                img_text = extract_text(content_url, url)
                if img_text == 'invalid':
                    img_text = ''
        else:
            content = content_url
    if top:
        location_var = 'top'
    elif hot:
        location_var = 'hot'
    else:
        location_var = 'new'
    c.execute(
        'INSERT INTO Posts (Date, Content, ImageText, Url, Location, Author, Title) VALUES (?, ?, ?, ?, ?, ?, ?);',
        (
            int(date),
            str(content),
            str(img_text),
            str(url),
            str(location_var),
            str(author),
            str(title),
        ),
    )
    conn.commit()
    c.close()
    print('Added new post - {}'.format(str(url)))
    return int(date), str(content), str(url), str(location_var), str(author), str(title)

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
av>=0.4.1
dhash>=1.3
Pillow>=5.3.0
praw>=6.0.0
psaw>=0.0.7
pytesseract>=0.2.5

--------------------------------------------------------------------------------
/setInterval.py:
--------------------------------------------------------------------------------
import threading

def setInterval(interval):
    def decorator(function):
        def wrapper(*args, **kwargs):
            stopped = threading.Event()

            def loop():  # executed in another thread
                while not stopped.wait(interval):  # until stopped
                    function(*args, **kwargs)

            t = threading.Thread(target=loop)
            t.daemon = True  # stop if the program exits
            t.start()
            return stopped
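            # the caller receives the Event; set() it to stop the loop, e.g.:
            #   stopper = delete_comment()  # starts the 1800 s timer (app.py)
            #   stopper.set()               # no further runs after this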
        return wrapper
    return decorator

--------------------------------------------------------------------------------