├── .gitignore ├── check_command.py ├── check_valid.py ├── config.py ├── download_page.py ├── functions ├── __init__.py ├── archive.py └── periodical_job.py ├── irc_bot.py ├── irc_message.py ├── main.py ├── periodical_jobs ├── __init__.py └── check_temp_perjob.py ├── refresh.py ├── services ├── __init__.py ├── dl__facebook_com.py ├── dl__ignores__twitter_com ├── dl__liveleak_com.py ├── dl__periscope_tv.py ├── dl__twitter_com.py ├── dl__vine_co.py ├── video__facebook_com.py ├── video__liveleak_com.py ├── video__periscope_tv.py ├── video__twitter_com.py ├── video__twitter_com_hashtag.py ├── video__twitter_com_user.py ├── video__video_stream.py ├── video__vine_co.py └── video__webpage.py ├── temp_perjobs └── __init__.py ├── upload.py └── url.py /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.pyc 3 | keys 4 | to_be_uploaded/ 5 | irclog 6 | -------------------------------------------------------------------------------- /check_command.py: -------------------------------------------------------------------------------- 1 | import refresh 2 | 3 | def find_command_service(command): 4 | services_list = refresh.services_list 5 | for service in services_list: 6 | if command in service[2]: 7 | return service[0] 8 | else: 9 | return None 10 | -------------------------------------------------------------------------------- /check_valid.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | def check_valid_url(url): 4 | if re.search(r'^http:\/\/[^\/]+?\.[^\/]+', url): 5 | return 'http' 6 | elif re.search(r'^https:\/\/[^\/]+?\.[^\/]+', url): 7 | return 'https' 8 | elif re.search(r'^rtsp:\/\/[^\/]+?\.[^\/]+', url): 9 | return 'rtsp' 10 | elif re.search(r'^rtmp:\/\/[^\/]+?\.[^\/]+', url): 11 | return 'rtmp' 12 | elif re.search(r'^mms:\/\/[^\/]+?\.[^\/]+', url): 13 | return 'mms' 14 | elif re.search(r'^mmsh:\/\/[^\/]+?\.[^\/]+', url): 15 | return 'mmsh' 16 | else: 17 | return False 18 | 19 | def check_num(string): 20 | try: 21 | int(string) 22 | return True 23 | except: 24 | return False 25 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | version = 20170916.01 2 | 3 | irc_server = 'irc.underworld.no' 4 | irc_port = 6667 5 | irc_channel = '#videobot' 6 | irc_nick = 'video' 7 | 8 | github = 'https://github.com/ArchiveTeam/VideoBot' 9 | 10 | periodical_job_open_time = 172800 11 | max_warc_item_size = 5368709120 12 | 13 | with open('keys', 'r') as file: 14 | ia_access_key, ia_secret_key = file.read().replace('\n', '').replace('\r', '').replace(' ', '').split(',') 15 | print(ia_access_key, ia_secret_key) 16 | -------------------------------------------------------------------------------- /download_page.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import re 3 | 4 | def extract_info(regexes, url): 5 | html = requests.get(url) 6 | if type(regexes) is str: 7 | regexes = [regexes] 8 | extracted = [] 9 | for regex in regexes: 10 | if re.search(regex, html.text): 11 | extracted.append(re.findall(regex, html.text)) 12 | else: 13 | extracted.append('') 14 | if len(extracted) == 1: 15 | extracted = extracted[0] 16 | return extracted 17 | 18 | -------------------------------------------------------------------------------- /functions/__init__.py: -------------------------------------------------------------------------------- 1 | import archive 2 | import periodical_job 3 | -------------------------------------------------------------------------------- /functions/archive.py: -------------------------------------------------------------------------------- 1 | from config import irc_channel, irc_nick 2 | import irc_bot 3 | import refresh 4 | import threading 5 | import re 6 | import url 7 | import services 8 | 9 | def irc_bot_print(irc_channel, irc_bot_message): 10 | irc_bot.irc_bot_print(irc_channel, irc_bot_message) 11 | 12 | def find_url_service(url_): 13 | return url.find_url_service(url_) 14 | 15 | def main(command, user): 16 | if len(command) == 1: 17 | irc_bot_print(irc_channel, user + ': Please specify an URL.') 18 | elif command[1].startswith('http://') or command[1].startswith('https://'): 19 | print(command) 20 | threading.Thread(target = process_url, args = (command, user)).start() 21 | else: 22 | irc_bot_print(irc_channel, user + ': I can only handle http:// and https://') 23 | 24 | def process_url(command, user): 25 | url_service = find_url_service(command[1]) 26 | for irc_bot_message in eval('services.' + url_service + '.process(url_service, command, user)'): 27 | if not irc_bot_message.startswith(irc_nick + ':'): 28 | irc_bot_print(irc_channel, irc_bot_message) -------------------------------------------------------------------------------- /functions/periodical_job.py: -------------------------------------------------------------------------------- 1 | from config import irc_channel 2 | from config import periodical_job_open_time 3 | import periodical_jobs 4 | import check_valid 5 | import check_command 6 | import irc_bot 7 | import refresh 8 | import threading 9 | import re 10 | import os 11 | import random 12 | import string 13 | import url 14 | import services 15 | import subprocess 16 | import time 17 | import sys 18 | 19 | irc_bot_print = lambda irc_channel, irc_bot_message: irc_bot.irc_bot_print(irc_channel, irc_bot_message) 20 | check_temp_perjob_variable = lambda ticket_id, var: periodical_jobs.check_temp_perjob.check_temp_perjob_variable(ticket_id, var) 21 | get_temp_perjob_variables = lambda ticket_id: periodical_jobs.check_temp_perjob.get_temp_perjob_variables(ticket_id) 22 | check_valid_url = lambda url: check_valid.check_valid_url(url) 23 | find_url_service = lambda url_: url.find_url_service(url_) 24 | find_url_title = lambda url_service, url_: url.find_url_title(url_service, url_) 25 | find_url_service_name = lambda url_service: url.find_url_service_name(url_service) 26 | find_url_id = lambda url_service, url_: url.find_url_id(url_service, url_) 27 | find_command_service = lambda command: check_command.find_command_service(command) 28 | 29 | command_url = '' 30 | 31 | def main(message, user): 32 | '''Create a periodical job. 33 | 34 | Usage: 35 | !perjob 36 | to create a new periodical job. 37 | !perjob 38 | to specify a command for the job. 39 | 40 | With commands and command dependent variables: 41 | --service-url and 42 | to periodically scrape a service. 43 | --scrape-url and 44 | to periodically scrape links from a webpage. 45 | ''' 46 | if len(message) == 1: 47 | ticket_id = ''.join(random.choice(string.ascii_lowercase) for num in range(10)) 48 | with open('temp_perjobs/'+ticket_id+'.py', 'a') as file: 49 | file.write('user = \'' + user + '\'\n') 50 | open_time_hours = int(periodical_job_open_time/3600) 51 | irc_bot_print(irc_channel, user + ': Your ticket ID is \'' + ticket_id + '\'. The ticket is open for ' + str(open_time_hours) + ' hours without edits.') 52 | irc_bot_print(irc_channel, user + ': Configure your new periodical job using \'!perjob \'.') 53 | irc_bot_print(irc_channel, user + ': See ' + command_url + ' for available commands.') 54 | elif message[1] == '--service-url': 55 | if len(message) != 4: 56 | irc_bot_print(irc_channel, user + ': I don\'t understand your command. Please review it.') 57 | else: 58 | ticket_id = message[2] 59 | if not os.path.isfile('temp_perjobs/'+ticket_id+'.py'): 60 | irc_bot_print(irc_channel, user + ': Ticket ID \'' + ticket_id + '\' does not exist.') 61 | else: 62 | url = message[3] 63 | if check_temp_perjob_variable(ticket_id, 'url') != 'var not found': 64 | irc_bot_print(irc_channel, user + ': You already provided an URL for ticket ID \'' + ticket_id + '\'.') 65 | irc_bot_print(irc_channel, user + ': Create a new ticket ID to use a new URL.') 66 | elif check_valid_url(url) == False: 67 | irc_bot_print(irc_channel, user + ': URL \'' + url + '\' doesn\'t seem to be valid.') 68 | else: 69 | url_service = find_url_service(url) 70 | if url_service != None: 71 | service_name = find_url_service_name(url_service) 72 | url_id = find_url_id(url_service, url) 73 | url_title = find_url_title(url_service, url) 74 | if url_id != None: 75 | url_id_string = 'with ID \'' + url_id + '\'' 76 | else: 77 | url_id_string = 'with URL \'' + url + '\'' 78 | if url_title != None: 79 | url_title_string = '\'' + url_title + '\' ' 80 | else: 81 | url_title_string = '' 82 | irc_bot_print(irc_channel, user + ': Found ' + service_name + ' ' + url_title_string + url_id_string + '.') 83 | threading.Thread(target = process_messages, args = ('add_url', url, ticket_id, user, ticket_id, url_service)).start() 84 | else: 85 | irc_bot_print(irc_channel, user + ': URL \'' + message[3] + '\' is currently not supported.') 86 | elif message[1] == '--scrape-url': 87 | if len(message) != 4: 88 | irc_bot_print(irc_channel, user + ': I don\'t understand your command. Please review it.') 89 | else: 90 | ticket_id = message[2] 91 | if not os.path.isfile('temp_perjobs/'+ticket_id+'.py'): 92 | irc_bot_print(irc_channel, user + ': Ticket ID \'' + ticket_id + '\' does not exist.') 93 | else: 94 | url = message[3] 95 | if check_temp_perjob_variable(ticket_id, 'url') != 'var not found': 96 | irc_bot_print(irc_channel, user + ': You already provided an URL for ticket ID \'' + ticket_id + '\'.') 97 | elif check_valid_url(url) == False: 98 | irc_bot_print(irc_channel, user + ': URL \'' + url + '\' doesn\'t seem to be valid.') 99 | else: 100 | url_service = 'video__webpage' 101 | service_name = find_url_service_name(url_service) 102 | irc_bot_print(irc_channel, user + ': Found ' + service_name + ' \'' + url + '\'.') 103 | threading.Thread(target = process_messages, args = ('add_url', url, ticket_id, user, ticket_id, url_service)).start() 104 | elif message[1] == '--edit': 105 | if len(message) != 3: 106 | irc_bot_print(irc_channel, user + ': I don\'t understand your command. Please review it.') 107 | else: 108 | ticket_id = message[2] 109 | if not os.path.isfile('periodical_jobs/' + ticket_id + '.py'): 110 | irc_bot_print(irc_channel, user + ': Ticket ID \'' + ticket_id + '\' does not exist.') 111 | else: 112 | os.rename('periodical_jobs/' + ticket_id + '.py', 'temp_perjobs/' + ticket_id + '.py') 113 | irc_bot_print(irc_channel, user + ': Ticket ID \'' + ticket_id + '\' is reopened for editing.') 114 | elif message[1] == '--remove': 115 | if len(message) != 3: 116 | irc_bot_print(irc_channel, user + ': I don\'t understand your command. Please review it.') 117 | else: 118 | ticket_id = message[2] 119 | if os.path.isfile('periodical_jobs/' + ticket_id + '.py'): 120 | os.remove('periodical_jobs/' + ticket_id + '.py') 121 | irc_bot_print(irc_channel, user + ': Periodical job with ticket ID ' + ticket_id + ' is removed.') 122 | elif os.path.isfile('temp_perjobs/' + ticket_id + '.py'): 123 | os.remove('temp_perjobs/' + ticket_id + '.py') 124 | irc_bot_print(irc_channel, user + ': Periodical job with ticket ID ' + ticket_id + ' is removed.') 125 | if os.path.isfile('periodical_jobs/' + ticket_id + '.pyc'): 126 | os.remove('periodical_jobs/' + ticket_id + '.pyc') 127 | elif os.path.isfile('temp_perjobs/' + ticket_id + '.pyc'): 128 | os.remove('temp_perjobs/' + ticket_id + '.pyc') 129 | else: 130 | irc_bot_print(irc_channel, user + ': Ticket ID \'' + ticket_id + '\' does not exist.') 131 | elif message[1] in ('--info', '--information'): 132 | if len(message) == 3: 133 | ticket_id = message[2] 134 | variables = get_temp_perjob_variables(ticket_id) 135 | if variables == None: 136 | irc_bot_print(irc_channel, user + ': Periodical job with ticket ID ' + ticket_id + ' does not exist.') 137 | else: 138 | irc_bot_print(irc_channel, user + ': Periodical job with ticket ID ' + ticket_id + ' has variables ' + ', '.join(variables) + '.') 139 | elif len(message) == 4: 140 | ticket_id = message[2] 141 | variable = message[3] 142 | variable_content = check_temp_perjob_variable(ticket_id, variable) 143 | if variable_content == 'var not found': 144 | irc_bot_print(irc_channel, user + ': Periodical job with ticket ID ' + ticked_id + '') 145 | else: 146 | irc_bot_print(irc_channel, user + ': I don\'t understand your command. Please review it.') 147 | else: 148 | ticket_id = message[2] 149 | if not os.path.isfile('temp_perjobs/'+ticket_id+'.py'): 150 | irc_bot_print(irc_channel, user + ': Ticket ID \'' + ticket_id + '\' does not exist.') 151 | else: 152 | perjob_commands = check_temp_perjob_variable(ticket_id, 'type') 153 | if perjob_commands == 'var not found': 154 | irc_bot_print(irc_channel, user + ': Please provide an URL first for ticket ID \'' + ticket_id + '\'.') 155 | else: 156 | service = find_command_service(perjob_commands[0]) 157 | if service != None: 158 | threading.Thread(target = process_messages, args = ('periodical_job', service, message, user, ticket_id, service)).start() 159 | else: 160 | irc_bot_print(irc_channel, user + ': Command \'' + message[0] + '\' was removed. Please create a new periodical job.') 161 | 162 | def process_messages(name, a, b, c, ticket_id, service): 163 | for service_message in eval('services.' + service + '.' + name + '(a, b, c)'): 164 | if service_message[0] == 'add': 165 | filelines = [] 166 | with open('temp_perjobs/'+ticket_id+'.py', 'r') as file: 167 | added = False 168 | for line in file: 169 | if not line.startswith(service_message[1]): 170 | filelines.append(line) 171 | else: 172 | filelines.append(service_message[1] + ' = ' + str(service_message[2])) 173 | added = True 174 | if not added: 175 | filelines.append(service_message[1] + ' = ' + str(service_message[2])) 176 | with open('temp_perjobs/'+ticket_id+'.py', 'w') as file: 177 | file.write('\n'.join([fileline for fileline in filelines if not fileline == ''])) 178 | elif service_message[0] == 'message': 179 | irc_bot_print(irc_channel, str(service_message[1])) 180 | elif service_message[0] == 'finish': 181 | required_commands = service_message[1] 182 | default_commands = service_message[2] 183 | user = service_message[3] 184 | for required_command in required_commands + default_commands: 185 | if check_temp_perjob_variable(b[2], required_command) == 'var not found': 186 | irc_bot_print(irc_channel, user + ': You are missing \'' + required_command + '\'.') 187 | break 188 | else: 189 | os.rename('temp_perjobs/' + ticket_id + '.py', 'periodical_jobs/' + ticket_id + '.py') 190 | irc_bot_print(irc_channel, user + ': Periodical job with ticket ID \'' + b[2] + '\' is finished.') 191 | elif service_message[0] == 'execute': 192 | os.system(service_message[1]) 193 | elif service_message[0] == 'bad_command': 194 | bad_command = service_message[1] 195 | user = service_message[2] 196 | irc_bot_print(irc_channel, user + ': I don\'t understand command \'' + bad_command + '\'.') 197 | elif service_message[0] == 'write_metadata': 198 | ia_metadata = service_message[1] 199 | fulldir = service_message[2] 200 | if not os.path.isdir(fulldir): 201 | os.makedirs(fulldir) 202 | for a, b in ia_metadata.items(): 203 | with open(fulldir + 'ia_metadata.py', 'a') as file: 204 | if type(b) is list: 205 | content_string = str(b) 206 | else: 207 | content_string = '\'' + str(b).replace('\'', '\\\'') + '\'' 208 | file.write(str(a) + ' = ' + content_string + '\n') 209 | elif service_message[0] == 'help': 210 | required_commands = service_message[1] 211 | optional_commands = service_message[2] 212 | user = service_message[3] 213 | irc_bot_print(irc_channel, user + ': The required commands are ' + ', '.join(required_commands) + '.') 214 | irc_bot_print(irc_channel, user + ': The optional commands are ' + ', '.join(optional_commands) + '.') 215 | irc_bot_print(irc_channel, user + ': Set a command using \'!perjob \'.') 216 | elif service_message[0] == 'execute_timeout': 217 | # Do not use for grab-site processes 218 | command = service_message[1].split(' ') 219 | timeout = int(service_message[2]) 220 | dir_ = service_message[3] 221 | with open(dir_ + 'no_upload', 'w') as file: 222 | pass 223 | process = subprocess.Popen(command) 224 | time.sleep(timeout) 225 | os.remove(dir_ + 'no_upload') 226 | if process.poll() is None: 227 | process.terminate() 228 | exit_code = -1 229 | else: 230 | exit_code = process.poll() 231 | 232 | def process_url(url, user): 233 | services_list = refresh.services_list 234 | for service in services_list: 235 | print(service) 236 | if re.search(service[1], url): 237 | for irc_bot_message in eval('services.' + service[0] + '.process(service[0].replace("video__", ""), url, user)'): 238 | irc_bot_print(irc_channel, irc_bot_message) 239 | 240 | def periodical_job_start(filename, type_, user): 241 | service = find_command_service(type_[0]) 242 | if service != None: 243 | threading.Thread(target = process_messages, args = ('periodical_job_start', filename, user, None, None, service)).start() 244 | 245 | def periodical_job_auto_remove(): 246 | while True: 247 | for temp_periodical_job in [name for name in os.listdir('./temp_perjobs/') if name.endswith('.py') and not name == '__init__.py']: 248 | creation_date = os.path.getctime('./temp_perjobs/' + temp_periodical_job) 249 | ticket_id = temp_periodical_job[:-3] 250 | user = check_temp_perjob_variable(ticket_id, 'user') 251 | if int(creation_date) + periodical_job_open_time < int(time.time()): 252 | os.remove('./temp_perjobs/' + temp_periodical_job) 253 | if os.path.isfile('./temp_perjobs/' + temp_periodical_job + 'c'): 254 | os.remove('./temp_perjobs/' + temp_periodical_job + 'c') 255 | irc_bot_print(irc_channel, user + ': Unfinished periodical job with ticket ID ' + ticket_id + ' is expired.') 256 | time.sleep(3600) 257 | -------------------------------------------------------------------------------- /irc_bot.py: -------------------------------------------------------------------------------- 1 | from config import irc_channel, irc_port, irc_server, irc_nick, github, version 2 | import functions 3 | import socket 4 | import re 5 | import check_command 6 | import services 7 | import refresh 8 | import threading 9 | 10 | irc = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 11 | irc.connect((irc_server, irc_port)) 12 | 13 | def archive(command, user): 14 | functions.archive.main(command, user) 15 | 16 | def periodical_job(command, user): 17 | functions.periodical_job.main(command, user) 18 | 19 | def find_command_service(command): 20 | return check_command.find_command_service(command) 21 | 22 | def new_socket(): 23 | global irc 24 | irc.close() 25 | irc = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 26 | irc.connect((irc_server, irc_port)) 27 | irc_bot_join() 28 | threading.Thread(target = irc_bot_listener).start() 29 | 30 | def irc_bot_join(): 31 | irc.send('USER ' + irc_nick + ' ' + irc_nick + ' ' + irc_nick + ' :This is the bot for ' + irc_channel + '. ' + github + '.\n') 32 | irc.send('NICK ' + irc_nick + '\n') 33 | irc.send('JOIN ' + irc_channel + '\n') 34 | 35 | def irc_bot_print(channel, message): 36 | try: 37 | message = ''.join([character if ord(character) < 128 else '_' for character in message]) 38 | irc.send("PRIVMSG " + channel + " :" + message + "\n") 39 | except Exception as exception: 40 | with open('exceptions', 'a') as exceptions: 41 | print(str(version) + '\n' + str(exception) + '\n\n') 42 | exceptions.write(str(version) + '\n' + str(exception) + '\n\n') 43 | new_socket() 44 | print("IRC BOT: " + message) 45 | 46 | def irc_bot_listener(): 47 | while True: 48 | irc_message = irc.recv(2048) 49 | with open('irclog', 'a') as file: 50 | file.write(irc_message) 51 | if irc_message.startswith('PING :'): 52 | message = re.search(r'^[^:]+:(.*)$', irc_message).group(1) 53 | irc.send('PONG :' + message + '\n') 54 | elif re.search(r'^:.+PRIVMSG[^:]+:!.*', irc_message): 55 | command = re.search(r'^:.+PRIVMSG[^:]+:(!.*)', irc_message).group(1).replace('\r', '').replace('\n', '').split(' ') 56 | user = re.search(r'^:([^!]+)!', irc_message).group(1) 57 | if command[0] in ('!a', '!archive'): 58 | archive(command, user) 59 | elif command[0] in ('!perjob', '!periodical-job'): 60 | periodical_job(command, user) 61 | elif command[0] == '!version': 62 | irc_bot_print(irc_channel, user + ': Current version of videobot is ' + str(version) + '.') 63 | elif command[0] in ('!update-services', '!us'): 64 | irc_bot_print(irc_channel, user + ': Services are resfreshing.') 65 | threading.Thread(target = refresh.refresh_services).start() 66 | else: 67 | command_short = command[0].replace('!', '') 68 | service = find_command_service(command_short) 69 | if service != None: 70 | for irc_bot_message in eval('services.' + service + '.process(service, command, user)'): 71 | irc_bot_print(irc_channel, irc_bot_message) 72 | else: 73 | irc_bot_print(irc_channel, user + ': Command \'' + command[0] + '\' does not exist.') 74 | 75 | -------------------------------------------------------------------------------- /irc_message.py: -------------------------------------------------------------------------------- 1 | def job_finished(user, name, title, id=None): 2 | if id: 3 | return user + ': Your job for ' + name + ' \'' + title + '\' with ID ' + id + ' is finished.' 4 | return user + ': Your job for ' + name + ' \'' + title + '\' is finished.' 5 | 6 | def job_failed(user, name, title, id=None): 7 | if id: 8 | return user + ': Your job for ' + name + ' \'' + title + '\' with ID ' + id + ' failed.' 9 | return user + ': Your job for ' + name + ' \'' + title + '\' failed.' 10 | 11 | def job_added(user, name, title, id=None): 12 | if id: 13 | return user + ': Your job for ' + name + ' \'' + title + '\' with ID ' + id + ' is added.' 14 | return user + ': Your job for ' + name + ' \'' + title + '\' is added.' 15 | 16 | def job_aborted(user, name, id=None): 17 | if id: 18 | return user + ': Your job for ' + name + ' with ID ' + id + ' is aborted.' 19 | return user + ': Your job for ' + name + ' is aborted.' 20 | 21 | def failed_extraction(user, name, extract, id=None): 22 | if id: 23 | return user + ': Failed to extract ' + extract + ' from ' + name + ' with ID ' + id + '.' 24 | return user + ': Failed to extract ' + extract + ' from ' + name + '.' -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #if os.path.isdir('./services'): 2 | # shutil.rmtree('./services') 3 | # os.makedirs('./services') 4 | #if not os.path.isfile('./services/__init__.py'): 5 | # open('./services/__init__.py', 'w').close() 6 | import os 7 | import glob 8 | import threading 9 | import irc_bot 10 | import refresh 11 | import upload 12 | import time 13 | import functions 14 | 15 | def dashboard(): 16 | os.system('~/.local/bin/gs-server') 17 | 18 | def irc_bot_listener(): 19 | irc_bot.irc_bot_listener() 20 | 21 | def irc_bot_join(): 22 | irc_bot.irc_bot_join() 23 | 24 | def refresh_services(): 25 | refresh.refresh_services() 26 | 27 | def process_warcs(): 28 | upload.firstrun() 29 | while True: 30 | try: 31 | upload.move_warcs() 32 | threading.Thread(target = upload.upload_items).start() 33 | except: 34 | pass #for now 35 | time.sleep(60) 36 | 37 | def remove_old_files(): 38 | for file in glob.glob('to_be_uploaded/ia_items/*/no_upload') + glob.glob('to_be_uploaded/ia_items/*/*.upload'): 39 | os.remove(file) 40 | while True: 41 | for file in glob.glob('to_be_uploaded/ia_items/*/*.upload'): 42 | os.remove(file) 43 | time.sleep(21600) 44 | 45 | def main(): 46 | if not os.path.isdir('./to_be_uploaded/ia_items'): 47 | os.makedirs('./to_be_uploaded/ia_items') 48 | if not os.path.isdir('./to_be_uploaded/ia_warcs'): 49 | os.makedirs('./to_be_uploaded/ia_warcs') 50 | irc_bot_join() 51 | refresh.refresh_services() 52 | threading.Thread(target = remove_old_files).start() 53 | threading.Thread(target = refresh.refresh_periodical_jobs).start() 54 | threading.Thread(target = refresh.refresh_periodical_jobs_start).start() 55 | threading.Thread(target = irc_bot_listener).start() 56 | threading.Thread(target = functions.periodical_job.periodical_job_auto_remove).start() 57 | threading.Thread(target = dashboard).start() 58 | threading.Thread(target = process_warcs).start() 59 | 60 | if __name__ == '__main__': 61 | main() -------------------------------------------------------------------------------- /periodical_jobs/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | for module in os.listdir(os.path.dirname(__file__)): 4 | if module == '__init__.py' or module[-3:] != '.py': 5 | continue 6 | __import__(module[:-3], locals(), globals()) 7 | del module 8 | -------------------------------------------------------------------------------- /periodical_jobs/check_temp_perjob.py: -------------------------------------------------------------------------------- 1 | import temp_perjobs 2 | import os 3 | import random 4 | import string 5 | 6 | def check_temp_perjob_variable(ticket_id, var): 7 | ticket_id_ = ''.join(random.choice(string.ascii_lowercase) for num in range(10)) 8 | os.rename('temp_perjobs/' + ticket_id + '.py', 'temp_perjobs/' + ticket_id + ticket_id_ + '.py') 9 | reload(temp_perjobs) 10 | try: 11 | variable = eval('temp_perjobs.' + ticket_id + ticket_id_ + '.' + var) 12 | os.rename('temp_perjobs/' + ticket_id + ticket_id_ + '.py', 'temp_perjobs/' + ticket_id + '.py') 13 | os.remove('temp_perjobs/' + ticket_id + ticket_id_ + '.pyc') 14 | return variable 15 | except: 16 | os.rename('temp_perjobs/' + ticket_id + ticket_id_ + '.py', 'temp_perjobs/' + ticket_id + '.py') 17 | os.remove('temp_perjobs/' + ticket_id + ticket_id_ + '.pyc') 18 | return 'var not found' 19 | 20 | def get_temp_perjob_variables(ticket_id): 21 | file_location = None 22 | if os.path.isfile('temp_perjobs/' + ticket_id + '.py'): 23 | file_location = 'temp_perjobs/' + ticket_id + '.py' 24 | elif os.path.isfile('periodical_jobs/' + ticket_id + '.py'): 25 | file_location = 'periodical_jobs/' + ticket_id + '.py' 26 | if file_location != None: 27 | variables = [] 28 | with open('temp_perjobs/' + ticket_id + '.py', 'r') as file: 29 | for line in file: 30 | line = line.replace('\n', '').replace('\r', '') 31 | if ' = ' in line: 32 | variables.append(line.split(' = ')[0]) 33 | return variables 34 | else: 35 | return None 36 | -------------------------------------------------------------------------------- /refresh.py: -------------------------------------------------------------------------------- 1 | from config import irc_channel 2 | from config import github 3 | import os 4 | import services 5 | import time 6 | import re 7 | import irc_bot 8 | import periodical_jobs 9 | import random 10 | import string 11 | import threading 12 | import functions 13 | 14 | periodical_job_start = lambda filename, type_, user: functions.periodical_job.periodical_job_start(filename, type_, user) 15 | 16 | services_count = 0 17 | services_list = [['service_name', 'service_regex', ['service_commands']]] 18 | periodical_jobs_list = [['perjob_name', 'refreshtime']] 19 | periodical_jobs_start = {} 20 | 21 | def irc_bot_print(channel, message): 22 | irc_bot.irc_bot_print(channel, message) 23 | 24 | def refresh_services(): 25 | global services_list 26 | global services_count 27 | services_list = [['service_name', 'service_regex', ['service_commands']]] 28 | new_services = 0 29 | #if os.path.isdir('./services'): 30 | # shutil.rmtree('./services') 31 | #os.system('git clone ' + github + '.git') 32 | #repository_name = re.search(r'([^\/]+)\/?$', github).group(1) 33 | #shutil.copytree('./' + repository_name + '/services', './services') 34 | #shutil.rmtree('./' + repository_name) 35 | reload(services) 36 | for root, dirs, files in os.walk("./services"): 37 | for service in files: 38 | if service.startswith("video__") and service.endswith(".py"): 39 | if service[:-3] in services_list: 40 | break 41 | else: 42 | try: 43 | url_regex = eval('services.' + service[:-3] + '.url_regex') 44 | except AttributeError: 45 | url_regex = None 46 | service_commands = eval('services.' + service[:-3] + '.service_commands') 47 | services_list.append([service[:-3], url_regex, service_commands]) 48 | new_services += 1 49 | print('Found service ' + service[:-3] + '.') 50 | new_count = new_services-services_count 51 | services_count = new_services 52 | if new_count == 1: 53 | irc_bot_print(irc_channel, 'Found and updated ' + str(new_count) + ' service.') 54 | elif new_count != 0: 55 | irc_bot_print(irc_channel, 'Found and updated ' + str(new_count) + ' services.') 56 | 57 | def refresh_periodical_jobs(): 58 | global periodical_jobs_list 59 | while True: 60 | periodical_jobs_list_ = [['perjob_name', 'refreshtime']] 61 | random_string = ''.join(random.choice(string.ascii_lowercase) for num in range(10)) 62 | for filename in os.listdir('periodical_jobs'): 63 | if filename.endswith('.py') and filename not in ('check_temp_perjob.py', '__init__.py'): 64 | filename_ = filename.replace('.py', random_string + '.py') 65 | os.rename('periodical_jobs/' + filename, 'periodical_jobs/' + filename_) 66 | reload(periodical_jobs) 67 | time.sleep(10) 68 | for filename in os.listdir('periodical_jobs'): 69 | if filename.endswith(random_string + '.py'): 70 | filename_ = filename.replace(random_string + '.py', '.py') 71 | os.rename('periodical_jobs/' + filename, 'periodical_jobs/' + filename_) 72 | for periodical_job_list_ in periodical_jobs_list_: 73 | if filename[:-3] in periodical_job_list_: 74 | break 75 | else: 76 | periodical_jobs_list_.append([filename[:-3], eval('periodical_jobs.' + filename[:-3] + '.refreshtime')]) 77 | print('Found periodical job ' + filename[:-13] + '.') 78 | os.remove('periodical_jobs/' + filename + 'c') 79 | periodical_jobs_list = list(periodical_jobs_list_) 80 | time.sleep(300) 81 | 82 | def refresh_periodical_jobs_start(): 83 | global periodical_jobs_list 84 | global periodical_jobs_start 85 | while True: 86 | for periodical_job_list in periodical_jobs_list: 87 | if periodical_job_list[0] != 'perjob_name': 88 | periodical_job_name = periodical_job_list[0][:-10] 89 | if periodical_job_name in periodical_jobs_start: 90 | last_start = periodical_jobs_start[periodical_job_name] 91 | else: 92 | last_start = 0 93 | current_time = int(time.time()) 94 | if last_start + periodical_job_list[1] <= current_time: 95 | periodical_jobs_start[periodical_job_name] = current_time 96 | threading.Thread(target = periodical_job_start, args = (periodical_job_list[0], eval('periodical_jobs.' + periodical_job_list[0] + '.type'), eval('periodical_jobs.' + periodical_job_list[0] + '.user'),)).start() 97 | time.sleep(1) 98 | 99 | def periodical_job_args(filename, args): 100 | args_ = [] 101 | for arg in args: 102 | try: 103 | variable_content = eval('periodical_jobs.' + filename + '.' + arg) 104 | except AttributeError: 105 | variable_content = '' 106 | args_.append(variable_content) 107 | return args_ 108 | -------------------------------------------------------------------------------- /services/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | for module in os.listdir(os.path.dirname(__file__)): 4 | if module.startswith('dl__') or module == '__init__.py' or module[-3:] != '.py': 5 | continue 6 | __import__(module[:-3], locals(), globals()) 7 | del module 8 | -------------------------------------------------------------------------------- /services/dl__facebook_com.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import os 4 | import re 5 | import shutil 6 | import time 7 | import json 8 | import html 9 | import datetime 10 | 11 | wpull_hook = globals().get('wpull_hook') 12 | 13 | firsturl = '' 14 | ia_metadata = {'identifier': '', 'files': [], 'title': '', 'description': '', 'mediatype': 'movies', 'collection': 'archiveteam_videobot', 'date': '', 'original_url': '', 'creator': '', 'subject': ''} 15 | added_to_list = [] 16 | tries = {} 17 | video_file = None 18 | item_id = None 19 | 20 | def accept_url(url_info, record_info, verdict, reasons): 21 | global added_to_list 22 | if (firsturl == '' or url_info["url"] in added_to_list) and not '\\' in url_info["url"]: 23 | return True 24 | return False 25 | 26 | def get_urls(filename, url_info, document_info): 27 | global firsturl 28 | global item_id 29 | global ia_metadata 30 | global added_to_list 31 | global video_file 32 | 33 | newurls = [] 34 | 35 | def url_allowed(url, parent_url=None): 36 | return True 37 | 38 | def add_url(url, parent_url=None): 39 | if url in added_to_list: 40 | return None 41 | if url_allowed(url, parent_url): 42 | added_to_list.append(url) 43 | newurls.append({'url': url}) 44 | 45 | if video_file is not None and video_file in url_info["url"]: 46 | if not os.path.isdir('../ia_item'): 47 | os.makedirs('../ia_item') 48 | shutil.copyfile(filename, '../ia_item/' + video_file) 49 | ia_metadata['files'].append(video_file) 50 | 51 | if firsturl == '': 52 | with open(filename, 'r', encoding='utf-8') as file: 53 | content = file.read() 54 | firsturl = url_info['url'] 55 | for url in extract_urls(content, url_info['url']): 56 | add_url(url) 57 | url_name = 'hd_src_no_ratelimit' 58 | if 'hd_src_no_ratelimit' not in content: 59 | url_name = 'sd_src_no_ratelimit' 60 | video_file = re.search(url_name + ':"https?://[^/]+/v/[^/]+/([a-zA-Z0-9-_]+\.mp4)', content).group(1) 61 | item_id = re.search('video_id:"([0-9]+)"', content).group(1) 62 | item_name = re.search('ownerName:"([^"]+)"', content).group(1) 63 | ia_metadata['identifier'] = 'archiveteam_videobot_facebook_com_' + item_id 64 | ia_metadata['title'] = re.search('([^<]+(?:\.\.\.))\s+-\s+[^<]+', content).group(1) 65 | ia_metadata['description'] = html.unescape(re.search(']+>(.+?)', content).group(1)) 66 | ia_metadata['date'] = datetime.datetime.fromtimestamp(int(re.search('data-utime="([0-9]+)"', content).group(1))).strftime('%Y-%m-%d %H:%M:%S') 67 | ia_metadata['original_url'] = firsturl 68 | ia_metadata['creator'] = item_name 69 | ia_metadata['creator_id'] = re.search('^https?://[^/]+/([^/]+)/videos/', url_info["url"]).group(1) 70 | ia_metadata['video_id'] = item_id 71 | ia_metadata['subject'] = ['videobot', 'archiveteam', 'facebook', 'facebook.com', item_id, item_name] 72 | 73 | for newurl in newurls: 74 | added_to_list.append(newurl['url']) 75 | 76 | return newurls 77 | 78 | def exit_status(exit_code): 79 | global ia_metadata 80 | 81 | if os.path.isdir('../ia_item'): 82 | item_identifier = ia_metadata['identifier'] 83 | for a, b in ia_metadata.items(): 84 | with open('../ia_item/ia_metadata.py', 'a') as file: 85 | if type(b) is list: 86 | content_string = str(b) 87 | else: 88 | content_string = '\'' + str(b).replace('\'', '\\\'').replace('\n', '\\n').replace('\r', '\\r') + '\'' 89 | file.write(str(a) + ' = ' + content_string + '\n') 90 | 91 | if len(os.listdir('../ia_item')) > 1: 92 | print(ia_metadata['files']) 93 | os.rename('../ia_item', '../../to_be_uploaded/ia_items/ia_item_' + item_identifier + '_' + str(int(time.time()))) 94 | 95 | return exit_code 96 | 97 | handle_response_grabsite = wpull_hook.callbacks.handle_response 98 | def handle_response(url_info, record_info, response_info): 99 | global tries 100 | 101 | if not url_info["url"] in tries: 102 | tries[url_info["url"]] = 0 103 | elif tries[url_info["url"]] > 5: 104 | return wpull_hook.actions.FINISH 105 | 106 | tries[url_info["url"]] += 1 107 | 108 | return handle_response_grabsite(url_info, record_info, response_info) 109 | 110 | wpull_hook.callbacks.get_urls = get_urls 111 | wpull_hook.callbacks.exit_status = exit_status 112 | wpull_hook.callbacks.accept_url = accept_url 113 | wpull_hook.callbacks.handle_response = handle_response 114 | 115 | def extract_urls(file, url): 116 | extractedurls = [] 117 | for extractedurl in re.findall('((?:....=)?(?P[\'"]).*?(?P=quote))', file, re.I): 118 | extractedstart = '' 119 | if re.search('^....=[\'"](.*?)[\'"]$', extractedurl[0], re.I): 120 | extractedstart = re.search(r'^(....)', extractedurl[0], re.I).group(1) 121 | extractedurl = re.search('^....=[\'"](.*?)[\'"]$', extractedurl[0], re.I).group(1) 122 | else: 123 | extractedurl = extractedurl[0][1:-1] 124 | extractedurl = re.search(r'^([^#]*)', extractedurl, re.I).group(1) 125 | extractedurl = extractedurl.replace('%3A', ':').replace('%2F', '/') 126 | if extractedurl.startswith('http:\/\/') or extractedurl.startswith('https:\/\/') or extractedurl.startswith('HTTP:\/\/') or extractedurl.startswith('HTTPS:\/\/'): 127 | extractedurl = extractedurl.replace('\/', '/') 128 | if extractedurl.startswith('//'): 129 | extractedurls.append("http:" + extractedurl) 130 | elif extractedurl.startswith('/'): 131 | extractedurls.append(re.search(r'^(https?:\/\/[^\/]+)', url, re.I).group(1) + extractedurl) 132 | elif re.search(r'^https?:?\/\/?', extractedurl, re.I): 133 | extractedurls.append(extractedurl.replace(re.search(r'^(https?:?\/\/?)', extractedurl, re.I).group(1), re.search(r'^(https?)', extractedurl, re.I).group(1) + '://')) 134 | elif extractedurl.startswith('?'): 135 | extractedurls.append(re.search(r'^(https?:\/\/[^\?]+)', url, re.I).group(1) + extractedurl) 136 | elif extractedurl.startswith('./'): 137 | if re.search(r'^https?:\/\/.*\/', url, re.I): 138 | extractedurls.append(re.search(r'^(https?:\/\/.*)\/', url, re.I).group(1) + '/' + re.search(r'^\.\/(.*)', extractedurl, re.I).group(1)) 139 | else: 140 | extractedurls.append(re.search(r'^(https?:\/\/.*)', url, re.I).group(1) + '/' + re.search(r'^\.\/(.*)', extractedurl, re.I).group(1)) 141 | elif extractedurl.startswith('../'): 142 | tempurl = url 143 | tempextractedurl = extractedurl 144 | while tempextractedurl.startswith('../'): 145 | if not re.search(r'^https?://[^\/]+\/$', tempurl, re.I): 146 | tempurl = re.search(r'^(.*\/)[^\/]*\/', tempurl, re.I).group(1) 147 | tempextractedurl = re.search(r'^\.\.\/(.*)', tempextractedurl).group(1) 148 | extractedurls.append(tempurl + tempextractedurl) 149 | elif extractedstart == 'href': 150 | if re.search(r'^https?:\/\/.*\/', url, re.I): 151 | extractedurls.append(re.search(r'^(https?:\/\/.*)\/', url, re.I).group(1) + '/' + extractedurl) 152 | else: 153 | extractedurls.append(re.search(r'^(https?:\/\/.*)', url, re.I).group(1) + '/' + extractedurl) 154 | for extractedurl in re.findall(r'>[^]+>||

)', '', re.search(r'(.+(?:\n.+)+)