├── create_database_postgres.sql ├── mbox.py ├── nntp.py └── utzoo2postgres.py /mbox.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import email 3 | import glob 4 | import gzip 5 | import mailbox as mailbox 6 | import ntpath 7 | import os 8 | import quopri 9 | import random 10 | import re 11 | import shutil 12 | import string 13 | import sys 14 | import time 15 | from datetime import date 16 | from email import policy 17 | 18 | import cchardet 19 | import dateutil.parser 20 | # START - CONFIGURATION 21 | from bs4 import UnicodeDammit 22 | 23 | import configuration 24 | 25 | # Wait random time 15-120 seconds before starting 26 | # sleep_time = random.randint(1, 120) 27 | # print("Waiting " + str(sleep_time) + " seconds before start!") 28 | # time.sleep(sleep_time) 29 | 30 | where2unzip = "" 31 | print("Connecting PostgreSQL DB") 32 | 33 | today = date.today() 34 | print("Starting at:", today) 35 | 36 | 37 | def print_psycopg2_exception(err2): 38 | # get details about the exception 39 | err_type, err_obj, traceback = sys.exc_info() 40 | # get the line number when exception occured 41 | line_num = traceback.tb_lineno 42 | 43 | 44 | # db_cursor.execute('SET GLOBAL max_allowed_packet=67108864') 45 | # db_cursor.execute('SET GLOBAL max_connections = 500') 46 | # db_cursor.execute('SET GLOBAL max_delayed_threads = 50') 47 | # db_cursor.execute('SET GLOBAL innodb_io_capacity = 5000') 48 | 49 | 50 | def convert_encoding(data, new_coding='UTF-8'): 51 | encoding = cchardet.detect(data)['encoding'] 52 | if new_coding.upper() != encoding.upper(): 53 | data = data.decode(encoding, data).encode(new_coding) 54 | return data 55 | 56 | 57 | group_name_fin = "" 58 | count_minutes = 0 59 | # path = r"C:\tmp" 60 | 61 | path = configuration.path.replace("\\", "/") + "/" 62 | 63 | # END - CONFIGURATION 64 | 65 | # Connect to MySQL 66 | 67 | 68 | # Start processing MBOX files 69 | print("** START **" + str(path)) 70 | print("Processing all files on path: " + str(path)) 71 | 72 | # foo() 73 | 74 | files = [f for f in glob.glob(path + '**/*.mbox.gz', recursive=True)] 75 | count = 0 76 | for f in files: 77 | f = f.replace("\\", "/") 78 | # filename = f.replace(path, "").replace(".gz", "") 79 | filename = ntpath.basename(f).replace(".gz", "") 80 | 81 | file_name = "" 82 | current_position_in_db = 0 83 | last_message_count = 0 84 | is_file_being_processed = 0 85 | processing_message_counter = 0 86 | group_name_fin = "" 87 | try: 88 | sql = f"SELECT * FROM all_messages.__all_files WHERE file_name = '{filename}' LIMIT 1" 89 | db_cursor = configuration.db_connection.cursor() 90 | db_cursor.execute(sql) 91 | details = db_cursor.fetchone() 92 | file_name = details[0] 93 | current_position_in_db = details[1] 94 | last_message_count = details[2] 95 | is_file_being_processed = details[3] 96 | db_cursor.close() 97 | except Exception: 98 | file_name = "" 99 | current_position_in_db = 0 100 | last_message_count = 0 101 | is_file_being_processed = 0 102 | #print("Exception #: 1") 103 | db_cursor.close() 104 | 105 | if (current_position_in_db > 0) and (current_position_in_db == last_message_count): 106 | # Move a file from the directory d1 to d2 107 | try: 108 | shutil.move(f, configuration.processed_path + filename + '.gz') 109 | print("Moving File: " + filename + ".gz") 110 | except Exception: 111 | #print("Exception #: 2") 112 | pass 113 | 114 | if (file_name == "") or (current_position_in_db == 0 and last_message_count == 0 and is_file_being_processed == 0) or (current_position_in_db < last_message_count and is_file_being_processed == 0): 115 | try: 116 | filename_extract = filename.replace(".mbox", "") 117 | group_name_fin = filename_extract.replace("." + filename_extract.split(".")[-1], "") 118 | group_name_fin_db = group_name_fin.replace(".", "_").replace("-", "_").replace("+", "") 119 | if len(group_name_fin_db) > 45: 120 | group_name_fin_db = group_name_fin_db[-45:] 121 | 122 | try: 123 | sql = f"INSERT INTO all_messages.__all_files(file_name, current, total, processing, newsgroup_name) VALUES ('{filename}', 0, 0 ,1,'{group_name_fin}') ON CONFLICT (file_name) DO UPDATE SET processing=1" 124 | # sql = "INSERT INTO all_messages.__all_files(file_name, current, total, processing, newsgroup_name) VALUES ('sci.homebrew.20140221.mbox', 0, 0 ,1,'sci.homebrew') ON CONFLICT (file_name) DO UPDATE SET processing=1" 125 | db_cursor = configuration.db_connection.cursor() 126 | db_cursor.execute(sql) 127 | configuration.db_connection.commit() 128 | db_cursor.close() 129 | except Exception: 130 | #print("Exception #: 3") 131 | db_cursor.close() 132 | #exit() 133 | 134 | 135 | # Create tables for a new group 136 | db_cursor = configuration.db_connection.cursor() 137 | db_cursor.execute( 138 | f"select exists(select * from information_schema.tables where table_name='{group_name_fin_db}_headers')") 139 | exist = db_cursor.fetchone()[0] 140 | db_cursor.close() 141 | 142 | if not exist: 143 | try: 144 | sql = f"create table all_messages.{group_name_fin_db}_headers(id bigserial not null constraint {group_name_fin_db}_headers_pk primary key, dateparsed timestamp, subj_id bigint, ref smallint, msg_id text, msg_from bigint, enc text, contype text, processed timestamp default CURRENT_TIMESTAMP);alter table all_messages.{group_name_fin_db}_headers owner to postgres;" 145 | db_cursor = configuration.db_connection.cursor() 146 | db_cursor.execute(sql) 147 | configuration.db_connection.commit() 148 | db_cursor.close() 149 | 150 | sql = f"create table all_messages.{group_name_fin_db}_refs(id bigint, ref_msg text default null);alter table all_messages.{group_name_fin_db}_refs owner to postgres;" 151 | db_cursor = configuration.db_connection.cursor() 152 | db_cursor.execute(sql) 153 | configuration.db_connection.commit() 154 | db_cursor.close() 155 | 156 | sql = f"create table all_messages.{group_name_fin_db}_body(id bigint primary key, data text default null);alter table all_messages.{group_name_fin_db}_body owner to postgres;" 157 | db_cursor = configuration.db_connection.cursor() 158 | db_cursor.execute(sql) 159 | configuration.db_connection.commit() 160 | db_cursor.close() 161 | 162 | sql = f"create table all_messages.{group_name_fin_db}_from(id serial not null constraint {group_name_fin_db}_from_pk primary key, data text);alter table all_messages.{group_name_fin_db}_from owner to postgres;" 163 | db_cursor = configuration.db_connection.cursor() 164 | db_cursor.execute(sql) 165 | configuration.db_connection.commit() 166 | db_cursor.close() 167 | 168 | sql = f"create table all_messages.{group_name_fin_db}_subjects(id serial not null constraint {group_name_fin_db}_subjects_pk primary key, subject text);alter table all_messages.{group_name_fin_db}_subjects owner to postgres;" 169 | db_cursor = configuration.db_connection.cursor() 170 | db_cursor.execute(sql) 171 | configuration.db_connection.commit() 172 | db_cursor.close() 173 | 174 | sql = f"create unique index {group_name_fin_db}_headers_uiidx on all_messages.{group_name_fin_db}_headers(id);" \ 175 | f"create unique index {group_name_fin_db}_headers_umidx on all_messages.{group_name_fin_db}_headers(msg_id);" \ 176 | f"create unique index {group_name_fin_db}_body_idx on all_messages.{group_name_fin_db}_body(id);" \ 177 | f"create unique index {group_name_fin_db}_from_idx on all_messages.{group_name_fin_db}_from(data);" \ 178 | f"create unique index {group_name_fin_db}_subjects_idx on all_messages.{group_name_fin_db}_subjects(subject);" 179 | db_cursor = configuration.db_connection.cursor() 180 | db_cursor.execute(sql) 181 | configuration.db_connection.commit() 182 | db_cursor.close() 183 | except Exception: 184 | pass 185 | 186 | except Exception: 187 | #print("Exception #: 4") 188 | db_cursor.close() 189 | #exit() 190 | 191 | count = count + 1 192 | # Set initial path for where to unzip MBOX files 193 | where2unzip = configuration.path2unzip + filename 194 | 195 | try: 196 | f = open(where2unzip) 197 | f.close() 198 | print("MBOX file was already unzipped and found at: " + where2unzip) 199 | except IOError: 200 | print("MBOX Unzipped file not there yet!") 201 | # Unzip MBOX.GZ and Place to TMP 202 | print("Starting to Unzip: " + str(count) + " - " + str(f)) 203 | with gzip.open(f, 'rb') as f_in: 204 | with open(where2unzip, 'wb') as f_out: 205 | shutil.copyfileobj(f_in, f_out) 206 | print("Unzipped to: " + where2unzip) 207 | print("Starting to Process MBOX") 208 | 209 | mbox = mailbox.mbox(where2unzip) 210 | 211 | print("**************************************") 212 | 213 | 214 | def groupnum(number): 215 | s = '%d' % number 216 | groups = [] 217 | while s and s[-1].isdigit(): 218 | groups.append(s[-3:]) 219 | s = s[:-3] 220 | return s + ','.join(reversed(groups)) 221 | 222 | 223 | def find_between(s, first, last): 224 | try: 225 | start = s.index(first) + len(first) 226 | end = s.index(last, start) 227 | return s[start:end] 228 | except ValueError: 229 | return "" 230 | 231 | def removeNonAscii(s): return "".join(i for i in s if ord(i)<126 and ord(i)>31) 232 | 233 | def clean_string(header_part, encoding): 234 | orig_header_part = header_part 235 | header_part = header_part.rstrip(os.linesep).replace("\n", "") 236 | encoding_quoted = encoding 237 | 238 | if '?q?' in header_part: 239 | encoding_quoted = find_between(header_part, '=?', '?') 240 | header_part = header_part.split("?q?", 1)[1] # .replace("_", " ") 241 | # header_part = find_between(header_part, 'q?', '?').replace("_", " ") 242 | elif '?Q?' in header_part: 243 | encoding_quoted = find_between(header_part, '=?', '?') 244 | header_part = header_part.split("?Q?", 1)[1] # .replace("_", " ") 245 | elif '?b?' in header_part: 246 | encoding_quoted = find_between(header_part, '=?', '?') 247 | header_part = header_part.split("?b?", 1)[1] # .replace("_", " ") 248 | try: 249 | header_part = base64.b64decode(header_part) 250 | except Exception: 251 | try: 252 | header_part = base64.b64decode(header_part) 253 | except Exception: 254 | header_part = orig_header_part 255 | elif '?B?' in header_part: 256 | encoding_quoted = find_between(header_part, '=?', '?') 257 | header_part = header_part.split("?B?", 1)[1] # .replace("_", " ") 258 | try: 259 | header_part = base64.b64decode(header_part) 260 | except Exception: 261 | try: 262 | header_part = base64.b64decode(header_part) 263 | except Exception: 264 | header_part = orig_header_part 265 | 266 | if 'unknown' in encoding_quoted: 267 | encoding_quoted = encoding 268 | elif 'x-user-defined' in encoding_quoted: 269 | encoding_quoted = encoding 270 | 271 | try: 272 | header_part = quopri.decodestring(header_part).decode(encoding_quoted) 273 | return header_part 274 | except Exception: 275 | try: 276 | header_part = quopri.decodestring(header_part).decode(encoding) 277 | return header_part 278 | except Exception: 279 | try: 280 | dammit = UnicodeDammit(header_part) 281 | if dammit.original_encoding: 282 | header_part = quopri.decodestring(header_part).decode(dammit.original_encoding) 283 | return header_part 284 | else: 285 | header_part = quopri.decodestring(header_part).decode('ascii') 286 | return header_part 287 | except Exception: 288 | try: 289 | header_part = quopri.decodestring(header_part).decode("ansi") 290 | return header_part 291 | except Exception: 292 | try: 293 | header_part = header_part.encode('utf8', 'surrogatepass').decode('utf8', 294 | 'surrogatepass') 295 | return header_part 296 | except Exception: 297 | return "" 298 | 299 | 300 | # Process every single message recovered from the MBOX file 301 | for message in mbox: 302 | processing_message_counter = processing_message_counter + 1 303 | 304 | # start only processing once you get to first unprocessed message 305 | if processing_message_counter > current_position_in_db: 306 | 307 | try: 308 | message = email.message_from_string(str(message), policy=policy.default) 309 | except Exception: 310 | pass 311 | 312 | all_count = int(mbox._next_key) 313 | 314 | if processing_message_counter % 1000 == 0: 315 | percentage = round(100 * float(processing_message_counter) / float(all_count), 2) 316 | 317 | # Show how many messsages we're processing per minute 318 | # sql_count = "SELECT COUNT(*) FROM all_messages.headers WHERE processed >= (now() - INTERVAL \'1 MINUTE\')" 319 | # sql_count = "SELECT COUNT(*) FROM all_messages.headers" 320 | try: 321 | sql = f"SELECT COUNT(*) FROM all_messages.{group_name_fin_db}_headers WHERE processed >= (now() - INTERVAL '1 MINUTE')" 322 | db_cursor = configuration.db_connection.cursor() 323 | db_cursor.execute(sql) 324 | messages_per_minute1 = db_cursor.fetchone()[0] 325 | db_cursor.close() 326 | except Exception: 327 | #print("Exception #: 5") 328 | db_cursor.close() 329 | messages_per_minute1 = 0 330 | #exit() 331 | 332 | # print(message_body) 333 | try: 334 | sql = f"INSERT INTO all_messages.__all_updates(groupname,perminute) VALUES ((%s), (%s))" 335 | db_cursor = configuration.db_connection.cursor() 336 | db_cursor.execute(sql, (filename, messages_per_minute1)) 337 | configuration.db_connection.commit() 338 | db_cursor.close() 339 | except Exception as err: 340 | #print("Exception #: 6") 341 | db_cursor.close() 342 | #exit() 343 | 344 | # Delete all execept last 100 most recent update messages 345 | try: 346 | sql = f"DELETE FROM all_messages.__all_updates WHERE id <= (SELECT id FROM (SELECT id FROM all_messages.__all_updates ORDER BY id DESC LIMIT 1 OFFSET 100) foo);" 347 | db_cursor = configuration.db_connection.cursor() 348 | db_cursor.execute(sql) 349 | db_cursor.close() 350 | except Exception: 351 | #print("Exception #: 7") 352 | db_cursor.close() 353 | #exit() 354 | 355 | try: 356 | sql = f"select SUM(perminute) from all_messages.__all_updates where id in (SELECT MAX(id) as t FROM all_messages.__all_updates WHERE tstamp >= (now() - INTERVAL '1 MINUTE') group by groupname);" 357 | db_cursor = configuration.db_connection.cursor() 358 | db_cursor.execute(sql) 359 | messages_per_minute1 = db_cursor.fetchone()[0] 360 | db_cursor.close() 361 | if not messages_per_minute1: 362 | messages_per_minute1 = 0 363 | print(filename.replace(".mbox", "") + ": " + str(processing_message_counter) + " of " + str( 364 | all_count) + " (" + str(percentage) + "%) | " + str( 365 | groupnum(messages_per_minute1)) + " msgs/min (" + str( 366 | groupnum(messages_per_minute1 * 60)) + " hr, " + str( 367 | groupnum(messages_per_minute1 * 60 * 24)) + " day, " + str( 368 | groupnum(messages_per_minute1 * 60 * 24 * 365)) + " year)") 369 | except Exception: 370 | #print("Exception #: 8") 371 | db_cursor.close() 372 | #exit() 373 | 374 | # RESET ALL VARS 375 | parsed_encoding = None 376 | parsed_content_type = None 377 | parsed_message_id = None 378 | parsed_date = None 379 | parsed_subject = None 380 | parsed_subject_original = None 381 | parsed_ref = None 382 | parsed_body_text = None 383 | parsed_body_text_original = None 384 | parsed_from = None 385 | parsed_from_original = None 386 | has_ref = 0 387 | 388 | 389 | ############################################# 390 | # USENET HEADER PARSING 391 | ############################################# 392 | # GET HEADERS IN ORIGINAL RAW FORMAT (NOT UTF-8) 393 | # PARSE THE IMPORTANT PARTS FROM LIST OF HEADERS 394 | 395 | for p in message._headers: 396 | name = str(p[0]).lower() 397 | 398 | # Parse Date 399 | if name == 'date': 400 | parsed_date = p[1] 401 | 402 | # Parse Content Type 403 | if name == 'content-type': 404 | parsed_content_type = p[1] 405 | 406 | # Parse content-transfer-encoding 407 | if name == 'content-transfer-encoding': 408 | parsed_content_type = p[1] 409 | 410 | # Parse References 411 | if name == 'references': 412 | parsed_ref = p[1] 413 | 414 | # Parse Subject 415 | if name == 'subject': 416 | parsed_subject = p[1] 417 | parsed_subject_original = p[1] 418 | 419 | # Parse message-id 420 | if name == 'message-id': 421 | parsed_message_id = p[1] 422 | 423 | # Parse From 424 | if name == 'from': 425 | parsed_from = p[1] 426 | parsed_from_original = p[1] 427 | 428 | # Parse Charset Encoding 429 | if name == 'content-type': 430 | try: 431 | parsed_encoding = message.get_content_charset() 432 | except Exception: 433 | if "charset=" in name: 434 | try: 435 | parsed_encoding = str( 436 | re.findall(r'"([^"]*)"', str(p[1].rstrip(os.linesep).replace("\n", "")))[0]) 437 | except Exception: 438 | dammit = UnicodeDammit(p[1].rstrip(os.linesep).replace("\n", "")) 439 | parsed_encoding = dammit.original_encoding 440 | else: 441 | dammit = UnicodeDammit(p[1].rstrip(os.linesep).replace("\n", "")) 442 | parsed_encoding = dammit.original_encoding 443 | 444 | ############################################# 445 | # DATA CLEAN UP - message_references 446 | ############################################# 447 | 448 | # GET BODY OF THE MESSAGE 449 | try: 450 | parsed_body_text_original = message.get_payload(decode=False) 451 | if message.is_multipart(): 452 | for part in message.walk(): 453 | if part.is_multipart(): 454 | for subpart in part.walk(): 455 | if subpart.get_content_type() == 'text/plain': 456 | parsed_body_text = subpart.get_content() 457 | elif part.get_content_type() == 'text/plain': 458 | parsed_body_text = part.get_content() 459 | elif message.get_content_type() == 'text/plain': 460 | try: 461 | parsed_body_text = message.get_content() 462 | try: 463 | parsed_body_text.encode('utf-8', 'surrogatepass') 464 | except Exception: 465 | parsed_body_text = message.get_payload(decode=False) 466 | except Exception: 467 | parsed_body_text = message.get_payload(decode=False) 468 | # parsed_body_text = parsed_message._mail['body'] 469 | except Exception: 470 | # dammit = UnicodeDammit(str(parsed_body_text).encode('utf-8', 'surrogatepass')) 471 | # parsed_body_text = str(parsed_body_text).encode('utf-8', 'surrogatepass').decode(dammit.original_encoding) 472 | try: 473 | if message.is_multipart(): 474 | for part in message.walk(): 475 | if part.is_multipart(): 476 | for subpart in part.walk(): 477 | if subpart.get_content_type() == 'text/plain': 478 | parsed_body_text = subpart.get_payload(decode=True) 479 | elif part.get_content_type() == 'text/plain': 480 | parsed_body_text = str(part.get_payload(decode=True)) 481 | elif message.get_content_type() == 'text/plain': 482 | parsed_body_text1 = message.get_payload(decode=True) 483 | parsed_body_text = message.get_payload(decode=False) 484 | parsed_body_text_original = message.get_payload(decode=False) 485 | # parsed_body_text = str(message.get_payload(decode=True)).encode('utf-8', 'surrogatepass') 486 | dammit = UnicodeDammit(parsed_body_text1) 487 | parsed_encoding = dammit.original_encoding 488 | # body_text = parsed_message._mail['body'] 489 | except Exception: 490 | parsed_body_text = "" 491 | pass 492 | 493 | # DATA CLEAN UP - MESSAGE BODY 494 | # try: 495 | # if parsed_encoding: 496 | # parsed_body_text = parsed_body_text.encode('utf-8', 'surrogatepass').decode(parsed_encoding) 497 | # else: 498 | # dammit_body = UnicodeDammit(str(parsed_body_text).encode('utf-8', 'surrogatepass')) 499 | # parsed_body_text = str(parsed_body_text).encode('utf-8', 'surrogatepass').decode( 500 | # dammit_body.original_encoding) 501 | # except Exception: 502 | # dammit_body = UnicodeDammit(str(parsed_body_text).encode('utf-8', 'surrogatepass')) 503 | # parsed_body_text = str(parsed_body_text).encode('utf-8', 'surrogatepass').decode("ANSI") 504 | 505 | # DATA CLEAN UP - DATE 506 | 507 | try: 508 | if '(' in parsed_date: 509 | parsed_date = message['date'].split("(")[0].strip() 510 | else: 511 | parsed_date = message['date'].strip() 512 | except Exception: 513 | pass 514 | 515 | failing_zones_to_check = ['-13', '-14', '-15', '-16', '-17', '-18', '-19', '-20', '-21', '-22', '-23', '-24', '+15', '+16', '+17', '+18', '+19', '+20', '+21', '+22', '+23', '+24'] 516 | try: 517 | for failedzone in failing_zones_to_check: 518 | if failedzone in parsed_date: 519 | parsed_date = parsed_date.split(failedzone)[0] 520 | print('Fixed: ' + parsed_date + ' | ' + failedzone) 521 | break 522 | else: 523 | parsed_date = dateutil.parser.parse(parsed_date, tzinfos=configuration.timezone_info) 524 | except Exception: 525 | try: 526 | # Try to parse/convert NNTP-Posting-Date 527 | parsed_date = message['NNTP-Posting-Date'] 528 | for failedzone in failing_zones_to_check: 529 | if failedzone in parsed_date: 530 | parsed_date = parsed_date.split(failedzone)[0] 531 | print ('Fixed NNTP: ' + parsed_date + ' | ' + failedzone) 532 | break 533 | else: 534 | parsed_date = dateutil.parser.parse(parsed_date, tzinfos=configuration.timezone_info) 535 | except Exception: 536 | # new_headers.append(tuple(("odate", value))) 537 | continue 538 | 539 | # DATA CLEAN UP - message_encoding 540 | if not parsed_encoding: 541 | parsed_encoding = "ANSI" 542 | elif parsed_encoding == "x-user-defined": 543 | parsed_encoding = "ANSI" 544 | 545 | if parsed_ref: 546 | parsed_ref = clean_string(parsed_ref, parsed_encoding) 547 | else: 548 | parsed_ref = "" 549 | 550 | # DATA CLEAN UP - message_id 551 | if parsed_message_id: 552 | parsed_message_id = clean_string(parsed_message_id, parsed_encoding) 553 | parsed_message_id = parsed_message_id.replace("'", "") 554 | parsed_message_id = parsed_message_id.replace(" ", "").replace('\n', ' ').replace('\r', '') 555 | else: 556 | parsed_message_id = ''.join(random.choices(string.ascii_letters + string.digits, k=16)) 557 | 558 | # DATA CLEAN UP - message_subject 559 | if parsed_subject: 560 | parsed_subject = clean_string(parsed_subject, parsed_encoding) 561 | if len(parsed_subject) > 250: 562 | parsed_subject = parsed_subject.split("=?")[0] 563 | 564 | 565 | # DATA CLEAN UP - message_subject 566 | if parsed_from: 567 | parsed_from = clean_string(parsed_from, parsed_encoding) 568 | 569 | ############################################# 570 | # ADD MESSAGE DETAILS INTO POSTGRES 571 | ############################################# 572 | inserted_subject_id = None 573 | inserted_from_id = None 574 | inserted_header_id = None 575 | 576 | try: 577 | # Check If MSG ID already in db 578 | db_cursor = configuration.db_connection.cursor() 579 | parsed_message_id = removeNonAscii(parsed_message_id) 580 | query = f"select count(*) from all_messages.{group_name_fin_db}_headers where msg_id='" + parsed_message_id + "';" 581 | db_cursor.execute(query) 582 | msg_exist = db_cursor.fetchone()[0] 583 | #print("message_exists:") 584 | #print(msg_exist) 585 | db_cursor.close() 586 | except Exception: 587 | #print("Exception #: 9") 588 | print(query) 589 | db_cursor.close() 590 | try: 591 | # Check If MSG ID already in db 592 | db_cursor = configuration.db_connection.cursor() 593 | 594 | query = f"select count(*) from all_messages.{group_name_fin_db}_headers where msg_id='{parsed_message_id}'" 595 | db_cursor.execute(query) 596 | msg_exist = db_cursor.fetchone()[0] 597 | #print("message_exists:") 598 | #print(msg_exist) 599 | db_cursor.close() 600 | except Exception: 601 | print("Passing: " + parsed_message_id) 602 | #print("Exception #: 10") 603 | db_cursor.close() 604 | msg_exist = False 605 | pass 606 | 607 | 608 | # Continue only if MSG not in the headers db 609 | if msg_exist == 0: 610 | #try: 611 | try: 612 | # Add a unique subject line 613 | sql = f"INSERT INTO all_messages.{group_name_fin_db}_subjects(subject) VALUES ((%s)) ON CONFLICT(subject) DO UPDATE SET subject=(%s) RETURNING id" 614 | db_cursor = configuration.db_connection.cursor() 615 | db_cursor.execute(sql, (parsed_subject, parsed_subject)) 616 | configuration.db_connection.commit() 617 | inserted_subject_id = db_cursor.fetchone()[0] 618 | db_cursor.close() 619 | except Exception: 620 | #print("Exception #: 11") 621 | db_cursor.close() 622 | #exit() 623 | if inserted_subject_id is None: 624 | try: 625 | parsed_subject = parsed_subject.encode("ascii", "ignore").decode() 626 | parsed_subject = re.sub(r'[^\x00-\x7f]', r'', parsed_subject) 627 | sql = f"INSERT INTO all_messages.{group_name_fin_db}_subjects(subject) VALUES ((%s)) ON CONFLICT(subject) DO UPDATE SET subject=(%s) RETURNING id" 628 | db_cursor = configuration.db_connection.cursor() 629 | db_cursor.execute(sql, (parsed_subject, parsed_subject)) 630 | configuration.db_connection.commit() 631 | inserted_subject_id = db_cursor.fetchone()[0] 632 | db_cursor.close() 633 | except Exception: 634 | #print("Exception #: 12") 635 | db_cursor.close() 636 | #exit() 637 | try: 638 | parsed_subject = re.sub(r'[^\x00-\x7f]', r'', parsed_subject_original) 639 | sql = f"INSERT INTO all_messages.{group_name_fin_db}_subjects(subject) VALUES ((%s)) ON CONFLICT(subject) DO UPDATE SET subject=(%s) RETURNING id" 640 | db_cursor = configuration.db_connection.cursor() 641 | db_cursor.execute(sql, (parsed_subject, parsed_subject)) 642 | configuration.db_connection.commit() 643 | inserted_subject_id = db_cursor.fetchone()[0] 644 | db_cursor.close() 645 | except Exception: 646 | #print("Exception #: 13") 647 | db_cursor.close() 648 | #exit() 649 | pass 650 | 651 | 652 | try: 653 | # Add a unique from line 654 | sql = f"INSERT INTO all_messages.{group_name_fin_db}_from(data) VALUES (%s) ON CONFLICT(data) DO UPDATE SET data=(%s) RETURNING id" 655 | db_cursor = configuration.db_connection.cursor() 656 | db_cursor.execute(sql, (parsed_from, parsed_from)) 657 | configuration.db_connection.commit() 658 | inserted_from_id = db_cursor.fetchone()[0] 659 | db_cursor.close() 660 | except Exception: 661 | #print("Exception #: 13") 662 | print(group_name_fin_db) 663 | db_cursor.close() 664 | if inserted_from_id is None: 665 | try: 666 | parsed_from = parsed_from.encode("ascii", "ignore").decode() 667 | parsed_from = re.sub(r'[^\x00-\x7f]', r'', parsed_from) 668 | #print("Exception #: 14") 669 | print(parsed_from) 670 | sql = f"INSERT INTO all_messages.{group_name_fin_db}_from(data) VALUES (%s) ON CONFLICT(data) DO UPDATE SET data=(%s) RETURNING id" 671 | db_cursor = configuration.db_connection.cursor() 672 | db_cursor.execute(sql, (parsed_from, parsed_from)) 673 | configuration.db_connection.commit() 674 | inserted_from_id = db_cursor.fetchone()[0] 675 | db_cursor.close() 676 | except Exception: 677 | try: 678 | db_cursor.close() 679 | parsed_from = re.sub(r'[^\x00-\x7f]', r'', parsed_from_original) 680 | #print("Exception #: 15") 681 | print(parsed_from) 682 | sql = f"INSERT INTO all_messages.{group_name_fin_db}_from(data) VALUES (%s) ON CONFLICT(data) DO UPDATE SET data=(%s) RETURNING id" 683 | db_cursor = configuration.db_connection.cursor() 684 | db_cursor.execute(sql, (parsed_from, parsed_from)) 685 | configuration.db_connection.commit() 686 | inserted_from_id = db_cursor.fetchone()[0] 687 | db_cursor.close() 688 | except Exception: 689 | try: 690 | db_cursor.close() 691 | parsed_from = re.search(r'<(.*?)>', parsed_from).group(1) 692 | #print("Exception #: 16") 693 | print(parsed_from) 694 | sql = f"INSERT INTO all_messages.{group_name_fin_db}_from(data) VALUES ('{parsed_from}') ON CONFLICT(data) DO UPDATE SET data=('{parsed_from}') RETURNING id" 695 | print(sql) 696 | db_cursor = configuration.db_connection.cursor() 697 | print("ss1") 698 | db_cursor.execute(sql) 699 | print("ss2") 700 | configuration.db_connection.commit() 701 | print("ss3") 702 | inserted_from_id = db_cursor.fetchone()[0] 703 | print(inserted_from_id) 704 | db_cursor.close() 705 | except Exception: 706 | #print("Exception #: 17a") 707 | #exit() 708 | pass 709 | # Add a header info - pass in the subject line id from the previous statement 710 | # 711 | if not inserted_from_id: 712 | print("I couldn't get inserted_from_id!") 713 | exit() 714 | 715 | if parsed_ref: 716 | has_ref = 1 717 | else: 718 | has_ref = 0 719 | 720 | try: 721 | sql = f"INSERT INTO all_messages.{group_name_fin_db}_headers(dateparsed, subj_id, ref, msg_id, msg_from, enc, contype) VALUES ((%s), (%s), (%s), (%s), (%s), (%s), (%s)) RETURNING id" 722 | db_cursor = configuration.db_connection.cursor() 723 | db_cursor.execute(sql, ( 724 | parsed_date, inserted_subject_id, has_ref, parsed_message_id, inserted_from_id, parsed_encoding, 725 | parsed_content_type)) 726 | configuration.db_connection.commit() 727 | inserted_header_id = db_cursor.fetchone()[0] 728 | db_cursor.close() 729 | except Exception: 730 | #print("Exception #: 16a") 731 | db_cursor.close() 732 | #exit() 733 | print('Duplicate MSG ID: ' + parsed_message_id) 734 | 735 | continue 736 | 737 | if parsed_ref: 738 | split_refs = parsed_ref.split(' ') 739 | for split in split_refs: 740 | try: 741 | sql = f"INSERT INTO all_messages.{group_name_fin_db}_refs(id, ref_msg) VALUES ((%s), (%s));" 742 | db_cursor = configuration.db_connection.cursor() 743 | db_cursor.execute(sql, (inserted_header_id, split.strip())) 744 | configuration.db_connection.commit() 745 | db_cursor.close() 746 | except Exception: 747 | #print("Exception #: 17") 748 | db_cursor.close() 749 | #exit() 750 | pass 751 | try: 752 | sql = f"INSERT INTO all_messages.{group_name_fin_db}_body(id,data) VALUES ((%s), (%s))" 753 | db_cursor = configuration.db_connection.cursor() 754 | db_cursor.execute(sql, (inserted_header_id, parsed_body_text)) 755 | configuration.db_connection.commit() 756 | db_cursor.close() 757 | except Exception: 758 | #print("Exception #: 18") 759 | db_cursor.close() 760 | try: 761 | parsed_body_text = parsed_body_text.encode("ascii", "ignore").decode() 762 | parsed_body_text = re.sub(r'[^\x00-\x7f]', r'', parsed_body_text) 763 | sql = f"INSERT INTO all_messages.{group_name_fin_db}_body(id,data) VALUES ((%s), (%s))" 764 | db_cursor = configuration.db_connection.cursor() 765 | db_cursor.execute(sql, (inserted_header_id, parsed_body_text)) 766 | configuration.db_connection.commit() 767 | db_cursor.close() 768 | except Exception: 769 | #print("Exception #: 19") 770 | db_cursor.close() 771 | #parsed_body_text = parsed_body_text_original.encode('utf-8', 'surrogateescape').decode('ANSI') 772 | try: 773 | parsed_body_text = re.sub(r'[^\x00-\x7f]', r'', parsed_body_text) 774 | sql = f"INSERT INTO all_messages.{group_name_fin_db}_body(id,data) VALUES ((%s), (%s))" 775 | db_cursor = configuration.db_connection.cursor() 776 | db_cursor.execute(sql, (inserted_header_id, parsed_body_text)) 777 | configuration.db_connection.commit() 778 | db_cursor.close() 779 | except Exception: 780 | #print("Exception #: 19") 781 | db_cursor.close() 782 | continue 783 | 784 | 785 | # except Exception as err: 786 | # #print("Exception #: 20") 787 | # print("------------------------") 788 | # print("-*-" + str(sql) + "-*-") 789 | # print("-*-" + str(parsed_message_id) + "-*-") 790 | # print("-*-" + str(parsed_date) + "-*-") 791 | # print("-*-" + str(parsed_from) + "-*-") 792 | # print("-*-" + str(parsed_subject) + "-*-") 793 | # print("-*-" + str(parsed_ref) + "-*-") 794 | # print("-*-" + str(parsed_encoding) + "-*-") 795 | # print("-*-" + str(parsed_content_type) + "-*-") 796 | # print("-*-" + str(parsed_body_text) + "-*-") 797 | # print("------------------------") 798 | # print_psycopg2_exception(err) 799 | # print(str(processing_message_counter) + " - " + str(err)) 800 | # print("------------------------") 801 | 802 | 803 | all_count = int(mbox._next_key) 804 | # group_name_fin = file_name 805 | # update DB - marked file as not being processed anymore 806 | #print("Final Group Name: " + group_name_fin) 807 | group_name_fin = re.sub('\s+',' ',group_name_fin) 808 | #print("Final Group Name 2: " + group_name_fin) 809 | 810 | if processing_message_counter == all_count: 811 | 812 | try: 813 | sql = f"INSERT INTO all_messages.__all_files(file_name, current, total, processing, newsgroup_name) VALUES ('{filename}',{processing_message_counter},{all_count},0,'{group_name_fin}') ON CONFLICT (file_name) DO UPDATE SET current={processing_message_counter}, total={all_count}, processing=0" 814 | #print(sql) 815 | db_cursor = configuration.db_connection.cursor() 816 | db_cursor.execute(sql) 817 | #configuration.db_connection.commit() 818 | db_cursor.close() 819 | except Exception as err: 820 | #print("Exception #: 21") 821 | db_cursor.close() 822 | exit() 823 | else: 824 | try: 825 | sql = f"INSERT INTO all_messages.__all_files(file_name, current, total, processing, newsgroup_name) VALUES ('{filename}',{processing_message_counter},{all_count},1,'{group_name_fin}') ON CONFLICT (file_name) DO UPDATE SET current={processing_message_counter}, total={all_count}, processing=1" 826 | #print(sql) 827 | db_cursor = configuration.db_connection.cursor() 828 | db_cursor.execute(sql) 829 | #configuration.db_connection.commit() 830 | db_cursor.close() 831 | except Exception as err: 832 | #print("Exception #: 22") 833 | db_cursor.close() 834 | exit() 835 | 836 | 837 | 838 | 839 | 840 | # remove temp file 841 | if os.path.exists(where2unzip): 842 | mbox.unlock() 843 | mbox.close() 844 | 845 | try: 846 | os.remove(where2unzip) 847 | print("** TEMP file removed: " + where2unzip) 848 | except Exception: 849 | print(Exception) 850 | 851 | try: 852 | f.close() 853 | shutil.move(f, configuration.processed_path + '/' + filename + '.gz') 854 | print('Moving File to ' + configuration.processed_path + filename + '.gz') 855 | except Exception: 856 | print(Exception) 857 | 858 | 859 | else: 860 | print("The file does not exist: " + where2unzip) 861 | -------------------------------------------------------------------------------- /nntp.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import nntplib 3 | import os 4 | import quopri 5 | import random 6 | import re 7 | import string 8 | from datetime import date 9 | import dateutil.parser 10 | from bs4 import UnicodeDammit 11 | import configuration 12 | 13 | today = date.today() 14 | print("** START **") 15 | print("Starting at:", today) 16 | 17 | 18 | nntp_connection = configuration.nntp_connection 19 | 20 | db_cursor = configuration.db_connection.cursor() 21 | 22 | 23 | response, groups = nntp_connection.list() 24 | 25 | print("Total Number of groups:", len(groups)) 26 | 27 | b = 0 28 | group_name_fin = "" 29 | count_minutes = 0 30 | 31 | for x in range(len(groups)): 32 | groupName = "" 33 | if ((groups[x].group.startswith("comp.") is True) \ 34 | or groups[x].group.startswith("humanities.") is True \ 35 | or groups[x].group.startswith("microsoft.") is True \ 36 | or groups[x].group.startswith("news.") is True \ 37 | or groups[x].group.startswith("rec.") is True \ 38 | or groups[x].group.startswith("sci.") is True \ 39 | or groups[x].group.startswith("soc.") is True \ 40 | or groups[x].group.startswith("talk.") is True)\ 41 | and ((int(groups[x].last) - int(groups[x].first)) > configuration.syncGroupsOverNumPosts)\ 42 | and "binaries" not in groups[x].group: 43 | b = b + 1 44 | groupName = groups[x].group 45 | 46 | file_name = "" 47 | current_position_in_db = 0 48 | last_message_count = 0 49 | is_file_being_processed = 0 50 | processing_message_counter = 0 51 | processing_message_counter2 = 0 52 | group_name_fin = "" 53 | 54 | try: 55 | sql = f"SELECT * FROM all_messages.__all_files WHERE file_name = '{groupName}' LIMIT 1" 56 | db_cursor.execute(sql) 57 | details = db_cursor.fetchone() 58 | file_name = details[0] 59 | current_position_in_db = details[1] 60 | last_message_count = details[2] 61 | is_file_being_processed = details[3] 62 | # db_cursor.close() 63 | except Exception: 64 | file_name = "" 65 | current_position_in_db = 0 66 | last_message_count = 0 67 | is_file_being_processed = 0 68 | # print("Exception #: 1") 69 | # db_cursor.close() 70 | 71 | print(b, groupName, groups[x].first, groups[x].last, current_position_in_db, last_message_count) 72 | if (file_name == "") or (current_position_in_db == 0 and last_message_count == 0 73 | and is_file_being_processed == 0) or (current_position_in_db < last_message_count 74 | and is_file_being_processed == 0): 75 | 76 | group_name_fin_db = groupName.replace(".", "_").replace("-", "_").replace("+", "_") 77 | if len(group_name_fin_db) > 45: 78 | group_name_fin_db = group_name_fin_db[-45:] 79 | 80 | try: 81 | # db_cursor.close() 82 | sql = f"INSERT INTO all_messages.__all_files(file_name, current, total, processing, newsgroup_name) VALUES ('{groupName}', 0, 0 ,1,'{groupName}') ON CONFLICT (file_name) DO UPDATE SET processing=1" 83 | # sql = "INSERT INTO all_messages.__all_files(file_name, current, total, processing, newsgroup_name) VALUES ('sci.homebrew.20140221.mbox', 0, 0 ,1,'sci.homebrew') ON CONFLICT (file_name) DO UPDATE SET processing=1" 84 | db_cursor.execute(sql) 85 | configuration.db_connection.commit() 86 | # db_cursor.close() 87 | except Exception: 88 | # print("Exception #: 3") 89 | # db_cursor.close() 90 | # exit() 91 | pass 92 | 93 | # Create tables for a new group 94 | # db_cursor.close() 95 | # db_cursor = configuration.db_connection.cursor() 96 | sql = f"select exists(select * from information_schema.tables where table_name='{group_name_fin_db}_headers')" 97 | db_cursor.execute(sql) 98 | exist = db_cursor.fetchone()[0] 99 | # db_cursor.close() 100 | 101 | if not exist: 102 | try: 103 | # db_cursor.close() 104 | sql = f"create table all_messages.{group_name_fin_db}_headers(id bigserial not null constraint {group_name_fin_db}_headers_pk primary key, dateparsed timestamp, subj_id bigint, ref smallint, msg_id text, msg_from bigint, enc text, contype text, processed timestamp default CURRENT_TIMESTAMP);alter table all_messages.{group_name_fin_db}_headers owner to postgres;" 105 | # db_cursor = configuration.db_connection.cursor() 106 | db_cursor.execute(sql) 107 | configuration.db_connection.commit() 108 | # db_cursor.close() 109 | 110 | sql = f"create table all_messages.{group_name_fin_db}_refs(id bigint, ref_msg text default null);alter table all_messages.{group_name_fin_db}_refs owner to postgres;" 111 | # db_cursor = configuration.db_connection.cursor() 112 | db_cursor.execute(sql) 113 | configuration.db_connection.commit() 114 | # db_cursor.close() 115 | 116 | sql = f"create table all_messages.{group_name_fin_db}_body(id bigint primary key, data text default null);alter table all_messages.{group_name_fin_db}_body owner to postgres;" 117 | # db_cursor = configuration.db_connection.cursor() 118 | db_cursor.execute(sql) 119 | configuration.db_connection.commit() 120 | # db_cursor.close() 121 | 122 | sql = f"create table all_messages.{group_name_fin_db}_from(id serial not null constraint {group_name_fin_db}_from_pk primary key, data text);alter table all_messages.{group_name_fin_db}_from owner to postgres;" 123 | # db_cursor = configuration.db_connection.cursor() 124 | db_cursor.execute(sql) 125 | configuration.db_connection.commit() 126 | # db_cursor.close() 127 | 128 | sql = f"create table all_messages.{group_name_fin_db}_subjects(id serial not null constraint {group_name_fin_db}_subjects_pk primary key, subject text);alter table all_messages.{group_name_fin_db}_subjects owner to postgres;" 129 | # db_cursor = configuration.db_connection.cursor() 130 | db_cursor.execute(sql) 131 | configuration.db_connection.commit() 132 | # db_cursor.close() 133 | 134 | sql = f"create unique index {group_name_fin_db}_headers_uiidx on all_messages.{group_name_fin_db}_headers(id);" \ 135 | f"create unique index {group_name_fin_db}_headers_umidx on all_messages.{group_name_fin_db}_headers(msg_id);" \ 136 | f"create unique index {group_name_fin_db}_body_idx on all_messages.{group_name_fin_db}_body(id);" \ 137 | f"create unique index {group_name_fin_db}_from_idx on all_messages.{group_name_fin_db}_from(data);" \ 138 | f"create unique index {group_name_fin_db}_subjects_idx on all_messages.{group_name_fin_db}_subjects(subject);" 139 | # db_cursor = configuration.db_connection.cursor() 140 | db_cursor.execute(sql) 141 | configuration.db_connection.commit() 142 | # db_cursor.close() 143 | except Exception: 144 | pass 145 | 146 | 147 | def groupnum(number): 148 | s = '%d' % number 149 | groups = [] 150 | while s and s[-1].isdigit(): 151 | groups.append(s[-3:]) 152 | s = s[:-3] 153 | return s + ','.join(reversed(groups)) 154 | 155 | 156 | def find_between(s, first, last): 157 | try: 158 | start = s.index(first) + len(first) 159 | end = s.index(last, start) 160 | return s[start:end] 161 | except ValueError: 162 | return "" 163 | 164 | 165 | def removeNonAscii(s): 166 | return "".join(i for i in s if ord(i) < 126 and ord(i) > 31) 167 | 168 | 169 | def clean_string(header_part, encoding): 170 | orig_header_part = header_part 171 | header_part = header_part.rstrip(os.linesep).replace("\n", "") 172 | encoding_quoted = encoding 173 | 174 | if '?q?' in header_part: 175 | encoding_quoted = find_between(header_part, '=?', '?') 176 | header_part = header_part.split("?q?", 1)[1] # .replace("_", " ") 177 | # header_part = find_between(header_part, 'q?', '?').replace("_", " ") 178 | elif '?Q?' in header_part: 179 | encoding_quoted = find_between(header_part, '=?', '?') 180 | header_part = header_part.split("?Q?", 1)[1] # .replace("_", " ") 181 | elif '?b?' in header_part: 182 | encoding_quoted = find_between(header_part, '=?', '?') 183 | header_part = header_part.split("?b?", 1)[1] # .replace("_", " ") 184 | try: 185 | header_part = base64.b64decode(header_part) 186 | except Exception: 187 | try: 188 | header_part = base64.b64decode(header_part) 189 | except Exception: 190 | header_part = orig_header_part 191 | elif '?B?' in header_part: 192 | encoding_quoted = find_between(header_part, '=?', '?') 193 | header_part = header_part.split("?B?", 1)[1] # .replace("_", " ") 194 | try: 195 | header_part = base64.b64decode(header_part) 196 | except Exception: 197 | try: 198 | header_part = base64.b64decode(header_part) 199 | except Exception: 200 | header_part = orig_header_part 201 | 202 | if 'unknown' in encoding_quoted: 203 | encoding_quoted = encoding 204 | elif 'x-user-defined' in encoding_quoted: 205 | encoding_quoted = encoding 206 | 207 | try: 208 | header_part = quopri.decodestring(header_part).decode(encoding_quoted) 209 | return header_part 210 | except Exception: 211 | try: 212 | header_part = quopri.decodestring(header_part).decode(encoding) 213 | return header_part 214 | except Exception: 215 | try: 216 | dammit = UnicodeDammit(header_part) 217 | if dammit.original_encoding: 218 | header_part = quopri.decodestring(header_part).decode(dammit.original_encoding) 219 | return header_part 220 | else: 221 | header_part = quopri.decodestring(header_part).decode('ascii') 222 | return header_part 223 | except Exception: 224 | try: 225 | header_part = quopri.decodestring(header_part).decode("ansi") 226 | return header_part 227 | except Exception: 228 | try: 229 | header_part = header_part.encode('utf8', 'surrogatepass').decode('utf8', 230 | 'surrogatepass') 231 | return header_part 232 | except Exception: 233 | return "" 234 | 235 | 236 | resp, count, first, last, name = nntp_connection.group(groupName) 237 | all_count = last 238 | count_really_inserted = 0 239 | count_emptybody_inserted = 0 240 | 241 | if first < current_position_in_db: 242 | first = current_position_in_db 243 | print('Group', name, 'has', count, 'articles, range', first, 'to', last) 244 | 245 | 246 | # if (first+configuration.syncInTiersOfNumPosts) >= all_count: 247 | # end = all_count 248 | # else: 249 | # end = first+configuration.syncInTiersOfNumPosts 250 | 251 | end = all_count 252 | print("Processing: ", first, end) 253 | 254 | try: 255 | resp, overviews = nntp_connection.over((first, end)) 256 | except Exception as inst: 257 | print(inst) 258 | 259 | # (5, {'subject': 'Re: Nominate Hirokazu Yamamoto (oceancity) for commit privs.', 'from': 'Jeroen Ruigrok van der Werven ', 'date': 'Mon, 11 Aug 2008 22:15:34 +0200', 'message-id': '<20080811201534.GL57679@nexus.in-nomine.org>', 'references': '<6167796BFEB5D0438720AC212E89A6B0078F4D64@exchange.onresolve.com> <48A0995D.6010902@v.loewis.de>', ':bytes': '5100', ':lines': '14', 'xref': 'news.gmane.org gmane.comp.python.committers:5'}) 260 | for id, over in overviews: 261 | processing_message_counter = id 262 | processing_message_counter2 = processing_message_counter2 + 1 263 | 264 | if processing_message_counter % 1000 == 0: 265 | percentage = round(100 * float(processing_message_counter) / float(all_count), 2) 266 | 267 | # Show how many messsages we're processing per minute 268 | # sql_count = "SELECT COUNT(*) FROM all_messages.headers WHERE processed >= (now() - INTERVAL \'1 MINUTE\')" 269 | # sql_count = "SELECT COUNT(*) FROM all_messages.headers" 270 | try: 271 | # db_cursor.close() 272 | sql = f"SELECT COUNT(*) FROM all_messages.{group_name_fin_db}_headers WHERE processed >= (now() - INTERVAL '1 MINUTE')" 273 | # db_cursor = configuration.db_connection.cursor() 274 | db_cursor.execute(sql) 275 | messages_per_minute1 = db_cursor.fetchone()[0] 276 | # db_cursor.close() 277 | except Exception: 278 | # print("Exception #: 5") 279 | # db_cursor.close() 280 | messages_per_minute1 = 0 281 | # exit() 282 | 283 | try: 284 | # db_cursor.close() 285 | sql = f"INSERT INTO all_messages.__all_updates(groupname,perminute) VALUES ((%s), (%s))" 286 | # db_cursor = configuration.db_connection.cursor() 287 | db_cursor.execute(sql, (groupName, messages_per_minute1)) 288 | configuration.db_connection.commit() 289 | # db_cursor.close() 290 | except Exception as err: 291 | # print("Exception #: 6") 292 | # db_cursor.close() 293 | # exit() 294 | pass 295 | 296 | # Delete all execept last 100 most recent update messages 297 | try: 298 | # db_cursor.close() 299 | sql = f"DELETE FROM all_messages.__all_updates WHERE id <= (SELECT id FROM (SELECT id FROM all_messages.__all_updates ORDER BY id DESC LIMIT 1 OFFSET 10000) foo);" 300 | # db_cursor = configuration.db_connection.cursor() 301 | db_cursor.execute(sql) 302 | # db_cursor.close() 303 | except Exception: 304 | # print("Exception #: 7") 305 | # db_cursor.close() 306 | # exit() 307 | pass 308 | 309 | try: 310 | # db_cursor.close() 311 | sql = f"select SUM(perminute) from all_messages.__all_updates where id in (SELECT MAX(id) as t FROM all_messages.__all_updates WHERE tstamp >= (now() - INTERVAL '1 MINUTE') group by groupname);" 312 | # db_cursor = configuration.db_connection.cursor() 313 | db_cursor.execute(sql) 314 | messages_per_minute1 = db_cursor.fetchone()[0] 315 | # db_cursor.close() 316 | if not messages_per_minute1: 317 | messages_per_minute1 = 0 318 | print(groupName + ": " + str(processing_message_counter) + " of " + str( 319 | all_count) + " (" + str(percentage) + "%) | " + str( 320 | groupnum(messages_per_minute1)) + " msgs/min (" + str( 321 | groupnum(messages_per_minute1 * 60)) + " hr, " + str( 322 | groupnum(messages_per_minute1 * 60 * 24)) + " day, " + str( 323 | parsed_date) + " - Added: " + str(count_really_inserted) + " - Empty Body Added: " + str(count_emptybody_inserted)) 324 | except Exception: 325 | # print("Exception #: 8") 326 | # db_cursor.close() 327 | # exit() 328 | pass 329 | # something is wrong, too many empty body messages inserted - break the run 330 | if count_emptybody_inserted>10: 331 | print("Empty Body Inserted Count: ", count_emptybody_inserted) 332 | exit(0) 333 | 334 | # RESET ALL VARS 335 | parsed_encoding = "utf-8" 336 | parsed_content_type = None 337 | parsed_message_id = None 338 | parsed_date = None 339 | parsed_subject = None 340 | parsed_subject_original = None 341 | parsed_ref = None 342 | parsed_body_text = "" 343 | parsed_body_text_original = None 344 | parsed_from = None 345 | parsed_from_original = None 346 | has_ref = 0 347 | 348 | # Get the rest 349 | try: 350 | parsed_date = nntplib.decode_header(over['date']) 351 | except Exception: 352 | pass 353 | 354 | try: 355 | parsed_content_type = nntplib.decode_header(over['content-type']) 356 | except Exception: 357 | pass 358 | 359 | try: 360 | parsed_ref = nntplib.decode_header(over['references']) 361 | except Exception: 362 | pass 363 | 364 | try: 365 | parsed_subject = nntplib.decode_header(over['subject']) 366 | except Exception: 367 | pass 368 | 369 | try: 370 | parsed_subject_original = nntplib.decode_header(over['subject']) 371 | except Exception: 372 | pass 373 | 374 | try: 375 | parsed_message_id = nntplib.decode_header(over['message-id']) 376 | except Exception: 377 | pass 378 | 379 | try: 380 | parsed_from = nntplib.decode_header(over['from']) 381 | except Exception: 382 | pass 383 | 384 | try: 385 | if '(' in parsed_date: 386 | parsed_date = parsed_date.split("(")[0].strip() 387 | else: 388 | parsed_date = parsed_date.strip() 389 | except Exception: 390 | pass 391 | 392 | failing_zones_to_check = ['-13', '-14', '-15', '-16', '-17', '-18', '-19', '-20', '-21', '-22', 393 | '-23', '-24', '+15', '+16', '+17', '+18', '+19', '+20', '+21', '+22', 394 | '+23', '+24'] 395 | try: 396 | for failedzone in failing_zones_to_check: 397 | if failedzone in parsed_date: 398 | parsed_date = parsed_date.split(failedzone)[0] 399 | print('Fixed: ' + parsed_date + ' | ' + failedzone) 400 | break 401 | else: 402 | parsed_date = dateutil.parser.parse(parsed_date, tzinfos=configuration.timezone_info) 403 | except Exception: 404 | try: 405 | # Try to parse/convert NNTP-Posting-Date 406 | parsed_date = nntplib.decode_header(over['NNTP-Posting-Date']) 407 | for failedzone in failing_zones_to_check: 408 | if failedzone in parsed_date: 409 | parsed_date = parsed_date.split(failedzone)[0] 410 | print('Fixed NNTP: ' + parsed_date + ' | ' + failedzone) 411 | break 412 | else: 413 | parsed_date = dateutil.parser.parse(parsed_date, tzinfos=configuration.timezone_info) 414 | except Exception: 415 | # new_headers.append(tuple(("odate", value))) 416 | continue 417 | 418 | if parsed_ref: 419 | parsed_ref = clean_string(parsed_ref, parsed_encoding) 420 | else: 421 | parsed_ref = "" 422 | 423 | # DATA CLEAN UP - message_id 424 | if parsed_message_id: 425 | parsed_message_id = clean_string(parsed_message_id, parsed_encoding) 426 | parsed_message_id = parsed_message_id.replace("'", "") 427 | parsed_message_id = parsed_message_id.replace(" ", "").replace('\n', ' ').replace('\r', '') 428 | else: 429 | parsed_message_id = ''.join(random.choices(string.ascii_letters + string.digits, k=16)) 430 | 431 | # DATA CLEAN UP - message_subject 432 | stringType = None 433 | if parsed_subject: 434 | parsed_subject = clean_string(parsed_subject, parsed_encoding) 435 | if '\\u' in ascii(parsed_subject): 436 | parsed_subject = ascii(parsed_subject) 437 | if len(parsed_subject) > 250: 438 | parsed_subject = parsed_subject.split("=?")[0] 439 | 440 | 441 | # DATA CLEAN UP - message_subject 442 | if parsed_from: 443 | parsed_from = clean_string(parsed_from, parsed_encoding) 444 | 445 | ############################################# 446 | # ADD MESSAGE DETAILS INTO POSTGRES 447 | ############################################# 448 | inserted_subject_id = None 449 | inserted_from_id = None 450 | inserted_header_id = None 451 | msg_exist = 0 452 | try: 453 | # Check If MSG ID already in db 454 | # db_cursor.close() 455 | # db_cursor = configuration.db_connection.cursor() 456 | parsed_message_id = removeNonAscii(parsed_message_id) 457 | query = f"select id from all_messages.{group_name_fin_db}_headers where msg_id='{parsed_message_id}';" 458 | db_cursor.execute(query) 459 | inserted_header_id = db_cursor.fetchone()[0] 460 | if inserted_header_id: 461 | msg_exist = 1 462 | else: 463 | inserted_header_id = None 464 | 465 | # db_cursor.close() 466 | except Exception: 467 | # print("Exception #: 9") 468 | # print(query) 469 | # db_cursor.close() 470 | try: 471 | # Check If MSG ID already in db 472 | # db_cursor = configuration.db_connection.cursor() 473 | 474 | query = f"select id from all_messages.{group_name_fin_db}_headers where msg_id='{parsed_message_id}'" 475 | db_cursor.execute(query) 476 | inserted_header_id = db_cursor.fetchone()[0] 477 | if inserted_header_id: 478 | msg_exist = 1 479 | else: 480 | inserted_header_id = None 481 | # print("message_exists:") 482 | # print(msg_exist) 483 | # db_cursor.close() 484 | except Exception: 485 | #print("Passing: " + parsed_message_id) 486 | # print("Exception #: 10") 487 | # db_cursor.close() 488 | msg_exist = 0 489 | pass 490 | 491 | bodyIsEmpty = 0 492 | bodyExist = 1 493 | try: 494 | # If Msg exists, let's find if body is empty 495 | if msg_exist == 1: 496 | query = f"select count(*) from all_messages.{group_name_fin_db}_body where id={inserted_header_id} and data=''" 497 | db_cursor.execute(query) 498 | bodyIsEmpty = db_cursor.fetchone()[0] 499 | except Exception: 500 | pass 501 | 502 | try: 503 | # If Msg exists, let's find if there is any body entry for this message 504 | if msg_exist == 1: 505 | query = f"select count(*) from all_messages.{group_name_fin_db}_body where id={inserted_header_id}" 506 | db_cursor.execute(query) 507 | bodyExist = db_cursor.fetchone()[0] 508 | except Exception: 509 | pass 510 | 511 | #if bodyIsEmpty == 1: print("Message Exists but has Empty Body: ", parsed_message_id, query) 512 | 513 | #if bodyExist == 0: print("Message Exists but Body Not Exists: ", parsed_message_id, query) 514 | 515 | # Continue only if MSG not in the headers db 516 | if msg_exist == 0 or bodyIsEmpty == 1 or bodyExist == 0: 517 | 518 | # try: 519 | try: 520 | # db_cursor.close() 521 | # Add a unique subject line 522 | sql = f"INSERT INTO all_messages.{group_name_fin_db}_subjects(subject) VALUES ((%s)) ON CONFLICT(subject) DO UPDATE SET subject=(%s) RETURNING id" 523 | # db_cursor = configuration.db_connection.cursor() 524 | db_cursor.execute(sql, (parsed_subject, parsed_subject)) 525 | configuration.db_connection.commit() 526 | inserted_subject_id = db_cursor.fetchone()[0] 527 | # db_cursor.close() 528 | except Exception: 529 | # print("Exception #: 11") 530 | # db_cursor.close() 531 | # exit() 532 | if inserted_subject_id is None: 533 | try: 534 | # db_cursor.close() 535 | parsed_subject = parsed_subject.encode("ascii", "ignore").decode() 536 | parsed_subject = re.sub(r'[^\x00-\x7f]', r'', parsed_subject) 537 | sql = f"INSERT INTO all_messages.{group_name_fin_db}_subjects(subject) VALUES ((%s)) ON CONFLICT(subject) DO UPDATE SET subject=(%s) RETURNING id" 538 | # db_cursor = configuration.db_connection.cursor() 539 | db_cursor.execute(sql, (parsed_subject, parsed_subject)) 540 | configuration.db_connection.commit() 541 | inserted_subject_id = db_cursor.fetchone()[0] 542 | # db_cursor.close() 543 | except Exception: 544 | # print("Exception #: 12") 545 | # db_cursor.close() 546 | # exit() 547 | try: 548 | # db_cursor.close() 549 | parsed_subject = re.sub(r'[^\x00-\x7f]', r'', parsed_subject_original) 550 | sql = f"INSERT INTO all_messages.{group_name_fin_db}_subjects(subject) VALUES ((%s)) ON CONFLICT(subject) DO UPDATE SET subject=(%s) RETURNING id" 551 | # db_cursor = configuration.db_connection.cursor() 552 | db_cursor.execute(sql, (parsed_subject, parsed_subject)) 553 | configuration.db_connection.commit() 554 | inserted_subject_id = db_cursor.fetchone()[0] 555 | # db_cursor.close() 556 | except Exception: 557 | # print("Exception #: 13") 558 | # db_cursor.close() 559 | # exit() 560 | pass 561 | 562 | try: 563 | # Add a unique from line 564 | sql = f"INSERT INTO all_messages.{group_name_fin_db}_from(data) VALUES ((%s)) ON CONFLICT(data) DO UPDATE SET data=(%s) RETURNING id" 565 | #db_cursor = configuration.db_connection.cursor() 566 | #print(parsed_from) 567 | db_cursor.execute(sql, (parsed_from, parsed_from)) 568 | configuration.db_connection.commit() 569 | inserted_from_id = db_cursor.fetchone()[0] 570 | except Exception: 571 | # print("Exception #: 13") 572 | print(group_name_fin_db, sql) 573 | if inserted_from_id is None: 574 | try: 575 | # db_cursor.close() 576 | parsed_from = parsed_from.encode("ascii", "ignore").decode() 577 | parsed_from = re.sub(r'[^\x00-\x7f]', r'', parsed_from) 578 | # print("Exception #: 14") 579 | print(parsed_from) 580 | sql = f"INSERT INTO all_messages.{group_name_fin_db}_from(data) VALUES (%s) ON CONFLICT(data) DO UPDATE SET data=(%s) RETURNING id" 581 | # db_cursor = configuration.db_connection.cursor() 582 | db_cursor.execute(sql, (parsed_from, parsed_from)) 583 | configuration.db_connection.commit() 584 | inserted_from_id = db_cursor.fetchone()[0] 585 | # db_cursor.close() 586 | except Exception: 587 | try: 588 | # db_cursor.close() 589 | parsed_from = re.sub(r'[^\x00-\x7f]', r'', parsed_from_original) 590 | # print("Exception #: 15") 591 | print(parsed_from) 592 | sql = f"INSERT INTO all_messages.{group_name_fin_db}_from(data) VALUES (%s) ON CONFLICT(data) DO UPDATE SET data=(%s) RETURNING id" 593 | # db_cursor = configuration.db_connection.cursor() 594 | db_cursor.execute(sql, (parsed_from, parsed_from)) 595 | configuration.db_connection.commit() 596 | inserted_from_id = db_cursor.fetchone()[0] 597 | # db_cursor.close() 598 | except Exception: 599 | try: 600 | # db_cursor.close() 601 | parsed_from = re.search(r'<(.*?)>', parsed_from).group(1) 602 | # print("Exception #: 16") 603 | #print(parsed_from) 604 | sql = f"INSERT INTO all_messages.{group_name_fin_db}_from(data) VALUES ('{parsed_from}') ON CONFLICT(data) DO UPDATE SET data=('{parsed_from}') RETURNING id" 605 | #print(sql) 606 | # db_cursor = configuration.db_connection.cursor() 607 | #print("ss1") 608 | db_cursor.execute(sql) 609 | #print("ss2") 610 | configuration.db_connection.commit() 611 | #print("ss3") 612 | inserted_from_id = db_cursor.fetchone()[0] 613 | #print(inserted_from_id) 614 | # db_cursor.close() 615 | except Exception: 616 | # print("Exception #: 17a") 617 | # exit() 618 | pass 619 | # Add a header info - pass in the subject line id from the previous statement 620 | # 621 | if not inserted_from_id: 622 | if bodyIsEmpty == 1 or bodyExist == 0: 623 | #print("I couldn't get inserted_from_id!") 624 | pass 625 | else: 626 | print("failure - no inserted_from_id", parsed_message_id) 627 | exit() 628 | 629 | 630 | if parsed_ref: 631 | has_ref = 1 632 | else: 633 | has_ref = 0 634 | 635 | 636 | if msg_exist == 0: 637 | try: 638 | # db_cursor.close() 639 | sql = f"INSERT INTO all_messages.{group_name_fin_db}_headers(dateparsed, subj_id, ref, msg_id, msg_from, enc, contype) VALUES ((%s), (%s), (%s), (%s), (%s), (%s), (%s)) RETURNING id" 640 | # db_cursor = configuration.db_connection.cursor() 641 | db_cursor.execute(sql, ( 642 | parsed_date, inserted_subject_id, has_ref, parsed_message_id, inserted_from_id, 643 | parsed_encoding, 644 | parsed_content_type)) 645 | configuration.db_connection.commit() 646 | inserted_header_id = db_cursor.fetchone()[0] 647 | # db_cursor.close() 648 | except Exception: 649 | # print("Exception #: 16a") 650 | # db_cursor.close() 651 | # exit() 652 | #print('Duplicate MSG ID: ' + parsed_message_id) 653 | pass 654 | 655 | continue 656 | 657 | if parsed_ref: 658 | split_refs = parsed_ref.split(' ') 659 | for split in split_refs: 660 | try: 661 | # db_cursor.close() 662 | sql = f"INSERT INTO all_messages.{group_name_fin_db}_refs(id, ref_msg) VALUES ((%s), (%s));" 663 | # db_cursor = configuration.db_connection.cursor() 664 | db_cursor.execute(sql, (inserted_header_id, split.strip())) 665 | configuration.db_connection.commit() 666 | # db_cursor.close() 667 | except Exception: 668 | # print("Exception #: 17") 669 | # db_cursor.close() 670 | # exit() 671 | pass 672 | 673 | try: 674 | # Get Body 675 | bodyError = 0 676 | try: 677 | resp, info = nntp_connection.body(over['message-id']) 678 | # header = nntp_connection.decode_header(nntp_connection.body(nntplib.decode_header(over['message-id']))) 679 | for line in info.lines: 680 | dammit = UnicodeDammit(line) 681 | parsed_encoding = dammit.original_encoding 682 | try: 683 | parsed_body_text += line.decode('utf-8') 684 | parsed_body_text += "\n" 685 | except Exception as e: 686 | try: 687 | parsed_body_text += line.decode(parsed_encoding) 688 | parsed_body_text += "\n" 689 | except Exception as e: 690 | parsed_body_text += re.sub(r'[^\x00-\x7f]', r'', line) 691 | parsed_body_text += "\n" 692 | except Exception as e: 693 | if "430" in str(e): 694 | #print(e) 695 | pass 696 | else: 697 | bodyError = 1 698 | #print("-------***********ERROR & LINE************------") 699 | #print(e) 700 | #print(line) 701 | pass 702 | 703 | 704 | #if bodyError == 1: 705 | #print("-------************MESSAGE ID + FULL BODY***************------") 706 | #print(parsed_message_id) 707 | #print(parsed_body_text) 708 | #print("-------***************************------") 709 | #exit(0) 710 | 711 | # db_cursor.close() 712 | sql = f"INSERT INTO all_messages.{group_name_fin_db}_body(id,data) VALUES ((%s), (%s)) ON CONFLICT (id) DO UPDATE SET data=(%s)" 713 | # db_cursor = configuration.db_connection.cursor() 714 | 715 | if len(parsed_body_text) > 0: 716 | db_cursor.execute(sql, (inserted_header_id, parsed_body_text, parsed_body_text)) 717 | configuration.db_connection.commit() 718 | # print(inserted_header_id, parsed_message_id, len(parsed_body_text)) 719 | count_really_inserted = count_really_inserted + 1 720 | else: 721 | count_emptybody_inserted = count_emptybody_inserted + 1 722 | # pass 723 | #print(f"{inserted_header_id} - NO BODY") 724 | # db_cursor.close() 725 | # print('Inserted:' + inserted_header_id) 726 | except Exception: 727 | # print("Exception #: 18") 728 | # db_cursor.close() 729 | try: 730 | parsed_body_text = parsed_body_text.encode("ascii", "ignore").decode() 731 | parsed_body_text = re.sub(r'[^\x00-\x7f]', r'', parsed_body_text) 732 | # db_cursor.close() 733 | sql = f"INSERT INTO all_messages.{group_name_fin_db}_body(id,data) VALUES ((%s), (%s))" 734 | # db_cursor = configuration.db_connection.cursor() 735 | db_cursor.execute(sql, (inserted_header_id, parsed_body_text)) 736 | configuration.db_connection.commit() 737 | # db_cursor.close() 738 | except Exception: 739 | # print("Exception #: 19") 740 | # db_cursor.close() 741 | # parsed_body_text = parsed_body_text_original.encode('utf-8', 'surrogateescape').decode('ANSI') 742 | try: 743 | # db_cursor.close() 744 | parsed_body_text = re.sub(r'[^\x00-\x7f]', r'', parsed_body_text) 745 | sql = f"INSERT INTO all_messages.{group_name_fin_db}_body(id,data) VALUES ((%s), (%s))" 746 | # db_cursor = configuration.db_connection.cursor() 747 | db_cursor.execute(sql, (inserted_header_id, parsed_body_text)) 748 | configuration.db_connection.commit() 749 | # db_cursor.close() 750 | except Exception: 751 | # print("Exception #: 19") 752 | # db_cursor.close() 753 | continue 754 | 755 | # except Exception as err: 756 | # #print("Exception #: 20") 757 | # print("------------------------") 758 | # print("-*-" + str(sql) + "-*-") 759 | # print("-*-" + str(parsed_message_id) + "-*-") 760 | # print("-*-" + str(parsed_date) + "-*-") 761 | # print("-*-" + str(parsed_from) + "-*-") 762 | # print("-*-" + str(parsed_subject) + "-*-") 763 | # print("-*-" + str(parsed_ref) + "-*-") 764 | # print("-*-" + str(parsed_encoding) + "-*-") 765 | # print("-*-" + str(parsed_content_type) + "-*-") 766 | # print("-*-" + str(parsed_body_text) + "-*-") 767 | # print("------------------------") 768 | # print_psycopg2_exception(err) 769 | # print(str(processing_message_counter) + " - " + str(err)) 770 | # print("------------------------") 771 | 772 | # group_name_fin = file_name 773 | # update DB - marked file as not being processed anymore 774 | # print("Final Group Name: " + group_name_fin) 775 | # group_name_fin = re.sub('\s+',' ',group_name_fin) 776 | # print("Final Group Name 2: " + group_name_fin) 777 | 778 | if processing_message_counter == all_count: 779 | 780 | try: 781 | # db_cursor.close() 782 | sql = f"INSERT INTO all_messages.__all_files(file_name, current, total, processing, newsgroup_name,tstamp) VALUES ('{groupName}',{processing_message_counter},{last},0,'{groupName}',now()) ON CONFLICT (file_name) DO UPDATE SET current={processing_message_counter}, total={last}, processing=0,tstamp = now()" 783 | # print(sql) 784 | # db_cursor = configuration.db_connection.cursor() 785 | db_cursor.execute(sql) 786 | # configuration.db_connection.commit() 787 | # db_cursor.close() 788 | except Exception as err: 789 | # print("Exception #: 21") 790 | # db_cursor.close() 791 | exit() 792 | else: 793 | try: 794 | # db_cursor.close() 795 | sql = f"INSERT INTO all_messages.__all_files(file_name, current, total, processing, newsgroup_name,tstamp) VALUES ('{groupName}',{processing_message_counter},{last},1,'{groupName}',now()) ON CONFLICT (file_name) DO UPDATE SET current={processing_message_counter}, total={last}, processing=1,tstamp = now()" 796 | # print(sql)tstamp 797 | # db_cursor = configuration.db_connection.cursor() 798 | db_cursor.execute(sql) 799 | configuration.db_connection.commit() 800 | # db_cursor.close() 801 | except Exception as err: 802 | # print("Exception #: 22") 803 | # db_cursor.close() 804 | exit() 805 | -------------------------------------------------------------------------------- /utzoo2postgres.py: -------------------------------------------------------------------------------- 1 | ########################### 2 | # UTZOO to PostgreSQL # 3 | # Python Parser # 4 | # Author: Jozef Jarosciak # 5 | # Email: jarosciak@gmail # 6 | # License: MIT # 7 | ######################################################################################################################## 8 | # Details at: https://www.joe0.com/2020/10/07/converting-utzoo-wiseman-netnews-archive-to-postgresql-using-python-3-8/ # 9 | ######################################################################################################################## 10 | 11 | from pathlib import Path 12 | import dateutil.parser 13 | import base64 14 | import email 15 | import quopri 16 | import random 17 | import re 18 | import string 19 | from datetime import date 20 | import cchardet 21 | import dateutil.parser 22 | from bs4 import UnicodeDammit 23 | import psycopg2 24 | import os 25 | 26 | # define DB connection 27 | try: 28 | db_connection = psycopg2.connect(host="localhost", user="postgres", password="", port="5432", database="utzoo") 29 | 30 | except Exception as e: 31 | print(e) 32 | exit(0) 33 | 34 | # define path to un-tared Utzoo archive 35 | # for Windows 36 | #positionFilePath = "E:\\Usenet\\Utzoo\\" 37 | positionFilePath = "E:\\Usenet\\Utzoo\\news124f1\\b163\\comp\\sys\\mac\\" 38 | 39 | # for linux: 40 | # positionFilePath = "/home/utzoo/Utzoo/" 41 | 42 | timezone_info = { 43 | "A": 1 * 3600, 44 | "ACDT": 10.5 * 3600, 45 | "ACST": 9.5 * 3600, 46 | "ACT": -5 * 3600, 47 | "ACWST": 8.75 * 3600, 48 | "ADT": 4 * 3600, 49 | "AEDT": 11 * 3600, 50 | "AEST": 10 * 3600, 51 | "AET": 10 * 3600, 52 | "AFT": 4.5 * 3600, 53 | "AKDT": -8 * 3600, 54 | "AKST": -9 * 3600, 55 | "ALMT": 6 * 3600, 56 | "AMST": -3 * 3600, 57 | "AMT": -4 * 3600, 58 | "ANAST": 12 * 3600, 59 | "ANAT": 12 * 3600, 60 | "AQTT": 5 * 3600, 61 | "ART": -3 * 3600, 62 | "AST": 3 * 3600, 63 | "AT": -4 * 3600, 64 | "AWDT": 9 * 3600, 65 | "AWST": 8 * 3600, 66 | "AZOST": 0 * 3600, 67 | "AZOT": -1 * 3600, 68 | "AZST": 5 * 3600, 69 | "AZT": 4 * 3600, 70 | "AoE": -12 * 3600, 71 | "B": 2 * 3600, 72 | "BNT": 8 * 3600, 73 | "BOT": -4 * 3600, 74 | "BRST": -2 * 3600, 75 | "BRT": -3 * 3600, 76 | "BST": 6 * 3600, 77 | "BTT": 6 * 3600, 78 | "C": 3 * 3600, 79 | "CAST": 8 * 3600, 80 | "CAT": 2 * 3600, 81 | "CCT": 6.5 * 3600, 82 | "CDT": -5 * 3600, 83 | "CEST": 2 * 3600, 84 | "CET": 1 * 3600, 85 | "CHADT": 13.75 * 3600, 86 | "CHAST": 12.75 * 3600, 87 | "CHOST": 9 * 3600, 88 | "CHOT": 8 * 3600, 89 | "CHUT": 10 * 3600, 90 | "CIDST": -4 * 3600, 91 | "CIST": -5 * 3600, 92 | "CKT": -10 * 3600, 93 | "CLST": -3 * 3600, 94 | "CLT": -4 * 3600, 95 | "COT": -5 * 3600, 96 | "CST": -6 * 3600, 97 | "CT": -6 * 3600, 98 | "CVT": -1 * 3600, 99 | "CXT": 7 * 3600, 100 | "ChST": 10 * 3600, 101 | "D": 4 * 3600, 102 | "DAVT": 7 * 3600, 103 | "DDUT": 10 * 3600, 104 | "E": 5 * 3600, 105 | "EASST": -5 * 3600, 106 | "EAST": -6 * 3600, 107 | "EAT": 3 * 3600, 108 | "ECT": -5 * 3600, 109 | "EDT": -4 * 3600, 110 | "EEST": 3 * 3600, 111 | "EET": 2 * 3600, 112 | "EGST": 0 * 3600, 113 | "EGT": -1 * 3600, 114 | "EST": -5 * 3600, 115 | "ET": -5 * 3600, 116 | "F": 6 * 3600, 117 | "FET": 3 * 3600, 118 | "FJST": 13 * 3600, 119 | "FJT": 12 * 3600, 120 | "FKST": -3 * 3600, 121 | "FKT": -4 * 3600, 122 | "FNT": -2 * 3600, 123 | "G": 7 * 3600, 124 | "GALT": -6 * 3600, 125 | "GAMT": -9 * 3600, 126 | "GET": 4 * 3600, 127 | "GFT": -3 * 3600, 128 | "GILT": 12 * 3600, 129 | "GMT": 0 * 3600, 130 | "GST": 4 * 3600, 131 | "GYT": -4 * 3600, 132 | "H": 8 * 3600, 133 | "HDT": -9 * 3600, 134 | "HKT": 8 * 3600, 135 | "HOVST": 8 * 3600, 136 | "HOVT": 7 * 3600, 137 | "HST": -10 * 3600, 138 | "I": 9 * 3600, 139 | "ICT": 7 * 3600, 140 | "IDT": 3 * 3600, 141 | "IOT": 6 * 3600, 142 | "IRDT": 4.5 * 3600, 143 | "IRKST": 9 * 3600, 144 | "IRKT": 8 * 3600, 145 | "IRST": 3.5 * 3600, 146 | "IST": 5.5 * 3600, 147 | "JST": 9 * 3600, 148 | "K": 10 * 3600, 149 | "KGT": 6 * 3600, 150 | "KOST": 11 * 3600, 151 | "KRAST": 8 * 3600, 152 | "KRAT": 7 * 3600, 153 | "KST": 9 * 3600, 154 | "KUYT": 4 * 3600, 155 | "L": 11 * 3600, 156 | "LHDT": 11 * 3600, 157 | "LHST": 10.5 * 3600, 158 | "LINT": 14 * 3600, 159 | "M": 12 * 3600, 160 | "MAGST": 12 * 3600, 161 | "MAGT": 11 * 3600, 162 | "MART": 9.5 * 3600, 163 | "MAWT": 5 * 3600, 164 | "MDT": -6 * 3600, 165 | "MHT": 12 * 3600, 166 | "MMT": 6.5 * 3600, 167 | "MSD": 4 * 3600, 168 | "MSK": 3 * 3600, 169 | "MST": -7 * 3600, 170 | "MT": -7 * 3600, 171 | "MUT": 4 * 3600, 172 | "MVT": 5 * 3600, 173 | "MYT": 8 * 3600, 174 | "N": -1 * 3600, 175 | "NCT": 11 * 3600, 176 | "NDT": 2.5 * 3600, 177 | "NFT": 11 * 3600, 178 | "NOVST": 7 * 3600, 179 | "NOVT": 7 * 3600, 180 | "NPT": 5.5 * 3600, 181 | "NRT": 12 * 3600, 182 | "NST": 3.5 * 3600, 183 | "NUT": -11 * 3600, 184 | "NZDT": 13 * 3600, 185 | "NZST": 12 * 3600, 186 | "O": -2 * 3600, 187 | "OMSST": 7 * 3600, 188 | "OMST": 6 * 3600, 189 | "ORAT": 5 * 3600, 190 | "P": -3 * 3600, 191 | "PDT": -7 * 3600, 192 | "PET": -5 * 3600, 193 | "PETST": 12 * 3600, 194 | "PETT": 12 * 3600, 195 | "PGT": 10 * 3600, 196 | "PHOT": 13 * 3600, 197 | "PHT": 8 * 3600, 198 | "PKT": 5 * 3600, 199 | "PMDT": -2 * 3600, 200 | "PMST": -3 * 3600, 201 | "PONT": 11 * 3600, 202 | "PST": -8 * 3600, 203 | "PT": -8 * 3600, 204 | "PWT": 9 * 3600, 205 | "PYST": -3 * 3600, 206 | "PYT": -4 * 3600, 207 | "Q": -4 * 3600, 208 | "QYZT": 6 * 3600, 209 | "R": -5 * 3600, 210 | "RET": 4 * 3600, 211 | "ROTT": -3 * 3600, 212 | "S": -6 * 3600, 213 | "SAKT": 11 * 3600, 214 | "SAMT": 4 * 3600, 215 | "SAST": 2 * 3600, 216 | "SBT": 11 * 3600, 217 | "SCT": 4 * 3600, 218 | "SGT": 8 * 3600, 219 | "SRET": 11 * 3600, 220 | "SRT": -3 * 3600, 221 | "SST": -11 * 3600, 222 | "SYOT": 3 * 3600, 223 | "T": -7 * 3600, 224 | "TAHT": -10 * 3600, 225 | "TFT": 5 * 3600, 226 | "TJT": 5 * 3600, 227 | "TKT": 13 * 3600, 228 | "TLT": 9 * 3600, 229 | "TMT": 5 * 3600, 230 | "TOST": 14 * 3600, 231 | "TOT": 13 * 3600, 232 | "TRT": 3 * 3600, 233 | "TVT": 12 * 3600, 234 | "U": -8 * 3600, 235 | "ULAST": 9 * 3600, 236 | "ULAT": 8 * 3600, 237 | "UTC": 0 * 3600, 238 | "UYST": -2 * 3600, 239 | "UYT": -3 * 3600, 240 | "UZT": 5 * 3600, 241 | "V": -9 * 3600, 242 | "VET": -4 * 3600, 243 | "VLAST": 11 * 3600, 244 | "VLAT": 10 * 3600, 245 | "VOST": 6 * 3600, 246 | "VUT": 11 * 3600, 247 | "W": -10 * 3600, 248 | "WAKT": 12 * 3600, 249 | "WARST": -3 * 3600, 250 | "WAST": 2 * 3600, 251 | "WAT": 1 * 3600, 252 | "WEST": 1 * 3600, 253 | "WET": 0 * 3600, 254 | "WFT": 12 * 3600, 255 | "WGST": -2 * 3600, 256 | "WGT": -3 * 3600, 257 | "WIB": 7 * 3600, 258 | "WIT": 9 * 3600, 259 | "WITA": 8 * 3600, 260 | "WST": 14 * 3600, 261 | "WT": 0 * 3600, 262 | "X": -11 * 3600, 263 | "Y": -12 * 3600, 264 | "YAKST": 10 * 3600, 265 | "YAKT": 9 * 3600, 266 | "YAPT": 10 * 3600, 267 | "YEKST": 6 * 3600, 268 | "YEKT": 5 * 3600, 269 | "Z": 0 * 3600 270 | } 271 | 272 | today = date.today() 273 | print("** START **") 274 | print("Starting at:", today) 275 | 276 | db_cursor = db_connection.cursor() 277 | processing_message_counter = dict() 278 | counterall = 0 279 | last_page = 0 280 | 281 | 282 | def convert_encoding(data, new_coding='UTF-8'): 283 | encoding = cchardet.detect(data)['encoding'] 284 | if new_coding.upper() != encoding.upper(): 285 | data = data.decode(encoding, data).encode(new_coding) 286 | return data 287 | 288 | 289 | insertedMsgs = 0 290 | 291 | completeProcessing = 0 292 | 293 | positionFilePath = positionFilePath + "counter.txt" 294 | 295 | for path in Path(positionFilePath.replace("\\counter.txt", "").replace("/counter.txt", "")).rglob('*'): 296 | if os.path.isfile(path): 297 | counterall = counterall + 1 298 | 299 | if os.path.exists(positionFilePath): 300 | with open(positionFilePath, 'r') as file: 301 | last_page = int(file.read().replace('\n', '')) 302 | 303 | if last_page > counterall: 304 | continue 305 | 306 | if os.path.isfile(path): 307 | if ".TARDIRPERMS_" in str(path): 308 | continue 309 | 310 | ggmsg = None 311 | ggthread = None 312 | groupName = None 313 | body = None 314 | message = None 315 | 316 | try: 317 | message_from_utzoo_file = Path(path.absolute()).read_text() 318 | except Exception as e: 319 | print(path.absolute()) 320 | message_from_utzoo_file = Path(path.absolute()).read_text(encoding="ascii", errors="ignore") 321 | 322 | try: 323 | message = email.message_from_string(message_from_utzoo_file) 324 | if message['Newsgroups']: 325 | # print(message) 326 | pass 327 | else: 328 | # print("Broken No Newsgroup: ", path) 329 | continue 330 | except Exception as e: 331 | continue 332 | 333 | if ("," in str(message['Newsgroups'])) or (" " in str(message['Newsgroups'])): 334 | groupName = str(message['Newsgroups']).split(",") 335 | if "@" in groupName[0] or "..." in groupName[0]: 336 | try: 337 | groupName = groupName[1] 338 | except Exception as e: 339 | print(e) 340 | exit(0) 341 | else: 342 | groupName = groupName[0] 343 | groupName = str(groupName).split(" ") 344 | groupName = groupName[0] 345 | else: 346 | groupName = message['Newsgroups'] 347 | 348 | body = message.get_payload() 349 | 350 | if groupName is None: 351 | completeProcessing = 1 352 | print("Processed all messages") 353 | exit(0) 354 | 355 | # print(groupName, ggmsg) 356 | group_name_fin_db = groupName.replace(".", "_").replace("-", "_").replace("+", "_") 357 | if len(group_name_fin_db) > 45: 358 | group_name_fin_db = group_name_fin_db[-45:] 359 | 360 | group_name_fin_db = re.sub(r'[^\x00-\x7f]', '', group_name_fin_db) 361 | group_name_fin_db = group_name_fin_db.replace('c\b', '').rstrip().lower() 362 | 363 | #print("-"+group_name_fin_db.rstrip()+"-") 364 | 365 | # Create tables for a new group 366 | db_cursor = db_connection.cursor() 367 | query = "" 368 | query = f"select exists(select * from information_schema.tables where table_schema = 'all_messages' AND table_name='{group_name_fin_db}_headers')" 369 | 370 | db_cursor.execute(query) 371 | exist = db_cursor.fetchone()[0] 372 | # db_cursor.close() 373 | 374 | if exist is False: 375 | 376 | try: 377 | sql = f"create table all_messages.{group_name_fin_db}_headers(id bigserial not null constraint {group_name_fin_db}_headers_pk primary key, dateparsed timestamp, subj_id bigint, ref smallint, msg_id text, msg_from bigint, enc text, contype text, processed timestamp default CURRENT_TIMESTAMP);alter table all_messages.{group_name_fin_db}_headers owner to postgres;" 378 | db_cursor = db_connection.cursor() 379 | db_cursor.execute(sql) 380 | db_connection.commit() 381 | # db_cursor.close() 382 | 383 | sql = f"create table all_messages.{group_name_fin_db}_refs(id bigint, ref_msg text default null);alter table all_messages.{group_name_fin_db}_refs owner to postgres;" 384 | db_cursor = db_connection.cursor() 385 | db_cursor.execute(sql) 386 | db_connection.commit() 387 | # db_cursor.close() 388 | 389 | sql = f"create table all_messages.{group_name_fin_db}_body(id bigint primary key, data text default null);alter table all_messages.{group_name_fin_db}_body owner to postgres;" 390 | db_cursor = db_connection.cursor() 391 | db_cursor.execute(sql) 392 | db_connection.commit() 393 | # db_cursor.close() 394 | 395 | sql = f"create table all_messages.{group_name_fin_db}_from(id serial not null constraint {group_name_fin_db}_from_pk primary key, data text);alter table all_messages.{group_name_fin_db}_from owner to postgres;" 396 | db_cursor = db_connection.cursor() 397 | db_cursor.execute(sql) 398 | db_connection.commit() 399 | # db_cursor.close() 400 | 401 | sql = f"create table all_messages.{group_name_fin_db}_subjects(id serial not null constraint {group_name_fin_db}_subjects_pk primary key, subject text);alter table all_messages.{group_name_fin_db}_subjects owner to postgres;" 402 | db_cursor = db_connection.cursor() 403 | db_cursor.execute(sql) 404 | db_connection.commit() 405 | # db_cursor.close() 406 | 407 | sql = f"create unique index {group_name_fin_db}_headers_uiidx on all_messages.{group_name_fin_db}_headers(id);" \ 408 | f"create unique index {group_name_fin_db}_headers_umidx on all_messages.{group_name_fin_db}_headers(msg_id);" \ 409 | f"create unique index {group_name_fin_db}_body_idx on all_messages.{group_name_fin_db}_body(id);" \ 410 | f"create unique index {group_name_fin_db}_from_idx on all_messages.{group_name_fin_db}_from(data);" \ 411 | f"create unique index {group_name_fin_db}_subjects_idx on all_messages.{group_name_fin_db}_subjects(subject);" 412 | db_cursor = db_connection.cursor() 413 | db_cursor.execute(sql) 414 | db_connection.commit() 415 | # db_cursor.close() 416 | except Exception: 417 | pass 418 | 419 | 420 | def groupnum(number): 421 | s = '%d' % number 422 | groups = [] 423 | while s and s[-1].isdigit(): 424 | groups.append(s[-3:]) 425 | s = s[:-3] 426 | return s + ','.join(reversed(groups)) 427 | 428 | 429 | def find_between(s, first, last): 430 | try: 431 | start = s.index(first) + len(first) 432 | end = s.index(last, start) 433 | return s[start:end] 434 | except ValueError: 435 | return "" 436 | 437 | 438 | def removeNonAscii(s): 439 | return "".join(i for i in s if ord(i) < 126 and ord(i) > 31) 440 | 441 | 442 | def clean_string(header_part, encoding): 443 | orig_header_part = header_part 444 | header_part = header_part.rstrip(os.linesep).replace("\n", "") 445 | encoding_quoted = encoding 446 | 447 | if '?q?' in header_part: 448 | encoding_quoted = find_between(header_part, '=?', '?') 449 | header_part = header_part.split("?q?", 1)[1] # .replace("_", " ") 450 | # header_part = find_between(header_part, 'q?', '?').replace("_", " ") 451 | elif '?Q?' in header_part: 452 | encoding_quoted = find_between(header_part, '=?', '?') 453 | header_part = header_part.split("?Q?", 1)[1] # .replace("_", " ") 454 | elif '?b?' in header_part: 455 | encoding_quoted = find_between(header_part, '=?', '?') 456 | header_part = header_part.split("?b?", 1)[1] # .replace("_", " ") 457 | try: 458 | header_part = base64.b64decode(header_part) 459 | except Exception: 460 | try: 461 | header_part = base64.b64decode(header_part) 462 | except Exception: 463 | header_part = orig_header_part 464 | elif '?B?' in header_part: 465 | encoding_quoted = find_between(header_part, '=?', '?') 466 | header_part = header_part.split("?B?", 1)[1] # .replace("_", " ") 467 | try: 468 | header_part = base64.b64decode(header_part) 469 | except Exception: 470 | try: 471 | header_part = base64.b64decode(header_part) 472 | except Exception: 473 | header_part = orig_header_part 474 | 475 | if 'unknown' in encoding_quoted: 476 | encoding_quoted = encoding 477 | elif 'x-user-defined' in encoding_quoted: 478 | encoding_quoted = encoding 479 | 480 | try: 481 | header_part = quopri.decodestring(header_part).decode(encoding_quoted) 482 | return header_part 483 | except Exception: 484 | try: 485 | header_part = quopri.decodestring(header_part).decode(encoding) 486 | return header_part 487 | except Exception: 488 | try: 489 | dammit = UnicodeDammit(header_part) 490 | if dammit.original_encoding: 491 | header_part = quopri.decodestring(header_part).decode(dammit.original_encoding) 492 | return header_part 493 | else: 494 | header_part = quopri.decodestring(header_part).decode('ascii') 495 | return header_part 496 | except Exception: 497 | try: 498 | header_part = quopri.decodestring(header_part).decode("ansi") 499 | return header_part 500 | except Exception: 501 | try: 502 | header_part = header_part.encode('utf8', 'surrogatepass').decode('utf8', 503 | 'surrogatepass') 504 | return header_part 505 | except Exception: 506 | return "" 507 | 508 | 509 | # RESET ALL VARS 510 | parsed_encoding = None 511 | parsed_content_type = None 512 | parsed_message_id = None 513 | parsed_date = None 514 | parsed_subject = None 515 | parsed_subject_original = None 516 | parsed_ref = None 517 | parsed_body_text = None 518 | parsed_body_text_original = None 519 | parsed_from = None 520 | parsed_from_original = None 521 | has_ref = 0 522 | 523 | ############################################# 524 | # USENET HEADER PARSING 525 | ############################################# 526 | # GET HEADERS IN ORIGINAL RAW FORMAT (NOT UTF-8) 527 | # PARSE THE IMPORTANT PARTS FROM LIST OF HEADERS 528 | 529 | for p in message._headers: 530 | name = str(p[0]).lower() 531 | 532 | # Parse Date 533 | if name == 'date': 534 | parsed_date = p[1] 535 | 536 | # Parse Content Type 537 | if name == 'content-type': 538 | parsed_content_type = p[1] 539 | 540 | # Parse content-transfer-encoding 541 | if name == 'content-transfer-encoding': 542 | parsed_content_type = p[1] 543 | 544 | # Parse References 545 | if name == 'references': 546 | parsed_ref = p[1] 547 | 548 | # Parse Subject 549 | if name == 'subject': 550 | parsed_subject = p[1] 551 | parsed_subject_original = p[1] 552 | 553 | # Parse message-id 554 | if name == 'message-id': 555 | parsed_message_id = p[1] 556 | 557 | # Parse From 558 | if name == 'from': 559 | parsed_from = p[1] 560 | parsed_from_original = p[1] 561 | 562 | # Parse Charset Encoding 563 | if name == 'content-type': 564 | try: 565 | parsed_encoding = message.get_content_charset() 566 | except Exception: 567 | if "charset=" in name: 568 | try: 569 | parsed_encoding = str( 570 | re.findall(r'"([^"]*)"', str(p[1].rstrip(os.linesep).replace("\n", "")))[0]) 571 | except Exception: 572 | dammit = UnicodeDammit(p[1].rstrip(os.linesep).replace("\n", "")) 573 | parsed_encoding = dammit.original_encoding 574 | else: 575 | dammit = UnicodeDammit(p[1].rstrip(os.linesep).replace("\n", "")) 576 | parsed_encoding = dammit.original_encoding 577 | 578 | ############################################# 579 | # DATA CLEAN UP - message_references 580 | ############################################# 581 | 582 | # GET BODY OF THE MESSAGE 583 | try: 584 | parsed_body_text_original = message.get_payload(decode=False) 585 | if message.is_multipart(): 586 | for part in message.walk(): 587 | if part.is_multipart(): 588 | for subpart in part.walk(): 589 | if subpart.get_content_type() == 'text/plain': 590 | parsed_body_text = subpart.get_content() 591 | elif part.get_content_type() == 'text/plain': 592 | parsed_body_text = part.get_content() 593 | elif message.get_content_type() == 'text/plain': 594 | try: 595 | parsed_body_text = message.get_content() 596 | try: 597 | parsed_body_text.encode('utf-8', 'surrogatepass') 598 | except Exception: 599 | parsed_body_text = message.get_payload(decode=False) 600 | except Exception: 601 | parsed_body_text = message.get_payload(decode=False) 602 | # parsed_body_text = parsed_message._mail['body'] 603 | except Exception: 604 | # dammit = UnicodeDammit(str(parsed_body_text).encode('utf-8', 'surrogatepass')) 605 | # parsed_body_text = str(parsed_body_text).encode('utf-8', 'surrogatepass').decode(dammit.original_encoding) 606 | try: 607 | if message.is_multipart(): 608 | for part in message.walk(): 609 | if part.is_multipart(): 610 | for subpart in part.walk(): 611 | if subpart.get_content_type() == 'text/plain': 612 | parsed_body_text = subpart.get_payload(decode=True) 613 | elif part.get_content_type() == 'text/plain': 614 | parsed_body_text = str(part.get_payload(decode=True)) 615 | elif message.get_content_type() == 'text/plain': 616 | parsed_body_text1 = message.get_payload(decode=True) 617 | parsed_body_text = message.get_payload(decode=False) 618 | parsed_body_text_original = message.get_payload(decode=False) 619 | # parsed_body_text = str(message.get_payload(decode=True)).encode('utf-8', 'surrogatepass') 620 | dammit = UnicodeDammit(parsed_body_text1) 621 | parsed_encoding = dammit.original_encoding 622 | # body_text = parsed_message._mail['body'] 623 | except Exception: 624 | parsed_body_text = "" 625 | pass 626 | 627 | # DATA CLEAN UP - DATE 628 | 629 | if parsed_subject is None: 630 | parsed_subject = message['Title'] 631 | 632 | # if "Re:" in parsed_subject: 633 | # print(message.keys()) 634 | # print(parsed_subject) 635 | parsed_date_check = None 636 | if parsed_date: 637 | #parsed_date = parsed_date.replace("Wednesday, ", "") 638 | #print(parsed_date) 639 | try: 640 | parsed_date_check = dateutil.parser.parse(parsed_date, tzinfos=timezone_info) 641 | except Exception as e: 642 | try: 643 | parsed_date_check = dateutil.parser.parse(parsed_date.upper(), tzinfos=timezone_info) 644 | except Exception as e: 645 | parsed_date_check = None 646 | 647 | if parsed_date_check is None or (parsed_date_check.hour == 0 and parsed_date_check.minute == 0 and parsed_date_check.second == 0 and parsed_date_check.microsecond == 0): 648 | parsed_date = message['NNTP-Posting-Date'] 649 | if parsed_date: 650 | parsed_date_check = dateutil.parser.parse(parsed_date, tzinfos=timezone_info) 651 | if parsed_date_check is None or (parsed_date_check.hour == 0 and parsed_date_check.minute == 0 and parsed_date_check.second == 0 and parsed_date_check.microsecond == 0): 652 | parsed_date = message['X-Article-Creation-Date'] 653 | if parsed_date: 654 | parsed_date_check = dateutil.parser.parse(parsed_date, tzinfos=timezone_info) 655 | if parsed_date_check is None or (parsed_date_check.hour == 0 and parsed_date_check.minute == 0 and parsed_date_check.second == 0 and parsed_date_check.microsecond == 0): 656 | parsed_date = message['Posted'] 657 | if parsed_date: 658 | #print(parsed_date) 659 | parsed_date_check = dateutil.parser.parse(parsed_date.split('(')[0], tzinfos=timezone_info) 660 | if parsed_date_check is None or (parsed_date_check.hour == 0 and parsed_date_check.minute == 0 and parsed_date_check.second == 0 and parsed_date_check.microsecond == 0): 661 | parsed_date = message['Received'] 662 | if parsed_date: 663 | parsed_date_check = dateutil.parser.parse(parsed_date, tzinfos=timezone_info) 664 | 665 | if parsed_date is None: 666 | print('No date') 667 | print(message._headers) 668 | continue 669 | # exit(0) 670 | 671 | if parsed_message_id is None: 672 | parsed_message_id = message['Article-I.D.'] 673 | 674 | try: 675 | if '(' in parsed_date: 676 | parsed_date = message['date'].split("(")[0].strip() 677 | else: 678 | parsed_date = message['date'].strip() 679 | except Exception: 680 | pass 681 | 682 | failing_zones_to_check = ['-13', '-14', '-15', '-16', '-17', '-18', '-19', '-20', '-21', '-22', '-23', '-24', '-25', '-26', '-27', '-28', '-29', '-30', 683 | '+15', '+16', '+17', '+18', '+19', '+20', '+21', '+22', '+23', '+24', '+25', '+26', '+27', '+28', '+29', '+30'] 684 | try: 685 | for failedzone in failing_zones_to_check: 686 | if failedzone in parsed_date: 687 | parsed_date = parsed_date.split(failedzone)[0] 688 | print('Fixed: ' + parsed_date + ' | ' + failedzone) 689 | break 690 | else: 691 | parsed_date = dateutil.parser.parse(parsed_date, tzinfos=timezone_info) 692 | except Exception: 693 | try: 694 | # Try to parse/convert NNTP-Posting-Date 695 | parsed_date = message['NNTP-Posting-Date'] 696 | for failedzone in failing_zones_to_check: 697 | if failedzone in parsed_date: 698 | parsed_date = parsed_date.split(failedzone)[0] 699 | print('Fixed NNTP: ' + parsed_date + ' | ' + failedzone) 700 | break 701 | else: 702 | parsed_date = dateutil.parser.parse(parsed_date, tzinfos=timezone_info) 703 | except Exception: 704 | # new_headers.append(tuple(("odate", value))) 705 | try: 706 | print(" can't get date - skipping") 707 | # #db_cursor.close() 708 | except Exception as err: 709 | # print("Exception #: 22") 710 | # #db_cursor.close() 711 | exit() 712 | continue 713 | # exit(0) 714 | 715 | # DATA CLEAN UP - message_encoding 716 | if not parsed_encoding: 717 | parsed_encoding = "ANSI" 718 | elif parsed_encoding == "x-user-defined": 719 | parsed_encoding = "ANSI" 720 | 721 | if parsed_ref: 722 | parsed_ref = clean_string(parsed_ref, parsed_encoding) 723 | else: 724 | parsed_ref = "" 725 | 726 | # DATA CLEAN UP - message_id 727 | if parsed_message_id: 728 | parsed_message_id = clean_string(parsed_message_id, parsed_encoding) 729 | parsed_message_id = parsed_message_id.replace("'", "") 730 | parsed_message_id = parsed_message_id.replace(" ", "").replace('\n', ' ').replace('\r', '') 731 | else: 732 | parsed_message_id = ''.join(random.choices(string.ascii_letters + string.digits, k=16)) 733 | 734 | # DATA CLEAN UP - message_subject 735 | if parsed_subject: 736 | parsed_subject = clean_string(parsed_subject, parsed_encoding) 737 | #parsed_subject = re.sub(r'[^\x00-\x7f]', '', parsed_subject) 738 | if len(parsed_subject) > 250: 739 | parsed_subject = parsed_subject.split("=?")[0] 740 | 741 | # DATA CLEAN UP - message_subject 742 | if parsed_from: 743 | parsed_from = clean_string(parsed_from, parsed_encoding) 744 | 745 | ############################################# 746 | # ADD MESSAGE DETAILS INTO POSTGRES 747 | ############################################# 748 | inserted_subject_id = None 749 | inserted_from_id = None 750 | inserted_header_id = None 751 | 752 | try: 753 | # Check If MSG ID already in db 754 | db_cursor = db_connection.cursor() 755 | parsed_message_id = removeNonAscii(parsed_message_id) 756 | query = f"select count(*) from all_messages.{group_name_fin_db}_headers where msg_id='" + parsed_message_id + "';" 757 | db_cursor.execute(query) 758 | msg_exist = db_cursor.fetchone()[0] 759 | # print("message_exists:") 760 | # print(msg_exist) 761 | # #db_cursor.close() 762 | except Exception as e: 763 | print(e) 764 | # print("Exception #: 9") 765 | # print(query) 766 | # #db_cursor.close() 767 | try: 768 | # Check If MSG ID already in db 769 | db_cursor = db_connection.cursor() 770 | 771 | query = f"select count(*) from all_messages.{group_name_fin_db}_headers where msg_id='{parsed_message_id}'" 772 | print(query) 773 | db_cursor.execute(query) 774 | msg_exist = db_cursor.fetchone()[0] 775 | # print("message_exists:") 776 | # print(msg_exist) 777 | # #db_cursor.close() 778 | except Exception as e: 779 | print("Passing: " + parsed_message_id) 780 | print(e) 781 | 782 | # print("Exception #: 10") 783 | # #db_cursor.close() 784 | msg_exist = 1 785 | pass 786 | 787 | # Continue only if MSG not in the headers db 788 | if msg_exist == 0: 789 | # try: 790 | try: 791 | # Add a unique subject line 792 | sql = f"INSERT INTO all_messages.{group_name_fin_db}_subjects(subject) VALUES ((%s)) ON CONFLICT(subject) DO UPDATE SET subject=(%s) RETURNING id" 793 | db_cursor = db_connection.cursor() 794 | db_cursor.execute(sql, (parsed_subject, parsed_subject)) 795 | db_connection.commit() 796 | inserted_subject_id = db_cursor.fetchone()[0] 797 | # #db_cursor.close() 798 | except Exception as e: 799 | print(e) 800 | # print("Exception #: 11") 801 | # #db_cursor.close() 802 | # exit() 803 | if inserted_subject_id is None: 804 | try: 805 | parsed_subject = parsed_subject.encode("ascii", "ignore").decode() 806 | parsed_subject = re.sub(r'[^\x00-\x7f]', r'', parsed_subject) 807 | sql = f"INSERT INTO all_messages.{group_name_fin_db}_subjects(subject) VALUES ((%s)) ON CONFLICT(subject) DO UPDATE SET subject=(%s) RETURNING id" 808 | db_cursor = db_connection.cursor() 809 | db_cursor.execute(sql, (parsed_subject, parsed_subject)) 810 | db_connection.commit() 811 | inserted_subject_id = db_cursor.fetchone()[0] 812 | # #db_cursor.close() 813 | except Exception as e: 814 | print(e) 815 | # print("Exception #: 12") 816 | # #db_cursor.close() 817 | # exit() 818 | try: 819 | parsed_subject = re.sub(r'[^\x00-\x7f]', r'', parsed_subject_original) 820 | sql = f"INSERT INTO all_messages.{group_name_fin_db}_subjects(subject) VALUES ((%s)) ON CONFLICT(subject) DO UPDATE SET subject=(%s) RETURNING id" 821 | db_cursor = db_connection.cursor() 822 | db_cursor.execute(sql, (parsed_subject, parsed_subject)) 823 | db_connection.commit() 824 | inserted_subject_id = db_cursor.fetchone()[0] 825 | # #db_cursor.close() 826 | except Exception as e: 827 | print(e) 828 | # print("Exception #: 13") 829 | # #db_cursor.close() 830 | # exit() 831 | pass 832 | 833 | try: 834 | # Add a unique from line 835 | sql = f"INSERT INTO all_messages.{group_name_fin_db}_from(data) VALUES (%s) ON CONFLICT(data) DO UPDATE SET data=(%s) RETURNING id" 836 | db_cursor = db_connection.cursor() 837 | db_cursor.execute(sql, (parsed_from, parsed_from)) 838 | db_connection.commit() 839 | inserted_from_id = db_cursor.fetchone()[0] 840 | # #db_cursor.close() 841 | except Exception as e: 842 | print(e) 843 | print(group_name_fin_db) 844 | # #db_cursor.close() 845 | if inserted_from_id is None: 846 | try: 847 | parsed_from = parsed_from.encode("ascii", "ignore").decode() 848 | parsed_from = re.sub(r'[^\x00-\x7f]', r'', parsed_from) 849 | # print("Exception #: 14") 850 | print(parsed_from) 851 | sql = f"INSERT INTO all_messages.{group_name_fin_db}_from(data) VALUES (%s) ON CONFLICT(data) DO UPDATE SET data=(%s) RETURNING id" 852 | db_cursor = db_connection.cursor() 853 | db_cursor.execute(sql, (parsed_from, parsed_from)) 854 | db_connection.commit() 855 | inserted_from_id = db_cursor.fetchone()[0] 856 | # #db_cursor.close() 857 | except Exception: 858 | try: 859 | # #db_cursor.close() 860 | parsed_from = re.sub(r'[^\x00-\x7f]', r'', parsed_from_original) 861 | # print("Exception #: 15") 862 | print(parsed_from) 863 | sql = f"INSERT INTO all_messages.{group_name_fin_db}_from(data) VALUES (%s) ON CONFLICT(data) DO UPDATE SET data=(%s) RETURNING id" 864 | db_cursor = db_connection.cursor() 865 | db_cursor.execute(sql, (parsed_from, parsed_from)) 866 | db_connection.commit() 867 | inserted_from_id = db_cursor.fetchone()[0] 868 | # #db_cursor.close() 869 | except Exception: 870 | try: 871 | # #db_cursor.close() 872 | parsed_from = re.search(r'<(.*?)>', parsed_from).group(1) 873 | # print("Exception #: 16") 874 | print(parsed_from) 875 | sql = f"INSERT INTO all_messages.{group_name_fin_db}_from(data) VALUES ('{parsed_from}') ON CONFLICT(data) DO UPDATE SET data=('{parsed_from}') RETURNING id" 876 | print(sql) 877 | db_cursor = db_connection.cursor() 878 | print("ss1") 879 | db_cursor.execute(sql) 880 | print("ss2") 881 | db_connection.commit() 882 | print("ss3") 883 | inserted_from_id = db_cursor.fetchone()[0] 884 | print(inserted_from_id) 885 | # #db_cursor.close() 886 | except Exception: 887 | # print("Exception #: 17a") 888 | # exit() 889 | pass 890 | # Add a header info - pass in the subject line id from the previous statement 891 | # 892 | if not inserted_from_id: 893 | print("I couldn't get inserted_from_id!") 894 | print(path.absolute()) 895 | print(message._headers) 896 | exit() 897 | 898 | if parsed_ref: 899 | has_ref = 1 900 | else: 901 | has_ref = 0 902 | 903 | try: 904 | sql = f"INSERT INTO all_messages.{group_name_fin_db}_headers(dateparsed, subj_id, ref, msg_id, msg_from, enc, contype) VALUES ((%s), (%s), (%s), (%s), (%s), (%s), (%s)) RETURNING id" 905 | db_cursor = db_connection.cursor() 906 | db_cursor.execute(sql, ( 907 | parsed_date, inserted_subject_id, has_ref, parsed_message_id, inserted_from_id, parsed_encoding, 908 | parsed_content_type)) 909 | db_connection.commit() 910 | inserted_header_id = db_cursor.fetchone()[0] 911 | # #db_cursor.close() 912 | except Exception as e: 913 | # print("Exception #: 16a") 914 | # #db_cursor.close() 915 | # exit() 916 | print(e) 917 | print('Duplicate MSG ID: ' + parsed_message_id) 918 | continue 919 | # exit(0) 920 | 921 | if parsed_ref: 922 | split_refs = parsed_ref.split(' ') 923 | for split in split_refs: 924 | try: 925 | sql = f"INSERT INTO all_messages.{group_name_fin_db}_refs(id, ref_msg) VALUES ((%s), (%s));" 926 | db_cursor = db_connection.cursor() 927 | db_cursor.execute(sql, (inserted_header_id, split.strip())) 928 | db_connection.commit() 929 | # #db_cursor.close() 930 | except Exception: 931 | # print("Exception #: 17") 932 | # #db_cursor.close() 933 | # exit() 934 | pass 935 | try: 936 | sql = f"INSERT INTO all_messages.{group_name_fin_db}_body(id,data) VALUES ((%s), (%s))" 937 | db_cursor = db_connection.cursor() 938 | db_cursor.execute(sql, (inserted_header_id, parsed_body_text)) 939 | db_connection.commit() 940 | processing_message_counter[str(groupName)] = processing_message_counter.get(str(groupName), 0) + 1 941 | print(counterall, parsed_date, groupName, path, processing_message_counter.get(str(groupName), 0)) 942 | # print(processing_message_counter.get(str(groupName), 0)) 943 | except Exception: 944 | # print("Exception #: 18") 945 | # #db_cursor.close() 946 | try: 947 | parsed_body_text = parsed_body_text.encode("ascii", "ignore").decode() 948 | parsed_body_text = re.sub(r'[^\x00-\x7f]', r'', parsed_body_text) 949 | sql = f"INSERT INTO all_messages.{group_name_fin_db}_body(id,data) VALUES ((%s), (%s))" 950 | db_cursor = db_connection.cursor() 951 | db_cursor.execute(sql, (inserted_header_id, parsed_body_text)) 952 | db_connection.commit() 953 | processing_message_counter[str(groupName)] = processing_message_counter.get(str(groupName), 0) + 1 954 | print(counterall, parsed_date, groupName, path, processing_message_counter.get(str(groupName), 0)) 955 | # #db_cursor.close() 956 | except Exception: 957 | # print("Exception #: 19") 958 | # #db_cursor.close() 959 | # parsed_body_text = parsed_body_text_original.encode('utf-8', 'surrogateescape').decode('ANSI') 960 | try: 961 | parsed_body_text = re.sub(r'[^\x00-\x7f]', r'', parsed_body_text) 962 | sql = f"INSERT INTO all_messages.{group_name_fin_db}_body(id,data) VALUES ((%s), (%s))" 963 | db_cursor = db_connection.cursor() 964 | db_cursor.execute(sql, (inserted_header_id, parsed_body_text)) 965 | db_connection.commit() 966 | processing_message_counter[str(groupName)] = processing_message_counter.get(str(groupName), 0) + 1 967 | print(counterall, parsed_date, groupName, path, processing_message_counter.get(str(groupName), 0)) 968 | # #db_cursor.close() 969 | except Exception: 970 | # print("Exception #: 19") 971 | # #db_cursor.close() 972 | exit(0) 973 | else: 974 | processing_message_counter[str(groupName)] = processing_message_counter.get(str(groupName), 0) + 1 975 | print(counterall, "Skipping: ", groupName, path, processing_message_counter.get(str(groupName), 0)) 976 | if parsed_subject: 977 | if "Re:" in parsed_subject: 978 | pass 979 | # print(parsed_subject) 980 | 981 | # counter 982 | if os.path.exists(positionFilePath): 983 | os.remove(positionFilePath) 984 | # print(positionFilePath, "removed") 985 | if not os.path.exists(positionFilePath): 986 | # print('Updated', positionFilePath) 987 | file = open(positionFilePath, 'w') 988 | file.write(str(counterall)) 989 | file.close() 990 | 991 | try: 992 | filename = None 993 | filename = groupName + ".utzoo" 994 | sql = f"INSERT INTO all_messages.__all_files(file_name, current, total, processing, newsgroup_name) VALUES ('{filename}',{processing_message_counter.get(str(groupName), 0)},{processing_message_counter.get(str(groupName), 0)},1,'{groupName}') ON CONFLICT (file_name) DO UPDATE SET current={processing_message_counter.get(str(groupName), 0)}, total={processing_message_counter.get(str(groupName), 0)}, processing=1" 995 | #print(sql) 996 | db_cursor = db_connection.cursor() 997 | db_cursor.execute(sql) 998 | db_connection.commit() 999 | #db_cursor.close() 1000 | except Exception as err: 1001 | print(err) 1002 | exit(0) --------------------------------------------------------------------------------