├── .gitignore
├── CITATION.cff
├── LICENSE.md
├── Pipfile
├── Pipfile.lock
├── README.md
├── personal
│   ├── combine
│   │   ├── build_month.py
│   │   ├── classes.py
│   │   ├── merge.py
│   │   ├── merge_and_backfill.py
│   │   └── merge_minutes.py
│   ├── compression
│   │   ├── extract_file.py
│   │   ├── recompress_file.py
│   │   ├── recompress_folder.py
│   │   └── recompress_folder_multiprocess.py
│   ├── diagnostic
│   │   ├── comments_per_day.py
│   │   ├── comments_per_day_with_score.py
│   │   ├── compare_lines.py
│   │   ├── count_fields.py
│   │   ├── count_subreddits_multiprocess.py
│   │   ├── get_zst_details.py
│   │   ├── sum_subreddit_counts.py
│   │   ├── test_file.py
│   │   └── test_files_multiprocess.py
│   ├── mongo
│   │   ├── export_mongo.py
│   │   ├── group_subs.py
│   │   └── insert_mongo.py
│   ├── move
│   │   ├── copy_listed_files.py
│   │   ├── move_files.py
│   │   └── rename_files.py
│   ├── opt_in_quarantined.py
│   ├── process_month.py
│   ├── transform
│   │   ├── split_blocks_by_minutes.py
│   │   ├── split_by_minutes.py
│   │   └── split_by_subreddit.py
│   ├── utils.py
│   └── zst_blocks.py
└── scripts
    ├── combine_folder_multiprocess.py
    ├── count_words_single_file.py
    ├── filter_file.py
    ├── find_overlapping_users.py
    ├── ignored.txt
    ├── iterate_folder.py
    ├── single_file.py
    └── to_csv.py
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/*
2 | logs/*
3 | __pycache__/*
4 | *.db
5 | *.ini
6 | *.txt
--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
1 | # This CITATION.cff file was generated with cffinit.
2 | # Visit https://bit.ly/cffinit to generate yours today!
3 | 
4 | cff-version: 1.2.0
5 | title: Pushshift dump utils
6 | message: >-
7 |   If you use this software, please cite it using the
8 |   metadata from this file.
9 | type: software
10 | authors:
11 |   - given-names: Watchful1
12 | repository-code: 'https://github.com/Watchful1/PushshiftDumps'
13 | abstract: >-
14 |   Tools to help parse reddit data from zstandard compressed
15 |   ndjson files from the pushshift archives
16 | license: MIT
17 | 
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2023 Watchful1
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | zstandard = "*" 8 | discord-logging = {editable = true, git = "https://github.com/Watchful1/DiscordLogging.git"} 9 | requests = "*" 10 | pymongo = {extras = ["srv"], version = "*"} 11 | scipy = "*" 12 | sortedcontainers = "*" 13 | praw = "*" 14 | multiprocessing-logging = "*" 15 | 16 | [dev-packages] 17 | 18 | [requires] 19 | python_version = "3.9" 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This repo contains example python scripts for processing the reddit dump files created by pushshift. The files can be downloaded from [here](https://files.pushshift.io/reddit/) or torrented from [here](https://academictorrents.com/details/f37bb9c0abe350f0f1cbd4577d0fe413ed07724e). 2 | 3 | * `single_file.py` decompresses and iterates over a single zst compressed file 4 | * `iterate_folder.py` does the same, but for all files in a folder 5 | * `combine_folder_multiprocess.py` uses separate processes to iterate over multiple files in parallel, writing lines that match the criteria passed in to text files, then combining them into a final zst compressed file -------------------------------------------------------------------------------- /personal/combine/build_month.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import requests 3 | import time 4 | import discord_logging 5 | import argparse 6 | import os 7 | import re 8 | import zstandard 9 | from datetime import datetime, timedelta 10 | import json 11 | import praw 12 | from praw import endpoints 13 | import prawcore 14 | import logging.handlers 15 | 16 | sys.path.append('personal') 17 | 18 | log = discord_logging.get_logger(init=True) 19 | 20 | import utils 21 | import classes 22 | from classes import IngestType 23 | from merge import ObjectType 24 | 25 | 26 | NEWLINE_ENCODED = "\n".encode('utf-8') 27 | reg = re.compile(r"\d\d-\d\d-\d\d_\d\d-\d\d") 28 | 29 | 30 | def build_month(month, input_folder, output_folder, file_type, compression_level): 31 | if file_type == "comments": 32 | prefix = "RC" 33 | elif file_type == "submissions": 34 | prefix = "RS" 35 | else: 36 | log.error(f"Invalid type: {args.type}") 37 | sys.exit(2) 38 | 39 | total_objects = 0 40 | total_bytes = 0 41 | minute_iterator = month 42 | if month.month == 12: 43 | end_time = month.replace(year=month.year + 1, month=1) 44 | else: 45 | end_time = month.replace(month=month.month + 1) 46 | while minute_iterator < end_time: 47 | minute_file_path = os.path.join(input_folder, file_type, minute_iterator.strftime('%y-%m-%d'), f"{prefix}_{minute_iterator.strftime('%y-%m-%d_%H-%M')}.zst") 48 | for obj, line, _ in utils.read_obj_zst_meta(minute_file_path): 49 | total_bytes += len(line.encode('utf-8')) 50 | total_bytes += 1 51 | 52 | total_objects += 1 53 | if total_objects % 1000000 == 0: 54 | log.info(f"{file_type}: Counting: {minute_iterator.strftime('%y-%m-%d_%H-%M')} : {total_objects:,} : {total_bytes:,}") 55 | 56 | minute_iterator += timedelta(minutes=1) 57 | 58 | log.info(f"{file_type}: Counting: {minute_iterator.strftime('%y-%m-%d_%H-%M')} : {total_objects:,} : {total_bytes:,}") 59 | 60 | output_path = 
os.path.join(output_folder, file_type, f"{prefix}_{month.strftime('%Y-%m')}.zst") 61 | output_handle = zstandard.ZstdCompressor(level=compression_level, write_content_size=True, write_checksum=True, threads=-1).stream_writer(open(output_path, 'wb'), size=total_bytes) 62 | 63 | count_objects = 0 64 | count_bytes = 0 65 | minute_iterator = month 66 | if month.month == 12: 67 | end_time = month.replace(year=month.year + 1, month=1) 68 | else: 69 | end_time = month.replace(month=month.month + 1) 70 | while minute_iterator < end_time: 71 | minute_file_path = os.path.join(input_folder, file_type, minute_iterator.strftime('%y-%m-%d'), f"{prefix}_{minute_iterator.strftime('%y-%m-%d_%H-%M')}.zst") 72 | for obj, line, _ in utils.read_obj_zst_meta(minute_file_path): 73 | line_encoded = line.encode('utf-8') 74 | count_bytes += len(line_encoded) 75 | count_bytes += 1 76 | output_handle.write(line_encoded) 77 | output_handle.write(NEWLINE_ENCODED) 78 | 79 | count_objects += 1 80 | if count_objects % 100000 == 0: 81 | log.info(f"{file_type}: Writing: {minute_iterator.strftime('%y-%m-%d_%H-%M')} : {count_objects:,}/{total_objects:,} : {count_bytes:,}/{total_bytes:,}") 82 | 83 | minute_iterator += timedelta(minutes=1) 84 | 85 | log.info(f"{file_type}: Writing: {minute_iterator.strftime('%y-%m-%d_%H-%M')} : {count_objects:,}/{total_objects:,} : {count_bytes:,}/{total_bytes:,}") 86 | output_handle.close() 87 | 88 | 89 | if __name__ == "__main__": 90 | parser = argparse.ArgumentParser(description="Combine the minute files into a single month") 91 | parser.add_argument("--type", help="The object type, either comments or submissions", required=True) 92 | parser.add_argument("--month", help="The month to process, format YY-MM", required=True) 93 | parser.add_argument('--input', help='Input folder', required=True) 94 | parser.add_argument('--output', help='Output folder', required=True) 95 | parser.add_argument("--debug", help="Enable debug logging", action='store_const', const=True, default=False) 96 | parser.add_argument("--level", help="The compression ratio to output at", default="3") 97 | args = parser.parse_args() 98 | 99 | if args.debug: 100 | discord_logging.set_level(logging.DEBUG) 101 | 102 | month = datetime.strptime(args.month, '%y-%m') 103 | level = int(args.level) 104 | 105 | log.info(f"Input folder: {args.input}") 106 | log.info(f"Output folder: {args.output}") 107 | log.info(f"Month: {args.month}") 108 | log.info(f"Compression level: {level}") 109 | 110 | build_month( 111 | month, 112 | args.input, 113 | args.output, 114 | args.type, 115 | level 116 | ) 117 | -------------------------------------------------------------------------------- /personal/combine/classes.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import os 3 | import discord_logging 4 | import sys 5 | import zstandard 6 | import json 7 | from enum import Enum 8 | from sortedcontainers import SortedList 9 | from collections import defaultdict 10 | 11 | log = discord_logging.get_logger() 12 | 13 | import utils 14 | import merge 15 | 16 | NEWLINE_ENCODED = "\n".encode('utf-8') 17 | 18 | 19 | class ApiRequest: 20 | def __init__(self, ids, is_submission, ingest_name, estimated_datetime=None, missing_expected=False): 21 | self.ids = ids 22 | self.is_submission = is_submission 23 | self.ingest_name = ingest_name 24 | self.estimated_datetime = estimated_datetime 25 | self.missing_expected = missing_expected 26 | self.results = None 27 | self.complete = False 28 | 
self.tries = 0 29 | self.prev_lengths = [] 30 | 31 | def should_retry(self): 32 | if self.complete: 33 | return False # the request is complete, no need to retry 34 | if len(self.prev_lengths) <= 1: 35 | return True # we've only made one attempt and it didn't work, do retry 36 | if self.prev_lengths[-1] == 0: 37 | if len(self.prev_lengths) < (10 if self.missing_expected else 100): 38 | return True # the most recent result was 0 objects, retry up to 100 times 39 | else: 40 | log.info(f"Force finished request with retries: {self}") 41 | self.complete = True 42 | return False 43 | if self.prev_lengths[-1] == self.prev_lengths[-2]: 44 | if self.missing_expected: 45 | self.complete = True 46 | return False # the latest two requests were the same and we're expecting missing objects, mark as complete 47 | elif len(self.prev_lengths) >= 4 and \ 48 | self.prev_lengths[-1] == self.prev_lengths[-3] and \ 49 | self.prev_lengths[-1] == self.prev_lengths[-4]: 50 | log.info(f"Force finished request with retries: {self}") 51 | self.complete = True 52 | return False # the latest four requests were the same, go ahead and mark as complete 53 | return True # recent requests didn't match, and weren't 0, go ahead and retry 54 | 55 | def get_body_key(self): 56 | return "self_text" if self.is_submission else "body" 57 | 58 | def get_string_type(self): 59 | return "submission" if self.is_submission else "comment" 60 | 61 | def get_prefix(self): 62 | return "t3_" if self.is_submission else "t1_" 63 | 64 | def set_results(self, results): 65 | self.prev_lengths.append(len(results)) 66 | self.results = [] 67 | current_timestamp = int(datetime.utcnow().timestamp()) 68 | for result in results: 69 | obj = result['data'] 70 | if 'body_html' in obj: 71 | del obj['body_html'] 72 | if 'selftext_html' in obj: 73 | del obj['selftext_html'] 74 | obj['retrieved_on'] = current_timestamp 75 | self.results.append(obj) 76 | log.debug(f"Set result: {self}") 77 | 78 | def id_string(self): 79 | return f"{self.get_prefix()}{(f',{self.get_prefix()}'.join(self.ids))}" 80 | 81 | def __str__(self): 82 | return \ 83 | f"{self.ingest_name}: {self.ids[0]}-{self.ids[-1]} {self.get_string_type()}: " \ 84 | f"{len(self.results) if self.results else self.results} : {self.tries} : " \ 85 | f"{self.complete} : {','.join([str(val) for val in self.prev_lengths])}" 86 | 87 | def __gt__(self, other): 88 | if isinstance(other, ApiRequest): 89 | return False 90 | return True 91 | 92 | def __lt__(self, other): 93 | if isinstance(other, ApiRequest): 94 | return True 95 | return False 96 | 97 | def __eq__(self, other): 98 | if isinstance(other, ApiRequest): 99 | return True 100 | return False 101 | 102 | 103 | class Queue: 104 | def __init__(self, max_size): 105 | self.list = [] 106 | self.max_size = max_size 107 | 108 | def put(self, item): 109 | if len(self.list) >= self.max_size: 110 | self.list.pop(0) 111 | self.list.append(item) 112 | 113 | def peek(self): 114 | return self.list[0] if len(self.list) > 0 else None 115 | 116 | 117 | class OutputHandle: 118 | def __init__(self, is_submission, dump_folder): 119 | self.handle = None 120 | self.current_path = None 121 | self.current_minute = None 122 | self.is_submission = is_submission 123 | self.dump_folder = dump_folder 124 | 125 | if not os.path.exists(dump_folder): 126 | os.makedirs(dump_folder) 127 | 128 | def matched_minute(self, new_date_time): 129 | return self.current_minute is not None and new_date_time.minute == self.current_minute 130 | 131 | def get_path(self, date_folder, export_filename, 
increment=None): 132 | folder = f"{self.dump_folder}{os.path.sep}{date_folder}" 133 | if not os.path.exists(folder): 134 | os.makedirs(folder) 135 | 136 | bldr = [folder] 137 | bldr.append(os.path.sep) 138 | if self.is_submission: 139 | bldr.append("RS_") 140 | else: 141 | bldr.append("RC_") 142 | bldr.append(export_filename) 143 | if increment is not None: 144 | bldr.append("_") 145 | bldr.append(str(increment)) 146 | bldr.append(".zst") 147 | 148 | return ''.join(bldr) 149 | 150 | def rollover_to_minute(self, date_time): 151 | if self.handle is not None: 152 | self.handle.close() 153 | os.rename(self.current_path + ".tmp", self.current_path) 154 | date_folder = date_time.strftime('%y-%m-%d') 155 | export_filename = date_time.strftime('%y-%m-%d_%H-%M') 156 | export_path = self.get_path(date_folder, export_filename) 157 | if os.path.exists(export_path + ".tmp"): 158 | os.rename(export_path + ".tmp", export_path) 159 | i = 0 160 | while os.path.exists(export_path): 161 | log.info(f"Dump exists, incrementing: {export_path}") 162 | i += 1 163 | export_path = self.get_path(date_folder, export_filename, i) 164 | if i > 100: 165 | log.warning(f"Something went wrong, more than 100 dumps for minute, aborting") 166 | sys.exit(3) 167 | self.current_path = export_path 168 | self.handle = zstandard.ZstdCompressor().stream_writer(open(export_path + ".tmp", 'wb')) 169 | self.current_minute = date_time.minute 170 | 171 | def write_object(self, obj): 172 | self.handle.write(json.dumps(obj, sort_keys=True).encode('utf-8')) 173 | self.handle.write(NEWLINE_ENCODED) 174 | 175 | def flush(self): 176 | self.handle.flush() 177 | 178 | def close(self): 179 | if self.handle is not None: 180 | self.handle.close() 181 | 182 | 183 | class IngestType(Enum): 184 | INGEST = 1 185 | RESCAN = 2 186 | DOWNLOAD = 3 187 | PUSHSHIFT = 4 188 | BACKFILL = 5 189 | MISSING = 6 190 | 191 | 192 | class ObjectDict: 193 | def __init__(self, min_datetime, max_datetime, obj_type): 194 | self.min_datetime = min_datetime 195 | self.max_datetime = max_datetime 196 | self.obj_type = obj_type 197 | 198 | self.counts = defaultdict(lambda: defaultdict(lambda: defaultdict(int))) 199 | self.min_id = None 200 | self.max_id = None 201 | 202 | self.by_id = {} 203 | self.by_minute = defaultdict(ObjectMinuteList) 204 | 205 | def contains_id(self, str_id): 206 | return str_id in self.by_id 207 | 208 | def delete_object_id(self, str_id): 209 | del self.by_id[str_id] 210 | 211 | def delete_objects_below_minute(self, delete_below_minute): 212 | for minute, minute_list in self.by_minute.items(): 213 | if minute < delete_below_minute: 214 | for obj in minute_list.obj_list: 215 | self.delete_object_id(obj['id']) 216 | 217 | def rebuild_minute_dict(self): 218 | self.by_minute = defaultdict(ObjectMinuteList) 219 | for obj in self.by_id.values(): 220 | created_minute = datetime.utcfromtimestamp(obj["created_utc"]).replace(second=0, microsecond=0) 221 | self.by_minute[created_minute].add(obj) 222 | 223 | def count_minutes(self): 224 | return len(self.by_minute) 225 | 226 | @staticmethod 227 | def get_counts_string_from_dict(counts_dict, ingest_types): 228 | bldr = [] 229 | for ingest_type in ingest_types: 230 | if ingest_type in counts_dict: 231 | bldr.append(f"{counts_dict[ingest_type][True]}({counts_dict[ingest_type][False]})") 232 | else: 233 | bldr.append("0(0)") 234 | return "|".join(bldr) 235 | 236 | def get_counts_string_by_minute(self, minute, ingest_types): 237 | count_string = ObjectDict.get_counts_string_from_dict(self.counts[minute], ingest_types) 
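		# count_string holds one "added(duplicate)" pair per ingest type, joined with "|":
		# the True bucket in self.counts tracks objects seen for the first time in this minute,
		# the False bucket tracks re-ingested copies that were merged into an existing object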
238 | minute_dict = self.by_minute.get(minute) 239 | if minute_dict is None or minute_dict.max_id is None or minute_dict.min_id is None: 240 | range_string = "" 241 | else: 242 | range_string = f" - {len(minute_dict.obj_list)}({minute_dict.max_id - minute_dict.min_id}) ({utils.base36encode(minute_dict.min_id)}-{utils.base36encode(minute_dict.max_id)})" 243 | return count_string + range_string 244 | 245 | def get_counts_string(self): 246 | sum_dict = defaultdict(lambda: defaultdict(int)) 247 | for counts_dict in self.counts.values(): 248 | for ingest_type in IngestType: 249 | if ingest_type in counts_dict: 250 | sum_dict[ingest_type][True] += counts_dict[ingest_type][True] 251 | sum_dict[ingest_type][False] += counts_dict[ingest_type][False] 252 | 253 | return ObjectDict.get_counts_string_from_dict(sum_dict, IngestType) 254 | 255 | def get_missing_ids_by_minutes(self, start_minute, end_minute, ignore_ids): 256 | start_id = self.by_minute[start_minute].min_id 257 | end_id = self.by_minute[end_minute].max_id 258 | missing_ids = [] 259 | count_ignored_ids = 0 260 | for int_id in range(start_id, end_id + 1): 261 | ignored = False 262 | for ignore_start, ignore_end in ignore_ids: 263 | if ignore_start <= int_id <= ignore_end: 264 | count_ignored_ids += 1 265 | ignored = True 266 | break 267 | if ignored: 268 | continue 269 | 270 | string_id = utils.base36encode(int_id) 271 | if not self.contains_id(string_id): 272 | missing_ids.append(string_id) 273 | if count_ignored_ids > 0: 274 | log.warning(f"Ignored {count_ignored_ids} ids in range {utils.base36encode(start_id)}-{utils.base36encode(end_id)}") 275 | return missing_ids, start_id, end_id 276 | 277 | def add_object(self, obj, ingest_type): 278 | created_utc = datetime.utcfromtimestamp(obj["created_utc"]) 279 | created_minute = created_utc.replace(second=0, microsecond=0) 280 | if obj['id'] in self.by_id: 281 | existing_obj = self.by_id[obj['id']] 282 | unmatched_field = merge.merge_fields(existing_obj, obj, self.obj_type) 283 | self.counts[created_minute][ingest_type][False] += 1 284 | return unmatched_field 285 | if created_utc < self.min_datetime or created_utc > self.max_datetime: 286 | return False 287 | unmatched_field = merge.parse_fields(obj, self.obj_type) 288 | self.by_id[obj['id']] = obj 289 | self.by_minute[created_minute].add(obj) 290 | self.counts[created_minute][ingest_type][True] += 1 291 | self.min_id, self.max_id = utils.merge_lowest_highest_id(obj['id'], self.min_id, self.max_id) 292 | return unmatched_field 293 | 294 | def add_missing_object(self, obj_id): 295 | if obj_id in self.by_id: 296 | return 297 | int_id = utils.base36decode(obj_id) 298 | for minute, minute_dict in self.by_minute.items(): 299 | if minute_dict.min_id is None: 300 | continue 301 | if minute_dict.min_id < int_id < minute_dict.max_id: 302 | self.counts[minute][IngestType.MISSING][True] += 1 303 | return 304 | 305 | 306 | class ObjectMinuteList: 307 | def __init__(self): 308 | self.obj_list = SortedList(key=lambda x: f"{x['created_utc']}:{x['id']}") 309 | self.min_id = None 310 | self.max_id = None 311 | 312 | def add(self, obj): 313 | self.min_id, self.max_id = utils.merge_lowest_highest_id(obj['id'], self.min_id, self.max_id) 314 | self.obj_list.add(obj) 315 | -------------------------------------------------------------------------------- /personal/combine/merge.py: -------------------------------------------------------------------------------- 1 | import re 2 | import sys 3 | from enum import Enum 4 | from datetime import datetime 5 | import 
discord_logging 6 | 7 | log = discord_logging.get_logger() 8 | 9 | 10 | class FieldAction(Enum): 11 | OVERWRITE = 1 12 | OVERWRITE_NOT_NONE = 2 13 | OVERWRITE_IF_NONE = 3 14 | DONT_OVERWRITE = 4 15 | DELETE = 5 16 | SPECIAL = 6 17 | SPECIAL_NO_OVERWRITE = 7 18 | ALLOW = 8 19 | ALLOW_EMPTY = 9 20 | 21 | 22 | class ObjectType(Enum): 23 | COMMENT = 1 24 | SUBMISSION = 2 25 | 26 | 27 | field_actions = { 28 | ObjectType.COMMENT: { 29 | "_meta": FieldAction.OVERWRITE, 30 | "all_awardings": FieldAction.OVERWRITE_NOT_NONE, 31 | "approved": FieldAction.DELETE, 32 | "approved_at_utc": FieldAction.SPECIAL_NO_OVERWRITE, 33 | "approved_by": FieldAction.SPECIAL_NO_OVERWRITE, 34 | "archived": FieldAction.OVERWRITE, 35 | "associated_award": FieldAction.ALLOW_EMPTY, 36 | "author": FieldAction.OVERWRITE_IF_NONE, 37 | "author_cakeday": FieldAction.DONT_OVERWRITE, 38 | "author_flair_background_color": FieldAction.OVERWRITE_IF_NONE, 39 | "author_flair_css_class": FieldAction.OVERWRITE_IF_NONE, 40 | "author_flair_richtext": FieldAction.OVERWRITE_IF_NONE, 41 | "author_flair_template_id": FieldAction.OVERWRITE_IF_NONE, 42 | "author_flair_text": FieldAction.OVERWRITE_IF_NONE, 43 | "author_flair_text_color": FieldAction.OVERWRITE_IF_NONE, 44 | "author_flair_type": FieldAction.OVERWRITE_IF_NONE, 45 | "author_fullname": FieldAction.OVERWRITE_IF_NONE, 46 | "author_is_blocked": FieldAction.SPECIAL_NO_OVERWRITE, 47 | "author_patreon_flair": FieldAction.OVERWRITE, 48 | "author_premium": FieldAction.OVERWRITE, 49 | "awarders": FieldAction.OVERWRITE_IF_NONE, 50 | "ban_note": FieldAction.DELETE, 51 | "banned_at_utc": FieldAction.SPECIAL_NO_OVERWRITE, 52 | "banned_by": FieldAction.SPECIAL_NO_OVERWRITE, 53 | "body": FieldAction.SPECIAL, 54 | "body_html": FieldAction.DELETE, 55 | "body_sha1": FieldAction.OVERWRITE_NOT_NONE, 56 | "can_gild": FieldAction.OVERWRITE, 57 | "can_mod_post": FieldAction.SPECIAL_NO_OVERWRITE, 58 | "collapsed": FieldAction.OVERWRITE, 59 | "collapsed_because_crowd_control": FieldAction.ALLOW_EMPTY, 60 | "collapsed_reason": FieldAction.OVERWRITE, 61 | "collapsed_reason_code": FieldAction.OVERWRITE, 62 | "comment_type": FieldAction.OVERWRITE_NOT_NONE, 63 | "controversiality": FieldAction.OVERWRITE, 64 | "created": FieldAction.OVERWRITE_IF_NONE, 65 | "created_utc": FieldAction.OVERWRITE_IF_NONE, 66 | "distinguished": FieldAction.OVERWRITE, 67 | "downs": FieldAction.OVERWRITE_IF_NONE, 68 | "editable": FieldAction.OVERWRITE, 69 | "edited": FieldAction.OVERWRITE_NOT_NONE, 70 | "edited_on": FieldAction.ALLOW, 71 | "expression_asset_data": FieldAction.OVERWRITE_NOT_NONE, 72 | "gilded": FieldAction.OVERWRITE_NOT_NONE, 73 | "gildings": FieldAction.OVERWRITE_NOT_NONE, 74 | "id": FieldAction.ALLOW, 75 | "ignore_reports": FieldAction.DELETE, 76 | "is_submitter": FieldAction.DONT_OVERWRITE, 77 | "likes": FieldAction.OVERWRITE_NOT_NONE, 78 | "link_id": FieldAction.ALLOW, 79 | "locked": FieldAction.OVERWRITE, 80 | "media_metadata": FieldAction.OVERWRITE, 81 | "mod_note": FieldAction.ALLOW_EMPTY, 82 | "mod_reason_by": FieldAction.SPECIAL_NO_OVERWRITE, 83 | "mod_reason_title": FieldAction.SPECIAL_NO_OVERWRITE, 84 | "mod_reports": FieldAction.SPECIAL_NO_OVERWRITE, 85 | "mod_reports_dismissed": FieldAction.SPECIAL_NO_OVERWRITE, 86 | "name": FieldAction.OVERWRITE_IF_NONE, 87 | "nest_level": FieldAction.OVERWRITE_NOT_NONE, 88 | "no_follow": FieldAction.OVERWRITE, 89 | "num_reports": FieldAction.SPECIAL_NO_OVERWRITE, 90 | "parent_id": FieldAction.OVERWRITE_IF_NONE, 91 | "permalink": FieldAction.DONT_OVERWRITE, 92 | 
"removal_reason": FieldAction.SPECIAL, 93 | "removed": FieldAction.DELETE, 94 | "replies": FieldAction.OVERWRITE_IF_NONE, 95 | "report_reasons": FieldAction.SPECIAL_NO_OVERWRITE, 96 | "retrieved_on": FieldAction.SPECIAL, 97 | "retrieved_utc": FieldAction.SPECIAL, 98 | "rte_mode": FieldAction.OVERWRITE_NOT_NONE, 99 | "saved": FieldAction.SPECIAL_NO_OVERWRITE, 100 | "score": FieldAction.SPECIAL, 101 | "score_hidden": FieldAction.OVERWRITE, 102 | "send_replies": FieldAction.OVERWRITE, 103 | "spam": FieldAction.DELETE, 104 | "stickied": FieldAction.OVERWRITE, 105 | "subreddit": FieldAction.OVERWRITE_NOT_NONE, 106 | "subreddit_id": FieldAction.OVERWRITE_NOT_NONE, 107 | "subreddit_name_prefixed": FieldAction.OVERWRITE_NOT_NONE, 108 | "subreddit_type": FieldAction.DONT_OVERWRITE, 109 | "top_awarded_type": FieldAction.ALLOW_EMPTY, 110 | "total_awards_received": FieldAction.OVERWRITE_NOT_NONE, 111 | "treatment_tags": FieldAction.OVERWRITE_NOT_NONE, 112 | "unrepliable_reason": FieldAction.OVERWRITE_NOT_NONE, 113 | "ups": FieldAction.OVERWRITE_NOT_NONE, 114 | "user_reports": FieldAction.SPECIAL_NO_OVERWRITE, 115 | "user_reports_dismissed": FieldAction.SPECIAL_NO_OVERWRITE, 116 | "updated_on": FieldAction.SPECIAL, 117 | "updated_utc": FieldAction.SPECIAL, 118 | "utc_datetime_str": FieldAction.DELETE, 119 | }, 120 | ObjectType.SUBMISSION: { 121 | "_meta": FieldAction.OVERWRITE, 122 | "ad_business": FieldAction.OVERWRITE_NOT_NONE, 123 | "ad_promoted_user_posts": FieldAction.OVERWRITE_NOT_NONE, 124 | "ad_supplementary_text_md": FieldAction.OVERWRITE_NOT_NONE, 125 | "ad_user_targeting": FieldAction.OVERWRITE_NOT_NONE, 126 | "adserver_click_url": FieldAction.ALLOW_EMPTY, 127 | "adserver_imp_pixel": FieldAction.ALLOW_EMPTY, 128 | "all_awardings": FieldAction.OVERWRITE_NOT_NONE, 129 | "allow_live_comments": FieldAction.OVERWRITE, 130 | "app_store_data": FieldAction.ALLOW_EMPTY, 131 | "approved": FieldAction.DELETE, 132 | "approved_at_utc": FieldAction.SPECIAL_NO_OVERWRITE, 133 | "approved_by": FieldAction.SPECIAL_NO_OVERWRITE, 134 | "archived": FieldAction.ALLOW_EMPTY, 135 | "author": FieldAction.OVERWRITE_IF_NONE, 136 | "author_cakeday": FieldAction.DONT_OVERWRITE, 137 | "author_flair_background_color": FieldAction.OVERWRITE_NOT_NONE, 138 | "author_flair_css_class": FieldAction.OVERWRITE_NOT_NONE, 139 | "author_flair_richtext": FieldAction.OVERWRITE_NOT_NONE, 140 | "author_flair_template_id": FieldAction.OVERWRITE_NOT_NONE, 141 | "author_flair_text": FieldAction.OVERWRITE_NOT_NONE, 142 | "author_flair_text_color": FieldAction.OVERWRITE_NOT_NONE, 143 | "author_flair_type": FieldAction.OVERWRITE_NOT_NONE, 144 | "author_fullname": FieldAction.OVERWRITE_NOT_NONE, 145 | "author_id": FieldAction.OVERWRITE_NOT_NONE, 146 | "author_is_blocked": FieldAction.SPECIAL_NO_OVERWRITE, 147 | "author_patreon_flair": FieldAction.OVERWRITE, 148 | "author_premium": FieldAction.OVERWRITE, 149 | "awarders": FieldAction.ALLOW_EMPTY, 150 | "ban_note": FieldAction.DELETE, 151 | "banned_at_utc": FieldAction.SPECIAL_NO_OVERWRITE, 152 | "banned_by": FieldAction.SPECIAL_NO_OVERWRITE, 153 | "call_to_action": FieldAction.OVERWRITE, 154 | "campaign_id": FieldAction.ALLOW_EMPTY, 155 | "can_gild": FieldAction.OVERWRITE, 156 | "can_mod_post": FieldAction.SPECIAL_NO_OVERWRITE, 157 | "category": FieldAction.OVERWRITE_NOT_NONE, 158 | "clicked": FieldAction.SPECIAL_NO_OVERWRITE, 159 | "collections": FieldAction.OVERWRITE_NOT_NONE, 160 | "content_categories": FieldAction.ALLOW, 161 | "contest_mode": FieldAction.OVERWRITE, 162 | "created": 
FieldAction.OVERWRITE_IF_NONE, 163 | "created_utc": FieldAction.OVERWRITE_IF_NONE, 164 | "crosspost_parent": FieldAction.OVERWRITE_NOT_NONE, 165 | "crosspost_parent_list": FieldAction.OVERWRITE_NOT_NONE, 166 | "discussion_type": FieldAction.OVERWRITE_NOT_NONE, 167 | "distinguished": FieldAction.OVERWRITE, 168 | "domain": FieldAction.OVERWRITE_NOT_NONE, 169 | "domain_override": FieldAction.OVERWRITE_NOT_NONE, 170 | "downs": FieldAction.SPECIAL_NO_OVERWRITE, 171 | "edited": FieldAction.OVERWRITE, 172 | "edited_on": FieldAction.ALLOW, 173 | "embed_type": FieldAction.OVERWRITE_NOT_NONE, 174 | "embed_url": FieldAction.OVERWRITE_NOT_NONE, 175 | "event_end": FieldAction.OVERWRITE_NOT_NONE, 176 | "event_is_live": FieldAction.OVERWRITE_NOT_NONE, 177 | "event_start": FieldAction.OVERWRITE_NOT_NONE, 178 | "events": FieldAction.ALLOW_EMPTY, 179 | "eventsOnRender": FieldAction.ALLOW_EMPTY, 180 | "gallery_data": FieldAction.OVERWRITE_NOT_NONE, 181 | "gilded": FieldAction.OVERWRITE_NOT_NONE, 182 | "gildings": FieldAction.OVERWRITE_NOT_NONE, 183 | "hidden": FieldAction.SPECIAL_NO_OVERWRITE, 184 | "hide_score": FieldAction.OVERWRITE, 185 | "href_url": FieldAction.DONT_OVERWRITE, 186 | "id": FieldAction.ALLOW, 187 | "ignore_reports": FieldAction.DELETE, 188 | "impression_id": FieldAction.ALLOW_EMPTY, 189 | "impression_id_str": FieldAction.ALLOW_EMPTY, 190 | "is_blank": FieldAction.ALLOW_EMPTY, 191 | "is_created_from_ads_ui": FieldAction.OVERWRITE_NOT_NONE, 192 | "is_crosspostable": FieldAction.OVERWRITE, 193 | "is_gallery": FieldAction.OVERWRITE_NOT_NONE, 194 | "is_meta": FieldAction.OVERWRITE, 195 | "is_original_content": FieldAction.OVERWRITE, 196 | "is_reddit_media_domain": FieldAction.OVERWRITE, 197 | "is_robot_indexable": FieldAction.OVERWRITE, 198 | "is_self": FieldAction.DONT_OVERWRITE, 199 | "is_survey_ad": FieldAction.ALLOW_EMPTY, 200 | "is_video": FieldAction.OVERWRITE, 201 | "likes": FieldAction.OVERWRITE_NOT_NONE, 202 | "link_flair_background_color": FieldAction.OVERWRITE_NOT_NONE, 203 | "link_flair_css_class": FieldAction.OVERWRITE_NOT_NONE, 204 | "link_flair_richtext": FieldAction.OVERWRITE_NOT_NONE, 205 | "link_flair_template_id": FieldAction.OVERWRITE_NOT_NONE, 206 | "link_flair_text": FieldAction.OVERWRITE_NOT_NONE, 207 | "link_flair_text_color": FieldAction.OVERWRITE_NOT_NONE, 208 | "link_flair_type": FieldAction.OVERWRITE_NOT_NONE, 209 | "locked": FieldAction.OVERWRITE, 210 | "media": FieldAction.OVERWRITE_NOT_NONE, 211 | "media_embed": FieldAction.OVERWRITE_NOT_NONE, 212 | "media_metadata": FieldAction.OVERWRITE_NOT_NONE, 213 | "media_only": FieldAction.OVERWRITE, 214 | "mobile_ad_url": FieldAction.OVERWRITE_NOT_NONE, 215 | "mod_note": FieldAction.SPECIAL_NO_OVERWRITE, 216 | "mod_reason_by": FieldAction.SPECIAL_NO_OVERWRITE, 217 | "mod_reason_title": FieldAction.SPECIAL_NO_OVERWRITE, 218 | "mod_reports": FieldAction.SPECIAL_NO_OVERWRITE, 219 | "mod_reports_dismissed": FieldAction.SPECIAL_NO_OVERWRITE, 220 | "name": FieldAction.OVERWRITE_IF_NONE, 221 | "no_follow": FieldAction.OVERWRITE, 222 | "num_comments": FieldAction.OVERWRITE_NOT_NONE, 223 | "num_crossposts": FieldAction.OVERWRITE, 224 | "num_reports": FieldAction.SPECIAL_NO_OVERWRITE, 225 | "original_link": FieldAction.ALLOW_EMPTY, 226 | "outbound_link": FieldAction.ALLOW_EMPTY, 227 | "over_18": FieldAction.OVERWRITE, 228 | "parent_whitelist_status": FieldAction.OVERWRITE, 229 | "permalink": FieldAction.DONT_OVERWRITE, 230 | "pinned": FieldAction.ALLOW_EMPTY, 231 | "poll_data": FieldAction.OVERWRITE_NOT_NONE, 232 | "post_hint": 
FieldAction.OVERWRITE, 233 | "preview": FieldAction.OVERWRITE_NOT_NONE, 234 | "previous_selftext": FieldAction.ALLOW, 235 | "priority_id": FieldAction.ALLOW_EMPTY, 236 | "product_ids": FieldAction.ALLOW_EMPTY, 237 | "promo_layout": FieldAction.OVERWRITE, 238 | "promoted": FieldAction.ALLOW_EMPTY, 239 | "promoted_by": FieldAction.ALLOW_EMPTY, 240 | "promoted_display_name": FieldAction.ALLOW_EMPTY, 241 | "promoted_url": FieldAction.ALLOW_EMPTY, 242 | "pwls": FieldAction.OVERWRITE, 243 | "quarantine": FieldAction.DONT_OVERWRITE, 244 | "removal_reason": FieldAction.SPECIAL, 245 | "removed": FieldAction.DELETE, 246 | "removed_by": FieldAction.SPECIAL_NO_OVERWRITE, 247 | "removed_by_category": FieldAction.OVERWRITE, 248 | "report_reasons": FieldAction.SPECIAL_NO_OVERWRITE, 249 | "retrieved_on": FieldAction.SPECIAL, 250 | "retrieved_utc": FieldAction.SPECIAL, 251 | "rte_mode": FieldAction.OVERWRITE_NOT_NONE, 252 | "saved": FieldAction.SPECIAL_NO_OVERWRITE, 253 | "score": FieldAction.SPECIAL, 254 | "secure_media": FieldAction.OVERWRITE_NOT_NONE, 255 | "secure_media_embed": FieldAction.OVERWRITE_NOT_NONE, 256 | "selftext": FieldAction.SPECIAL, 257 | "selftext_html": FieldAction.DELETE, 258 | "send_replies": FieldAction.OVERWRITE, 259 | "show_media": FieldAction.ALLOW, 260 | "sk_ad_network_data": FieldAction.ALLOW_EMPTY, 261 | "spam": FieldAction.DELETE, 262 | "spoiler": FieldAction.OVERWRITE, 263 | "stickied": FieldAction.OVERWRITE, 264 | "subcaption": FieldAction.OVERWRITE, 265 | "subreddit": FieldAction.OVERWRITE_NOT_NONE, 266 | "subreddit_id": FieldAction.OVERWRITE_NOT_NONE, 267 | "subreddit_name_prefixed": FieldAction.OVERWRITE_NOT_NONE, 268 | "subreddit_subscribers": FieldAction.OVERWRITE_IF_NONE, 269 | "subreddit_type": FieldAction.DONT_OVERWRITE, 270 | "suggested_sort": FieldAction.OVERWRITE, 271 | "third_party_trackers": FieldAction.ALLOW_EMPTY, 272 | "third_party_tracking": FieldAction.ALLOW_EMPTY, 273 | "third_party_tracking_2": FieldAction.ALLOW_EMPTY, 274 | "thumbnail": FieldAction.OVERWRITE_NOT_NONE, 275 | "thumbnail_height": FieldAction.OVERWRITE_NOT_NONE, 276 | "thumbnail_width": FieldAction.OVERWRITE_NOT_NONE, 277 | "title": FieldAction.DONT_OVERWRITE, 278 | "top_awarded_type": FieldAction.OVERWRITE, 279 | "total_awards_received": FieldAction.OVERWRITE_NOT_NONE, 280 | "treatment_tags": FieldAction.OVERWRITE_NOT_NONE, 281 | "tournament_data": FieldAction.OVERWRITE_NOT_NONE, 282 | "unrepliable_reason": FieldAction.OVERWRITE_NOT_NONE, 283 | "updated_on": FieldAction.SPECIAL, 284 | "updated_utc": FieldAction.SPECIAL, 285 | "ups": FieldAction.OVERWRITE_NOT_NONE, 286 | "upvote_ratio": FieldAction.OVERWRITE, 287 | "url": FieldAction.OVERWRITE_NOT_NONE, 288 | "url_overridden_by_dest": FieldAction.OVERWRITE_NOT_NONE, 289 | "user_reports": FieldAction.SPECIAL_NO_OVERWRITE, 290 | "user_reports_dismissed": FieldAction.SPECIAL_NO_OVERWRITE, 291 | "utc_datetime_str": FieldAction.DELETE, 292 | "view_count": FieldAction.ALLOW_EMPTY, 293 | "visited": FieldAction.SPECIAL_NO_OVERWRITE, 294 | "whitelist_status": FieldAction.OVERWRITE, 295 | "wls": FieldAction.OVERWRITE, 296 | }, 297 | } 298 | 299 | 300 | def is_empty(value): 301 | return value is None \ 302 | or value == "" \ 303 | or value == "[deleted]" \ 304 | or value == "[removed]" \ 305 | or value == [] \ 306 | or value == {} \ 307 | or value is False \ 308 | or value == 0 309 | 310 | 311 | def replace(match): 312 | if match.group(0) == "amp;": return "" 313 | if match.group(0) == "<": return "<" 314 | if match.group(0) == ">": return ">" 315 | 
log.warning(f"Unknown group: {match}") 316 | sys.exit(2) 317 | 318 | 319 | unencode_regex = re.compile(r"amp;|<|>") 320 | 321 | 322 | def merge_fields(existing_obj, new_obj, obj_type): 323 | unmatched_field = False 324 | type_actions = field_actions[obj_type] 325 | for key, new_value in new_obj.items(): 326 | action = type_actions.get(key) 327 | 328 | original_value = existing_obj.get(key) 329 | if new_value != original_value: 330 | # if isinstance(new_value, str) and unencode_regex.search(new_value): 331 | # new_value_no_encode = unencode_regex.sub(replace, new_value) 332 | # if new_value_no_encode == original_value: 333 | # continue 334 | if action == FieldAction.OVERWRITE: 335 | existing_obj[key] = new_value 336 | elif action == FieldAction.OVERWRITE_NOT_NONE: 337 | if not is_empty(new_value): 338 | existing_obj[key] = new_value 339 | elif action == FieldAction.OVERWRITE_IF_NONE: 340 | if is_empty(original_value): 341 | existing_obj[key] = new_value 342 | elif action == FieldAction.SPECIAL: 343 | if key == "body": 344 | if not is_empty(new_value): 345 | if 'previous_body' in existing_obj: 346 | existing_obj['previous_body'] = original_value 347 | existing_obj['body'] = new_value 348 | elif key == "score": 349 | if not is_empty(new_value): 350 | if is_empty(original_value) or abs(new_value) > abs(original_value): 351 | existing_obj['score'] = new_value 352 | elif key == "selftext": 353 | if not is_empty(new_value): 354 | if 'previous_selftext' not in existing_obj: 355 | existing_obj['previous_selftext'] = original_value 356 | existing_obj['selftext'] = new_value 357 | elif key == "removal_reason" and new_value in ["legal", None]: 358 | existing_obj[key] = new_value 359 | elif key in ["retrieved_on", "retrieved_utc"]: 360 | prev_retrieved_on = existing_obj["retrieved_on"] 361 | if new_value < prev_retrieved_on: 362 | existing_obj["retrieved_on"] = new_value 363 | existing_obj["updated_on"] = prev_retrieved_on 364 | if new_value > prev_retrieved_on: 365 | existing_obj["updated_on"] = new_value 366 | elif key in ["updated_on", "updated_utc"]: 367 | if new_value > existing_obj["updated_on"]: 368 | existing_obj["updated_on"] = new_value 369 | else: 370 | log.info(f"{new_obj['id']} unmatched special: {key}: {original_value} != {new_value}") 371 | unmatched_field = True 372 | elif action == FieldAction.DELETE or action == FieldAction.DONT_OVERWRITE or action == FieldAction.SPECIAL_NO_OVERWRITE: 373 | pass 374 | else: 375 | log.info(f"{new_obj['id']} unmatched no action: {key}|{action}: {original_value} != {new_value}") 376 | unmatched_field = True 377 | elif action is None: 378 | log.info(f"{new_obj['id']} matched no action: {key}: {new_value}") 379 | unmatched_field = True 380 | 381 | return unmatched_field 382 | 383 | 384 | def parse_fields(new_obj, obj_type): 385 | keys_to_delete = [] 386 | keys_to_add = [] 387 | unmatched_field = False 388 | type_actions = field_actions[obj_type] 389 | for key, new_value in new_obj.items(): 390 | action = type_actions.get(key) 391 | if action is not None: 392 | if action == FieldAction.DELETE: 393 | keys_to_delete.append(key) 394 | elif action == FieldAction.ALLOW_EMPTY: 395 | if not is_empty(new_value): 396 | log.info(f"{new_obj['id']} not empty: {key}: {new_value}") 397 | unmatched_field = True 398 | keys_to_delete.append(key) 399 | elif action == FieldAction.SPECIAL: 400 | if key in ["retrieved_on", "body", "selftext", "updated_on", "score"]: 401 | pass 402 | elif key == "removal_reason" and new_value in ["legal", None]: 403 | pass 404 | elif key == 
"retrieved_utc": 405 | keys_to_add.append(("retrieved_on", new_value)) 406 | keys_to_delete.append(key) 407 | elif key == "updated_utc": 408 | keys_to_add.append(("updated_on", new_value)) 409 | keys_to_delete.append(key) 410 | else: 411 | log.info(f"{new_obj['id']} special no match: {key}: {new_value}") 412 | unmatched_field = True 413 | keys_to_delete.append(key) 414 | elif action == FieldAction.SPECIAL_NO_OVERWRITE: 415 | if key in ["can_mod_post", "saved", "clicked", "visited", "author_is_blocked", "hidden"]: 416 | new_obj[key] = False 417 | elif key in ["banned_at_utc", "banned_by", "approved_at_utc", "approved_by", "user_reports_dismissed", "mod_reports_dismissed", "removed_by", "mod_note", "mod_reason_by", "mod_reason_title"]: 418 | new_obj[key] = None 419 | elif key in ["num_reports", "downs"]: 420 | new_obj[key] = 0 421 | elif key in ["report_reasons", "user_reports", "mod_reports"]: 422 | new_obj[key] = [] 423 | else: 424 | log.info(f"{new_obj['id']} special no overwrite no match: {key}: {new_value}") 425 | unmatched_field = True 426 | keys_to_delete.append(key) 427 | else: 428 | log.info(f"{new_obj['id']} no action: {key}: {new_value}") 429 | unmatched_field = True 430 | 431 | for key in keys_to_delete: 432 | del new_obj[key] 433 | 434 | for key, value in keys_to_add: 435 | new_obj[key] = value 436 | 437 | if 'retrieved_on' not in new_obj: 438 | new_obj['retrieved_on'] = int(datetime.utcnow().timestamp()) 439 | 440 | return unmatched_field 441 | -------------------------------------------------------------------------------- /personal/combine/merge_and_backfill.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import requests 3 | import time 4 | import discord_logging 5 | import argparse 6 | import os 7 | import re 8 | import zstandard 9 | from datetime import datetime, timedelta 10 | import json 11 | import praw 12 | from praw import endpoints 13 | import prawcore 14 | import logging.handlers 15 | 16 | sys.path.append('personal') 17 | 18 | log = discord_logging.get_logger(init=True) 19 | 20 | import utils 21 | import classes 22 | from classes import IngestType 23 | from merge import ObjectType 24 | 25 | 26 | NEWLINE_ENCODED = "\n".encode('utf-8') 27 | reg = re.compile(r"\d\d-\d\d-\d\d_\d\d-\d\d") 28 | 29 | 30 | def get_pushshift_token(old_token): 31 | saved_token = load_pushshift_token() 32 | if saved_token is None or old_token == saved_token: 33 | log.info(f"Requesting new token") 34 | result_token = re_auth_pushshift(old_token) 35 | save_pushshift_token(result_token) 36 | else: 37 | result_token = saved_token 38 | 39 | return result_token 40 | 41 | 42 | def save_pushshift_token(token): 43 | with open("pushshift.txt", 'w') as file: 44 | file.write(token) 45 | 46 | 47 | def load_pushshift_token(): 48 | with open("pushshift.txt", 'r') as file: 49 | token = file.read().strip() 50 | return token 51 | 52 | 53 | def re_auth_pushshift(old_token): 54 | url = f"https://auth.pushshift.io/refresh?access_token={old_token}" 55 | log.warning(f"Reauth request: {url}") 56 | response = requests.post(url) 57 | result = response.json() 58 | log.warning(f"Reauth response: {str(result)}") 59 | discord_logging.flush_discord() 60 | if 'access_token' in result: 61 | new_token = result['access_token'] 62 | log.warning(f"New pushshift token: {new_token}") 63 | save_pushshift_token(new_token) 64 | discord_logging.flush_discord() 65 | return new_token 66 | elif 'detail' in result: 67 | if result['detail'] == 'Access token is still active and can not 
be refreshed.': 68 | log.warning(f"Access token still active, trying request again") 69 | time.sleep(5) 70 | return old_token 71 | 72 | log.warning(f"Reauth failed: {result['detail']}") 73 | discord_logging.flush_discord() 74 | sys.exit(1) 75 | else: 76 | log.warning(f"Something went wrong re-authing") 77 | discord_logging.flush_discord() 78 | sys.exit(1) 79 | 80 | 81 | def query_pushshift(ids, bearer, object_type, pushshift_token_function): 82 | object_name = "comment" if object_type == ObjectType.COMMENT else "submission" 83 | url = f"https://api.pushshift.io/reddit/{object_name}/search?limit=1000&ids={','.join(ids)}" 84 | log.debug(f"pushshift query: {url}") 85 | response = None 86 | total_attempts = 100 87 | current_attempt = 0 88 | sleep_per_attempt = 10 89 | for current_attempt in range(total_attempts): 90 | try: 91 | response = requests.get(url, headers={ 92 | 'User-Agent': "In script by /u/Watchful1", 93 | 'Authorization': f"Bearer {bearer}"}, timeout=20) 94 | except (requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout) as err: 95 | log.info(f"Pushshift failed, sleeping {current_attempt * sleep_per_attempt} : {err}") 96 | time.sleep(current_attempt * sleep_per_attempt) 97 | continue 98 | if response is None: 99 | log.info(f"Pushshift failed, sleeping {current_attempt * sleep_per_attempt} : no response") 100 | time.sleep(current_attempt * sleep_per_attempt) 101 | continue 102 | if response.status_code == 200: 103 | break 104 | if response.status_code == 403: 105 | log.warning(f"Pushshift 403, trying reauth: {response.json()}") 106 | log.warning(url) 107 | log.warning(f"'Authorization': Bearer {bearer}") 108 | bearer = pushshift_token_function(bearer) 109 | log.info(f"Pushshift failed, sleeping {current_attempt * sleep_per_attempt} : status {response.status_code}") 110 | time.sleep(current_attempt * sleep_per_attempt) 111 | if response is None: 112 | log.warning(f"{current_attempt + 1} requests failed with no response") 113 | log.warning(url) 114 | log.warning(f"'Authorization': Bearer {bearer}") 115 | discord_logging.flush_discord() 116 | sys.exit(1) 117 | if response.status_code != 200: 118 | log.warning(f"{current_attempt + 1} requests failed with status code {response.status_code}") 119 | log.warning(url) 120 | log.warning(f"'Authorization': Bearer {bearer}") 121 | discord_logging.flush_discord() 122 | sys.exit(1) 123 | if current_attempt > 0: 124 | log.info(f"Pushshift call succeeded after {current_attempt + 1} retries") 125 | return response.json()['data'], bearer 126 | 127 | 128 | def query_reddit(ids, reddit, object_type): 129 | id_prefix = 't1_' if object_type == ObjectType.COMMENT else 't3_' 130 | id_string = f"{id_prefix}{(f',{id_prefix}'.join(ids))}" 131 | response = None 132 | for i in range(20): 133 | try: 134 | response = reddit.request(method="GET", path=endpoints.API_PATH["info"], params={"id": id_string}) 135 | break 136 | except (prawcore.exceptions.ServerError, prawcore.exceptions.RequestException) as err: 137 | log.info(f"No response from reddit api for {object_type}, sleeping {i * 5} seconds: {err} : {id_string}") 138 | time.sleep(i * 5) 139 | if response is None: 140 | log.warning(f"Reddit api failed, aborting") 141 | return [] 142 | return response['data']['children'] 143 | 144 | 145 | def end_of_day(input_minute): 146 | return input_minute.replace(hour=0, minute=0, second=0) + timedelta(days=1) 147 | 148 | 149 | def build_day(day_to_process, input_folders, output_folder, object_type, reddit, ignore_ids, pushshift_token_function): 150 | 
file_type = "comments" if object_type == ObjectType.COMMENT else "submissions" 151 | 152 | pushshift_token = pushshift_token_function(None) 153 | log.info(f"{file_type}: Using pushshift token: {pushshift_token}") 154 | 155 | file_minutes = {} 156 | minute_iterator = day_to_process - timedelta(minutes=2) 157 | end_time = end_of_day(day_to_process) + timedelta(minutes=2) 158 | while minute_iterator <= end_time: 159 | file_minutes[minute_iterator] = [] 160 | minute_iterator += timedelta(minutes=1) 161 | 162 | for merge_folder, ingest_type in input_folders: 163 | merge_date_folder = os.path.join(merge_folder, file_type, day_to_process.strftime('%y-%m-%d')) 164 | if os.path.exists(merge_date_folder): 165 | for file in os.listdir(merge_date_folder): 166 | match = reg.search(file) 167 | if not match: 168 | log.info(f"{file_type}: File doesn't match regex: {file}") 169 | continue 170 | file_date = datetime.strptime(match.group(), '%y-%m-%d_%H-%M') 171 | if file_date in file_minutes: 172 | file_minutes[file_date].append((os.path.join(merge_date_folder, file), ingest_type)) 173 | 174 | objects = classes.ObjectDict(day_to_process, day_to_process + timedelta(days=1) - timedelta(seconds=1), object_type) 175 | unmatched_field = False 176 | minute_iterator = day_to_process - timedelta(minutes=2) 177 | working_lowest_minute = day_to_process 178 | last_minute_of_day = end_of_day(day_to_process) - timedelta(minutes=1) 179 | while minute_iterator <= end_time: 180 | for ingest_file, ingest_type in file_minutes[minute_iterator]: 181 | for obj in utils.read_obj_zst(ingest_file): 182 | if objects.add_object(obj, ingest_type): 183 | unmatched_field = True 184 | log.info(f"{file_type}: Loaded {minute_iterator.strftime('%y-%m-%d_%H-%M')} : {objects.get_counts_string_by_minute(minute_iterator, [IngestType.INGEST, IngestType.RESCAN, IngestType.DOWNLOAD])}") 185 | 186 | if minute_iterator >= end_time or objects.count_minutes() >= 11: 187 | if minute_iterator > last_minute_of_day: 188 | working_highest_minute = last_minute_of_day 189 | else: 190 | working_highest_minute = minute_iterator - timedelta(minutes=1) 191 | missing_ids, start_id, end_id = objects.get_missing_ids_by_minutes(working_lowest_minute, working_highest_minute, ignore_ids) 192 | log.debug( 193 | f"{file_type}: Backfilling from: {working_lowest_minute.strftime('%y-%m-%d_%H-%M')} ({utils.base36encode(start_id)}|{start_id}) to " 194 | f"{working_highest_minute.strftime('%y-%m-%d_%H-%M')} ({utils.base36encode(end_id)}|{end_id}) with {len(missing_ids)} ({end_id - start_id}) ids") 195 | 196 | for chunk in utils.chunk_list(missing_ids, 50): 197 | pushshift_objects, pushshift_token = query_pushshift(chunk, pushshift_token, object_type, pushshift_token_function) 198 | for pushshift_object in pushshift_objects: 199 | if objects.add_object(pushshift_object, IngestType.PUSHSHIFT): 200 | unmatched_field = True 201 | 202 | for chunk in utils.chunk_list(missing_ids, 100): 203 | reddit_objects = query_reddit(chunk, reddit, object_type) 204 | for reddit_object in reddit_objects: 205 | if objects.add_object(reddit_object['data'], IngestType.BACKFILL): 206 | unmatched_field = True 207 | 208 | for missing_id in missing_ids: 209 | if missing_id not in objects.by_id: 210 | objects.add_missing_object(missing_id) 211 | 212 | objects.delete_objects_below_minute(working_lowest_minute) 213 | while working_lowest_minute <= working_highest_minute: 214 | folder = os.path.join(output_folder, file_type, working_lowest_minute.strftime('%y-%m-%d')) 215 | if not os.path.exists(folder): 
216 | os.makedirs(folder) 217 | output_path = os.path.join(folder, f"{('RC' if object_type == ObjectType.COMMENT else 'RS')}_{working_lowest_minute.strftime('%y-%m-%d_%H-%M')}.zst") 218 | output_handle = zstandard.ZstdCompressor().stream_writer(open(output_path, 'wb')) 219 | 220 | for obj in objects.by_minute[working_lowest_minute].obj_list: 221 | output_handle.write(json.dumps(obj, sort_keys=True).encode('utf-8')) 222 | output_handle.write(NEWLINE_ENCODED) 223 | objects.delete_object_id(obj['id']) 224 | log.info( 225 | f"{file_type}: Wrote up to {working_lowest_minute.strftime('%y-%m-%d_%H-%M')} : " 226 | f"{objects.get_counts_string_by_minute(working_lowest_minute, [IngestType.PUSHSHIFT, IngestType.BACKFILL, IngestType.MISSING])}") 227 | output_handle.close() 228 | working_lowest_minute += timedelta(minutes=1) 229 | 230 | objects.rebuild_minute_dict() 231 | 232 | discord_logging.flush_discord() 233 | if unmatched_field: 234 | log.warning(f"{file_type}: Unmatched field, aborting") 235 | discord_logging.flush_discord() 236 | sys.exit(1) 237 | 238 | minute_iterator += timedelta(minutes=1) 239 | 240 | log.info(f"{file_type}: Finished day {day_to_process.strftime('%y-%m-%d')}: {objects.get_counts_string()}") 241 | 242 | 243 | def merge_and_backfill(start_date, end_date, input_folders, output_folder, object_type, ignore_ids, reddit_username, pushshift_token_function): 244 | reddit = praw.Reddit(reddit_username) 245 | while start_date <= end_date: 246 | build_day(start_date, input_folders, output_folder, object_type, reddit, ignore_ids, pushshift_token_function) 247 | start_date = end_of_day(start_date) 248 | 249 | 250 | if __name__ == "__main__": 251 | parser = argparse.ArgumentParser(description="Combine the ingest and rescan files, clean and do pushshift lookups as needed") 252 | parser.add_argument("--type", help="The object type, either comments or submissions", required=True) 253 | parser.add_argument("--start_date", help="The start of the date range to process, format YY-MM-DD_HH-MM", required=True) 254 | parser.add_argument("--end_date", help="The end of the date range to process, format YY-MM-DD. 
If not provided, the script processes to the end of the day") 255 | parser.add_argument('--input', help='Input folder', required=True) 256 | parser.add_argument('--output', help='Output folder', required=True) 257 | parser.add_argument('--pushshift', help='The pushshift token') 258 | parser.add_argument("--debug", help="Enable debug logging", action='store_const', const=True, default=False) 259 | parser.add_argument("--ignore_ids", help="Ignore ids between the id ranges listed", default=None) 260 | args = parser.parse_args() 261 | 262 | if args.debug: 263 | discord_logging.set_level(logging.DEBUG) 264 | 265 | input_folders = [ 266 | (os.path.join(args.input, "ingest"), IngestType.INGEST), 267 | (os.path.join(args.input, "rescan"), IngestType.RESCAN), 268 | (os.path.join(args.input, "download"), IngestType.DOWNLOAD), 269 | ] 270 | 271 | if args.start_date is None: 272 | log.error(f"No start date provided") 273 | sys.exit(2) 274 | start_date = datetime.strptime(args.start_date, '%y-%m-%d_%H-%M') 275 | end_date = end_of_day(start_date) 276 | if args.end_date is not None: 277 | end_date = datetime.strptime(args.end_date, '%y-%m-%d') 278 | 279 | for input_folder, ingest_type in input_folders: 280 | log.info(f"Input folder: {input_folder}") 281 | log.info(f"Output folder: {args.output}") 282 | 283 | object_type = None 284 | if args.type == "comments": 285 | object_type = ObjectType.COMMENT 286 | elif args.type == "submissions": 287 | object_type = ObjectType.SUBMISSION 288 | else: 289 | log.error(f"Invalid type: {args.type}") 290 | sys.exit(2) 291 | 292 | ignore_ids = [] 293 | if args.ignore_ids is not None: 294 | for id_range in args.ignore_ids.split(","): 295 | start_id, end_id = id_range.split("-") 296 | ignore_ids.append((utils.base36decode(start_id), utils.base36decode(end_id))) 297 | 298 | discord_logging.init_discord_logging( 299 | section_name="Watchful12", 300 | log_level=logging.WARNING 301 | ) 302 | 303 | if args.pushshift is not None: 304 | log.warning(f"Saving pushshift token: {args.pushshift}") 305 | save_pushshift_token(args.pushshift) 306 | 307 | merge_and_backfill( 308 | start_date, 309 | end_date, 310 | input_folders, 311 | args.output, 312 | object_type, 313 | ignore_ids, 314 | "Watchful12", 315 | get_pushshift_token 316 | ) 317 | -------------------------------------------------------------------------------- /personal/combine/merge_minutes.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import requests 3 | import time 4 | import discord_logging 5 | import argparse 6 | import os 7 | import re 8 | import zstandard 9 | from datetime import datetime, timedelta 10 | import json 11 | import praw 12 | from praw import endpoints 13 | import prawcore 14 | import logging.handlers 15 | 16 | sys.path.append('personal') 17 | 18 | log = discord_logging.init_logging(debug=False) 19 | 20 | import utils 21 | import classes 22 | from classes import IngestType 23 | from merge import ObjectType 24 | 25 | 26 | NEWLINE_ENCODED = "\n".encode('utf-8') 27 | reg = re.compile(r"\d\d-\d\d-\d\d_\d\d-\d\d") 28 | 29 | 30 | def end_of_day(input_minute): 31 | return input_minute.replace(hour=0, minute=0, second=0) + timedelta(days=1) 32 | 33 | 34 | def build_day(day_to_process, input_folders, output_folder, object_type): 35 | file_type = "comments" if object_type == ObjectType.COMMENT else "submissions" 36 | 37 | file_minutes = {} 38 | minute_iterator = day_to_process - timedelta(minutes=2) 39 | end_time = end_of_day(day_to_process) + 
timedelta(minutes=2)
40 | 	while minute_iterator <= end_time:
41 | 		file_minutes[minute_iterator] = []
42 | 		minute_iterator += timedelta(minutes=1)
43 | 
44 | 	for merge_folder, ingest_type in input_folders:
45 | 		merge_date_folder = os.path.join(merge_folder, file_type, day_to_process.strftime('%y-%m-%d'))
46 | 		if os.path.exists(merge_date_folder):
47 | 			for file in os.listdir(merge_date_folder):
48 | 				match = reg.search(file)
49 | 				if not match:
50 | 					log.info(f"File doesn't match regex: {file}")
51 | 					continue
52 | 				file_date = datetime.strptime(match.group(), '%y-%m-%d_%H-%M')
53 | 				if file_date in file_minutes:
54 | 					file_minutes[file_date].append((os.path.join(merge_date_folder, file), ingest_type))
55 | 
56 | 	objects = classes.ObjectDict(day_to_process, day_to_process + timedelta(days=1) - timedelta(seconds=1), object_type)
57 | 	unmatched_field = False
58 | 	minute_iterator = day_to_process - timedelta(minutes=2)
59 | 	working_lowest_minute = day_to_process
60 | 	last_minute_of_day = end_of_day(day_to_process) - timedelta(minutes=1)
61 | 	while minute_iterator <= end_time:
62 | 		for ingest_file, ingest_type in file_minutes[minute_iterator]:
63 | 			for obj in utils.read_obj_zst(ingest_file):
64 | 				if objects.add_object(obj, ingest_type):
65 | 					unmatched_field = True
66 | 		log.info(f"Loaded {minute_iterator.strftime('%y-%m-%d_%H-%M')} : {objects.get_counts_string_by_minute(minute_iterator, [IngestType.INGEST, IngestType.DOWNLOAD])}")
67 | 
68 | 		if minute_iterator >= end_time or objects.count_minutes() >= 11:
69 | 			if minute_iterator > last_minute_of_day:
70 | 				working_highest_minute = last_minute_of_day
71 | 			else:
72 | 				working_highest_minute = minute_iterator - timedelta(minutes=1)
73 | 
74 | 			objects.delete_objects_below_minute(working_lowest_minute)
75 | 			while working_lowest_minute <= working_highest_minute:
76 | 				folder = os.path.join(output_folder, file_type, working_lowest_minute.strftime('%y-%m-%d'))
77 | 				if not os.path.exists(folder):
78 | 					os.makedirs(folder)
79 | 				output_path = os.path.join(folder, f"{('RC' if object_type == ObjectType.COMMENT else 'RS')}_{working_lowest_minute.strftime('%y-%m-%d_%H-%M')}.zst")
80 | 				output_handle = zstandard.ZstdCompressor().stream_writer(open(output_path, 'wb'))
81 | 
82 | 				for obj in objects.by_minute[working_lowest_minute].obj_list:
83 | 					output_handle.write(json.dumps(obj, sort_keys=True).encode('utf-8'))
84 | 					output_handle.write(NEWLINE_ENCODED)
85 | 					objects.delete_object_id(obj['id'])
86 | 				log.info(f"Wrote up to {working_lowest_minute.strftime('%y-%m-%d_%H-%M')}")
87 | 				output_handle.close()
88 | 				working_lowest_minute += timedelta(minutes=1)
89 | 
90 | 			objects.rebuild_minute_dict()
91 | 
92 | 		discord_logging.flush_discord()
93 | 		if unmatched_field:
94 | 			log.info(f"Unmatched field, aborting")
95 | 			sys.exit(1)
96 | 
97 | 		minute_iterator += timedelta(minutes=1)
98 | 
99 | 	log.info(f"Finished day {day_to_process.strftime('%y-%m-%d')}: {objects.get_counts_string()}")
100 | 
101 | 
102 | if __name__ == "__main__":
103 | 	parser = argparse.ArgumentParser(description="Combine two ingest files")
104 | 	parser.add_argument("--type", help="The object type, either comments or submissions", required=True)
105 | 	parser.add_argument("--start_date", help="The start of the date range to process, format YY-MM-DD_HH-MM", required=True)
106 | 	parser.add_argument("--end_date", help="The end of the date range to process, format YY-MM-DD. 
If not provided, the script processes to the end of the day") 107 | parser.add_argument('--input', help='Input folder', required=True) 108 | parser.add_argument('--output', help='Output folder', required=True) 109 | parser.add_argument("--debug", help="Enable debug logging", action='store_const', const=True, default=False) 110 | args = parser.parse_args() 111 | 112 | if args.debug: 113 | discord_logging.set_level(logging.DEBUG) 114 | 115 | if args.start_date is None: 116 | log.error(f"No start date provided") 117 | sys.exit(2) 118 | start_date = datetime.strptime(args.start_date, '%y-%m-%d_%H-%M') 119 | end_date = end_of_day(start_date) 120 | if args.end_date is not None: 121 | end_date = datetime.strptime(args.end_date, '%y-%m-%d') 122 | 123 | input_folders = [ 124 | (os.path.join(args.input, "combined"), IngestType.INGEST), 125 | (os.path.join(args.input, "download"), IngestType.DOWNLOAD), 126 | ] 127 | 128 | for input_folder, ingest_type in input_folders: 129 | log.info(f"Input folder: {input_folder}") 130 | log.info(f"Output folder: {args.output}") 131 | 132 | object_type = None 133 | if args.type == "comments": 134 | object_type = ObjectType.COMMENT 135 | elif args.type == "submissions": 136 | object_type = ObjectType.SUBMISSION 137 | else: 138 | log.error(f"Invalid type: {args.type}") 139 | sys.exit(2) 140 | 141 | while start_date <= end_date: 142 | build_day(start_date, input_folders, args.output, object_type) 143 | start_date = end_of_day(start_date) 144 | -------------------------------------------------------------------------------- /personal/compression/extract_file.py: -------------------------------------------------------------------------------- 1 | import utils 2 | import discord_logging 3 | import os 4 | import sys 5 | from datetime import datetime 6 | 7 | log = discord_logging.init_logging() 8 | 9 | 10 | if __name__ == "__main__": 11 | input_file_path = r"\\MYCLOUDPR4100\Public\reddit_final\curiousdrive_submissions.zst" 12 | output_file_path = r"\\MYCLOUDPR4100\Public\reddit_final\curiousdrive_submissions.txt" 13 | file_size = os.stat(input_file_path).st_size 14 | 15 | file_lines = 0 16 | file_bytes_processed = 0 17 | created = None 18 | inserts = [] 19 | output_file = open(output_file_path, 'w') 20 | for obj, line, file_bytes_processed in utils.read_obj_zst_meta(input_file_path): 21 | created = datetime.utcfromtimestamp(int(obj['created_utc'])) 22 | file_lines += 1 23 | output_file.write(line) 24 | output_file.write("\n") 25 | if file_lines % 100000 == 0: 26 | log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines:,} : {(file_bytes_processed / file_size) * 100:.0f}%") 27 | 28 | log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines:,} : 100%") 29 | output_file.close() 30 | 31 | -------------------------------------------------------------------------------- /personal/compression/recompress_file.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import zstandard 3 | import discord_logging 4 | import time 5 | import sys 6 | 7 | sys.path.append('personal') 8 | 9 | log = discord_logging.init_logging() 10 | 11 | import utils 12 | 13 | if __name__ == '__main__': 14 | parser = argparse.ArgumentParser(description="Take all the zst files in the input folder, extract them and compress them again at the ratio specified") 15 | parser.add_argument("input", help="The input file") 16 | parser.add_argument("output", help="The output file") 17 | parser.add_argument("--level", help="The compression ratio to 
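# Hedged usage sketch for merge_minutes.py above; the paths and dates are placeholders.
# The script expects "combined" and "download" subfolders under --input, as wired up in
# its __main__ block, and writes one zst file per minute under <output>/<type>/<yy-mm-dd>/:
#
#   python3 personal/combine/merge_minutes.py --type comments \
#       --start_date 23-06-01_00-00 --end_date 23-06-30 \
#       --input /data/ingest --output /data/merged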
output at", default="3") 18 | args = parser.parse_args() 19 | 20 | log.info(f"Input file {args.input}") 21 | log.info(f"Output file {args.output}") 22 | 23 | total_objects = 0 24 | total_bytes = 0 25 | for obj, line, _ in utils.read_obj_zst_meta(args.input): 26 | total_bytes += len(line.encode('utf-8')) 27 | total_bytes += 1 28 | 29 | total_objects += 1 30 | if total_objects % 1000000 == 0: 31 | log.info(f"{total_objects:,} : {total_bytes:,}") 32 | 33 | log.info(f"{total_objects:,} : {total_bytes:,}") 34 | 35 | for threads in range(-1, 21): 36 | decompressor = zstandard.ZstdDecompressor(max_window_size=2**31) 37 | compressor = zstandard.ZstdCompressor(level=22, write_content_size=True, write_checksum=True, threads=threads) 38 | start_time = time.time() 39 | with open(args.input, 'rb') as input_handle, open(args.output, "wb") as output_handle: 40 | compression_reader = decompressor.stream_reader(input_handle) 41 | read_count, write_count = compressor.copy_stream(compression_reader, output_handle, size=total_bytes) 42 | seconds = time.time() - start_time 43 | 44 | log.info(f"{read_count:,} to {write_count:,} in {seconds:,.2f} with {threads} threads") 45 | 46 | # compressed_bytes_read += file_size 47 | # uncompressed_bytes_read += read_count 48 | # bytes_written += write_count 49 | # log.info(f"{files_read:,}/{len(files):,} : {(compressed_bytes_read / (2**30)):.2f} gb of {(total_size / (2**30)):.2f} gb compressed to {(bytes_written / (2**30)):.2f} gb : {bytes_written /compressed_bytes_read:.3f}") 50 | -------------------------------------------------------------------------------- /personal/compression/recompress_folder.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import zstandard 3 | import os 4 | import logging.handlers 5 | 6 | log = logging.getLogger("bot") 7 | log.setLevel(logging.INFO) 8 | log_formatter = logging.Formatter('%(asctime)s - %(levelname)s: %(message)s') 9 | log_str_handler = logging.StreamHandler() 10 | log_str_handler.setFormatter(log_formatter) 11 | log.addHandler(log_str_handler) 12 | if not os.path.exists("logs"): 13 | os.makedirs("logs") 14 | log_file_handler = logging.handlers.RotatingFileHandler(os.path.join("logs", "bot.log"), maxBytes=1024*1024*16, backupCount=5) 15 | log_file_handler.setFormatter(log_formatter) 16 | log.addHandler(log_file_handler) 17 | 18 | if __name__ == '__main__': 19 | parser = argparse.ArgumentParser(description="Take all the zst files in the input folder, extract them and compress them again at the ratio specified") 20 | parser.add_argument("input", help="The input folder to read files from") 21 | parser.add_argument("output", help="The output folder to write files to") 22 | parser.add_argument("--level", help="The compression ratio to output at", default="3") 23 | args = parser.parse_args() 24 | 25 | log.info(f"Reading all files from {args.input}") 26 | 27 | files = [] 28 | total_size = 0 29 | for file_name in os.listdir(args.input): 30 | file_path = os.path.join(args.input, file_name) 31 | if file_name.endswith(".zst") and os.path.isfile(file_path): 32 | file_size = os.stat(file_path).st_size 33 | total_size += file_size 34 | files.append((file_name, file_size)) 35 | if len(files) % 1000 == 0: 36 | log.info(f"Loaded {len(files)} files") 37 | log.info(f"Loaded {len(files)} files of total size {total_size:,}") 38 | 39 | level = int(args.level) 40 | log.info(f"Writing files out to {args.output} at ratio {level}") 41 | if not os.path.exists(args.output): 42 | 
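# Hedged usage sketch for recompress_file.py above (file paths are placeholders).
# The script streams the input once to measure its uncompressed size, then re-compresses
# it at level 22 for every thread count from -1 to 20, logging the time taken for each:
#
#   python3 personal/compression/recompress_file.py RC_2023-06.zst RC_2023-06_level22.zst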
os.makedirs(args.output) 43 | 44 | compressed_bytes_read = 0 45 | uncompressed_bytes_read = 0 46 | bytes_written = 0 47 | files_read = 0 48 | 49 | decompressor = zstandard.ZstdDecompressor(max_window_size=2**31) 50 | compressor = zstandard.ZstdCompressor(level=level, threads=-1) 51 | for file_name, file_size in files: 52 | input_path = os.path.join(args.input, file_name) 53 | output_path = os.path.join(args.output, file_name) 54 | with open(input_path, 'rb') as input_handle, open(output_path, "wb") as output_handle: 55 | compression_reader = decompressor.stream_reader(input_handle) 56 | read_count, write_count = compressor.copy_stream(compression_reader, output_handle) 57 | 58 | compressed_bytes_read += file_size 59 | uncompressed_bytes_read += read_count 60 | bytes_written += write_count 61 | files_read += 1 62 | log.info(f"{files_read:,}/{len(files):,} : {(compressed_bytes_read / (2**30)):.2f} gb of {(total_size / (2**30)):.2f} gb compressed to {(bytes_written / (2**30)):.2f} gb : {bytes_written /compressed_bytes_read:.3f}") 63 | -------------------------------------------------------------------------------- /personal/compression/recompress_folder_multiprocess.py: -------------------------------------------------------------------------------- 1 | # this script iterates through zst compressed ndjson files, like the pushshift reddit dumps, loads each line 2 | # and if it matches the criteria in the command line arguments, it's written out into a separate file for 3 | # that month. After all the ndjson files are processed, it iterates through the resulting files and combines 4 | # them into a final file. 5 | 6 | # this script assumes the files are named in chronological order and prefixed with RS_ or RC_, like the pushshift dumps 7 | 8 | # features: 9 | # - multiple processes in parallel to maximize drive read and decompression 10 | # - saves state as it completes each file and picks up where it stopped 11 | # - detailed progress indicators 12 | 13 | # examples: 14 | # - get all comments that have a subreddit field (subreddit is the default) of "wallstreetbets". This will create a single output file "wallstreetbets_comments.zst" in the folder the script is run in 15 | # python3 combine_folder_multiprocess.py reddit/comments --value wallstreetbets 16 | # - get all comments and submissions (assuming both types of dump files are under the reddit folder) that have an author field of Watchful1 or spez and output the results to a folder called pushshift. 
17 | # This will result in four files, pushshift/Watchful1_comments, pushshift/Watchful1_submissions, pushshift/spez_comments, pushshift/spez_submissions 18 | # python3 combine_folder_multiprocess.py reddit --field author --value Watchful1,spez --output pushshift 19 | 20 | import zstandard 21 | import os 22 | import json 23 | import sys 24 | import time 25 | import argparse 26 | import re 27 | from collections import defaultdict 28 | import logging.handlers 29 | import multiprocessing 30 | from enum import Enum 31 | 32 | 33 | # sets up logging to the console as well as a file 34 | log = logging.getLogger("bot") 35 | log.setLevel(logging.INFO) 36 | log_formatter = logging.Formatter('%(asctime)s - %(levelname)s: %(message)s') 37 | 38 | log_str_handler = logging.StreamHandler() 39 | log_str_handler.setFormatter(log_formatter) 40 | log.addHandler(log_str_handler) 41 | if not os.path.exists("logs"): 42 | os.makedirs("logs") 43 | log_file_handler = logging.handlers.RotatingFileHandler( 44 | os.path.join("logs", "bot.log"), maxBytes=1024*1024*16, backupCount=5) 45 | log_file_handler.setFormatter(log_formatter) 46 | log.addHandler(log_file_handler) 47 | 48 | 49 | # convenience object used to pass status information between processes 50 | class FileConfig: 51 | def __init__(self, input_path, output_path, complete=False, uncompressed_size=None, new_compressed_size=None, total_lines=None): 52 | self.input_path = input_path 53 | self.output_path = output_path 54 | self.complete = complete 55 | self.error_message = None 56 | 57 | self.old_compressed_size = os.stat(input_path).st_size 58 | self.uncompressed_size = uncompressed_size 59 | self.new_compressed_size = new_compressed_size 60 | 61 | self.total_lines = total_lines 62 | 63 | def __str__(self): 64 | return f"{self.input_path} : {self.output_path} : {self.complete} : {self.old_compressed_size} - {self.uncompressed_size} - {self.new_compressed_size}" 65 | 66 | 67 | def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0): 68 | chunk = reader.read(chunk_size) 69 | bytes_read += len(chunk) 70 | if previous_chunk is not None: 71 | chunk = previous_chunk + chunk 72 | try: 73 | return chunk.decode(), bytes_read 74 | except UnicodeDecodeError: 75 | if bytes_read > max_window_size: 76 | raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes") 77 | return read_and_decode(reader, chunk_size, max_window_size, chunk, bytes_read) 78 | 79 | 80 | def count_lines_bytes(file_name): 81 | count_lines = 0 82 | uncompressed_bytes = 0 83 | with open(file_name, 'rb') as file_handle: 84 | buffer = '' 85 | reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle) 86 | 87 | while True: 88 | chunk, chunk_bytes = read_and_decode(reader, 2**27, (2**29) * 2) 89 | uncompressed_bytes += chunk_bytes 90 | if not chunk: 91 | break 92 | lines = (buffer + chunk).split("\n") 93 | count_lines += len(lines) - 1 94 | 95 | buffer = lines[-1] 96 | reader.close() 97 | return count_lines, uncompressed_bytes 98 | 99 | 100 | class Queue: 101 | def __init__(self, max_size): 102 | self.list = [] 103 | self.max_size = max_size 104 | 105 | def put(self, item): 106 | if len(self.list) >= self.max_size: 107 | self.list.pop(0) 108 | self.list.append(item) 109 | 110 | def peek(self): 111 | return self.list[0] if len(self.list) > 0 else None 112 | 113 | 114 | # save file information and progress to a json file 115 | # we don't want to save the whole FileConfig object, since some info resets if we restart 116 | def 
save_file_list(input_files, working_folder, status_json, arg_string, script_type): 117 | if not os.path.exists(working_folder): 118 | os.makedirs(working_folder) 119 | simple_file_list = [] 120 | for file in input_files: 121 | simple_file_list.append([file.input_path, file.output_path, file.complete, file.uncompressed_size, file.new_compressed_size, file.total_lines]) 122 | with open(status_json, 'w') as status_json_file: 123 | output_dict = { 124 | "args": arg_string, 125 | "type": script_type, 126 | "files": simple_file_list, 127 | } 128 | status_json_file.write(json.dumps(output_dict, indent=4)) 129 | 130 | 131 | # load file information from the json file and recalculate file sizes 132 | def load_file_list(status_json): 133 | if os.path.exists(status_json): 134 | with open(status_json, 'r') as status_json_file: 135 | output_dict = json.load(status_json_file) 136 | input_files = [] 137 | for simple_file in output_dict["files"]: 138 | input_files.append( 139 | FileConfig(simple_file[0], simple_file[1], simple_file[2], simple_file[3], simple_file[4], simple_file[5]) 140 | ) 141 | return input_files, output_dict["args"], output_dict["type"] 142 | else: 143 | return None, None, None 144 | 145 | 146 | # base of each separate process. Loads a file, iterates through lines and writes out 147 | # the ones where the `field` of the object matches `value`. Also passes status 148 | # information back to the parent via a queue 149 | def process_file(file, queue, threads, level): 150 | queue.put(file) 151 | file.total_lines, file.uncompressed_size = count_lines_bytes(file.input_path) 152 | queue.put(file) 153 | 154 | try: 155 | decompressor = zstandard.ZstdDecompressor(max_window_size=2**31) 156 | compressor = zstandard.ZstdCompressor(level=level, write_content_size=True, write_checksum=True, threads=threads) 157 | with open(file.input_path, 'rb') as input_handle, open(file.output_path, "wb") as output_handle: 158 | compression_reader = decompressor.stream_reader(input_handle) 159 | read_count, file.new_compressed_size = compressor.copy_stream(compression_reader, output_handle, size=file.uncompressed_size) 160 | file.complete = True 161 | except Exception as err: 162 | file.error_message = str(err) 163 | #log.info(f"{read_count:,} to {write_count:,} in {seconds:,.2f} with {threads} threads") 164 | queue.put(file) 165 | 166 | 167 | if __name__ == '__main__': 168 | parser = argparse.ArgumentParser(description="Use multiple processes to recompress zst files in a folder") 169 | parser.add_argument("input", help="The input folder to read files from") 170 | parser.add_argument("output", help="Put the output files in this folder") 171 | parser.add_argument("--level", help="The compression ratio to output at. 
From 0 to 22", default=22, type=int) 172 | parser.add_argument("--working", help="The folder to store temporary files in", default="pushshift_working") 173 | parser.add_argument("--processes", help="Number of processes to use", default=4, type=int) 174 | parser.add_argument("--threads", help="Number of threads per process", default=0, type=int) 175 | parser.add_argument("--debug", help="Enable debug logging", action='store_const', const=True, default=False) 176 | script_type = "compress" 177 | 178 | args = parser.parse_args() 179 | arg_string = f"{args.input}:{args.output}:{args.level}" 180 | 181 | if args.debug: 182 | log.setLevel(logging.DEBUG) 183 | 184 | log.info(f"Loading files from: {args.input}") 185 | log.info(f"Writing output to: {args.output}") 186 | 187 | multiprocessing.set_start_method('spawn') 188 | queue = multiprocessing.Manager().Queue() 189 | status_json = os.path.join(args.working, "status.json") 190 | input_files, saved_arg_string, saved_type = load_file_list(status_json) 191 | if saved_arg_string and saved_arg_string != arg_string: 192 | log.warning(f"Args don't match args from json file. Delete working folder") 193 | sys.exit(0) 194 | 195 | if saved_type and saved_type != script_type: 196 | log.warning(f"Script type doesn't match type from json file. Delete working folder") 197 | sys.exit(0) 198 | 199 | # if the file list wasn't loaded from the json, this is the first run, find what files we need to process 200 | if input_files is None: 201 | input_files = [] 202 | for file_name in os.listdir(args.input): 203 | input_path = os.path.join(args.input, file_name) 204 | if os.path.isfile(input_path) and file_name.endswith(".zst"): 205 | output_path = os.path.join(args.output, file_name) 206 | input_files.append(FileConfig(input_path, output_path=output_path)) 207 | 208 | save_file_list(input_files, args.working, status_json, arg_string, script_type) 209 | else: 210 | log.info(f"Existing input file was read, if this is not correct you should delete the {args.working} folder and run this script again") 211 | 212 | files_processed, total_old_bytes, processed_old_bytes, processed_uncompressed_bytes, processed_new_bytes, processed_lines = 0, 0, 0, 0, 0, 0 213 | files_to_process = [] 214 | # calculate the total file size for progress reports, build a list of incomplete files to process 215 | # do this largest to smallest by file size so that we aren't processing a few really big files with only a few threads at the end 216 | for file in sorted(input_files, key=lambda item: item.old_compressed_size, reverse=True): 217 | total_old_bytes += file.old_compressed_size 218 | if file.complete: 219 | files_processed += 1 220 | processed_old_bytes += file.old_compressed_size 221 | processed_uncompressed_bytes += file.uncompressed_size 222 | processed_new_bytes += file.new_compressed_size 223 | processed_lines += file.total_lines 224 | else: 225 | files_to_process.append(file) 226 | 227 | log.info(f"Processed {files_processed} of {len(input_files)} files with {(processed_old_bytes / (2**30)):.2f} of {(total_old_bytes / (2**30)):.2f} gigabytes") 228 | 229 | start_time = time.time() 230 | if len(files_to_process): 231 | progress_queue = Queue(40) 232 | progress_queue.put([start_time, processed_old_bytes]) 233 | speed_queue = Queue(40) 234 | # start the workers 235 | with multiprocessing.Pool(processes=min(args.processes, len(files_to_process))) as pool: 236 | workers = pool.starmap_async(process_file, [(file, queue, args.threads, args.level) for file in files_to_process], chunksize=1, 
error_callback=log.info) 237 | while not workers.ready() or not queue.empty(): 238 | # loop until the workers are all done, pulling in status messages as they are sent 239 | file_update = queue.get() 240 | if file_update.error_message is not None: 241 | log.warning(f"File failed {file_update.input_path}: {file_update.error_message}") 242 | 243 | # this is the workers telling us they are starting a new file, print the debug message but nothing else 244 | if not file_update.complete: 245 | if file_update.uncompressed_size is not None: 246 | log.debug(f"Calculated uncompressed size: {file_update.input_path} : {file_update.uncompressed_size:,}") 247 | else: 248 | log.debug(f"Starting file: {file_update.input_path} : {file_update.old_compressed_size:,}") 249 | continue 250 | 251 | # I'm going to assume that the list of files is short enough that it's no 252 | # big deal to just iterate each time since that saves a bunch of work 253 | files_processed, processed_old_bytes, processed_uncompressed_bytes, processed_new_bytes, processed_lines, files_errored, i = 0, 0, 0, 0, 0, 0, 0 254 | for file in input_files: 255 | if file.input_path == file_update.input_path: 256 | input_files[i] = file_update 257 | file = file_update 258 | if file.complete: 259 | processed_old_bytes += file.old_compressed_size 260 | processed_uncompressed_bytes += file.uncompressed_size if file.uncompressed_size is not None else 0 261 | processed_new_bytes += file.new_compressed_size if file.new_compressed_size is not None else 0 262 | processed_lines += file.total_lines if file.total_lines is not None else 0 263 | files_processed += 1 if file.complete or file.error_message is not None else 0 264 | files_errored += 1 if file.error_message is not None else 0 265 | i += 1 266 | if file_update.complete or file_update.error_message is not None: 267 | save_file_list(input_files, args.working, status_json, arg_string, script_type) 268 | log.debug(f"Finished file: {file_update.input_path}") 269 | current_time = time.time() 270 | progress_queue.put([current_time, processed_old_bytes]) 271 | 272 | first_time, first_bytes = progress_queue.peek() 273 | bytes_per_second = int((processed_old_bytes - first_bytes)/(current_time - first_time)) 274 | speed_queue.put(bytes_per_second) 275 | seconds_left = int((total_old_bytes - processed_old_bytes) / int(sum(speed_queue.list) / len(speed_queue.list))) 276 | minutes_left = int(seconds_left / 60) 277 | hours_left = int(minutes_left / 60) 278 | days_left = int(hours_left / 24) 279 | 280 | log.info( 281 | f"{(processed_old_bytes / (2**30)):.3f} gb at {(bytes_per_second / (2**20)):,.2f} mb/s, {(processed_old_bytes / total_old_bytes) * 100:.0f}% : " 282 | f"{(processed_uncompressed_bytes / (2**30)):.3f} gb uncompressed down to {(processed_new_bytes / (2**30)):.3f} gb compressed : " 283 | f"{(processed_old_bytes / processed_uncompressed_bytes):.3f} old ratio : {(processed_new_bytes / processed_uncompressed_bytes):.3f} new ratio : {(processed_new_bytes / processed_old_bytes):.3f} difference : " 284 | f"{files_processed}({files_errored})/{len(input_files)} files : " 285 | f"{(str(days_left) + 'd ' if days_left > 0 else '')}{hours_left - (days_left * 24)}:{minutes_left - (hours_left * 60):02}:{seconds_left - (minutes_left * 60):02} remaining : " 286 | f"{first_time}:{first_bytes}:{current_time}:{processed_old_bytes}:{processed_uncompressed_bytes}:{processed_new_bytes}:{total_old_bytes}:{int(sum(speed_queue.list))}:{len(speed_queue.list)}") 287 | 288 | log.info(f"{(processed_old_bytes / (2**30)):.2f} gb, 
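# Hedged usage sketch for recompress_folder_multiprocess.py (folder names are placeholders).
# Progress is checkpointed to <working>/status.json, so an interrupted run restarted with
# the same arguments skips any files already marked complete:
#
#   python3 personal/compression/recompress_folder_multiprocess.py \
#       /data/reddit/comments /data/reddit/comments_recompressed \
#       --level 22 --processes 4 --threads 0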
{(processed_old_bytes / total_old_bytes) * 100:.0f}% : {files_processed}/{len(input_files)}") 289 | -------------------------------------------------------------------------------- /personal/diagnostic/comments_per_day.py: -------------------------------------------------------------------------------- 1 | import utils 2 | import discord_logging 3 | from datetime import datetime 4 | 5 | log = discord_logging.init_logging() 6 | 7 | 8 | if __name__ == "__main__": 9 | day = None 10 | day_comments = 0 11 | for comment in utils.read_obj_zst(r"\\MYCLOUDPR4100\Public\reddit\subreddits23\antiwork_comments.zst"): 12 | created_day = datetime.utcfromtimestamp(int(comment['created_utc'])).strftime("%y-%m-%d") 13 | if day is None: 14 | day = created_day 15 | if day != created_day: 16 | log.info(f"{day} {day_comments}") 17 | day_comments = 0 18 | day = created_day 19 | day_comments += 1 20 | 21 | log.info(f"{day} {day_comments}") 22 | -------------------------------------------------------------------------------- /personal/diagnostic/comments_per_day_with_score.py: -------------------------------------------------------------------------------- 1 | import utils 2 | import discord_logging 3 | from datetime import datetime 4 | 5 | log = discord_logging.init_logging() 6 | 7 | 8 | if __name__ == "__main__": 9 | day = None 10 | day_comments, day_comments_with_score = 0, 0 11 | for comment in utils.read_obj_zst(r"\\MYCLOUDPR4100\Public\reddit\subreddits23\antiwork_comments.zst"): 12 | created_day = datetime.utcfromtimestamp(int(comment['created_utc'])).strftime("%y-%m-%d") 13 | if day is None: 14 | day = created_day 15 | if day != created_day: 16 | log.info(f"{day} {day_comments} {day_comments_with_score} {int((day_comments_with_score / day_comments) * 100):.2}%") 17 | day_comments, day_comments_with_score = 0, 0 18 | day = created_day 19 | day_comments += 1 20 | if comment['score'] != 1: 21 | day_comments_with_score += 1 22 | 23 | log.info(f"{day} {day_comments} {day_comments_with_score} {int((day_comments_with_score / day_comments) * 100):.2}%") 24 | -------------------------------------------------------------------------------- /personal/diagnostic/compare_lines.py: -------------------------------------------------------------------------------- 1 | import utils 2 | import discord_logging 3 | import os 4 | import sys 5 | from datetime import datetime 6 | 7 | log = discord_logging.init_logging() 8 | 9 | 10 | if __name__ == "__main__": 11 | file_one = open(r"\\MYCLOUDPR4100\Public\reddit_final\RelationshipsOver35_comments_dump.txt", 'r') 12 | file_two = open(r"\\MYCLOUDPR4100\Public\reddit_final\RelationshipsOver35_comments_mongo.txt", 'r') 13 | 14 | file_lines = 0 15 | while True: 16 | file_lines += 1 17 | line_one = file_one.readline().rstrip() 18 | line_two = file_two.readline().rstrip() 19 | if line_one != line_two: 20 | log.info(f"lines not matching: {file_lines}") 21 | log.info(line_one) 22 | log.info(line_two) 23 | #break 24 | 25 | if file_lines % 100000 == 0: 26 | log.info(f"{file_lines:,}") 27 | 28 | if not line_one: 29 | break 30 | 31 | log.info(f"{file_lines:,}") 32 | file_one.close() 33 | file_two.close() 34 | -------------------------------------------------------------------------------- /personal/diagnostic/count_fields.py: -------------------------------------------------------------------------------- 1 | import zstandard 2 | import os 3 | import json 4 | import sys 5 | from datetime import datetime 6 | import logging.handlers 7 | from collections import defaultdict 8 | 9 | 10 | log = 
logging.getLogger("bot") 11 | log.setLevel(logging.DEBUG) 12 | log.addHandler(logging.StreamHandler()) 13 | 14 | 15 | def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0): 16 | chunk = reader.read(chunk_size) 17 | bytes_read += chunk_size 18 | if previous_chunk is not None: 19 | chunk = previous_chunk + chunk 20 | try: 21 | return chunk.decode() 22 | except UnicodeDecodeError: 23 | if bytes_read > max_window_size: 24 | raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes") 25 | log.info(f"Decoding error with {bytes_read:,} bytes, reading another chunk") 26 | return read_and_decode(reader, chunk_size, max_window_size, chunk, bytes_read) 27 | 28 | 29 | def read_lines_zst(file_name): 30 | with open(file_name, 'rb') as file_handle: 31 | buffer = '' 32 | reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle) 33 | while True: 34 | chunk = read_and_decode(reader, 2**27, (2**29) * 2) 35 | if not chunk: 36 | break 37 | lines = (buffer + chunk).split("\n") 38 | for line in lines[:-1]: 39 | yield json.loads(line) 40 | buffer = lines[-1] 41 | reader.close() 42 | 43 | 44 | if __name__ == "__main__": 45 | #input_folder = r"\\MYCLOUDPR4100\Public\ingest\ingest\comments\23-06-23" 46 | input_folder = r"\\MYCLOUDPR4100\Public\reddit\comments" 47 | input_files = [] 48 | total_size = 0 49 | for subdir, dirs, files in os.walk(input_folder): 50 | for filename in files: 51 | input_path = os.path.join(subdir, filename) 52 | if input_path.endswith(".zst"): 53 | file_size = os.stat(input_path).st_size 54 | total_size += file_size 55 | input_files.append([input_path, file_size]) 56 | 57 | log.info(f"Processing {len(input_files)} files of {(total_size / (2**30)):.2f} gigabytes") 58 | 59 | total_lines = 0 60 | fields = defaultdict(lambda: defaultdict(int)) 61 | for input_file in input_files: 62 | file_lines = 0 63 | created = None 64 | for obj in read_lines_zst(input_file[0]): 65 | for key, value in obj.items(): 66 | value = str(value)[:20] 67 | fields[key][value] += 1 68 | 69 | created = datetime.utcfromtimestamp(int(obj['created_utc'])) 70 | file_lines += 1 71 | if file_lines % 100000 == 0: 72 | log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines + total_lines:,}") 73 | if file_lines >= 1000: 74 | break 75 | total_lines += file_lines 76 | log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines + total_lines:,}") 77 | 78 | sorted_fields = [] 79 | for key, values in fields.items(): 80 | total_occurrences = 0 81 | unique_values = 0 82 | examples = [] 83 | for value_name, count in values.items(): 84 | unique_values += 1 85 | total_occurrences += count 86 | if len(examples) < 3: 87 | examples.append(value_name) 88 | sorted_fields.append((total_occurrences, f"{key}: {(total_occurrences / total_lines) * 100:.2f} : {unique_values:,} : {','.join(examples)}")) 89 | sorted_fields.sort(key=lambda x:x[0], reverse=True) 90 | for count, string in sorted_fields: 91 | log.info(string) 92 | -------------------------------------------------------------------------------- /personal/diagnostic/count_subreddits_multiprocess.py: -------------------------------------------------------------------------------- 1 | import zstandard 2 | import os 3 | import json 4 | import sys 5 | import time 6 | import argparse 7 | import re 8 | from collections import defaultdict 9 | from datetime import datetime 10 | import logging.handlers 11 | import multiprocessing 12 | 13 | 14 | # sets up logging to the console as well as a file 15 | 
log = logging.getLogger("bot") 16 | log.setLevel(logging.INFO) 17 | log_formatter = logging.Formatter('%(asctime)s - %(levelname)s: %(message)s') 18 | 19 | log_stderr_handler = logging.StreamHandler() 20 | log_stderr_handler.setFormatter(log_formatter) 21 | log.addHandler(log_stderr_handler) 22 | if not os.path.exists("logs"): 23 | os.makedirs("logs") 24 | log_file_handler = logging.handlers.RotatingFileHandler( 25 | os.path.join("logs", "bot.log"), maxBytes=1024*1024*16, backupCount=5) 26 | log_file_handler.setFormatter(log_formatter) 27 | log.addHandler(log_file_handler) 28 | 29 | 30 | # convenience object used to pass status information between processes 31 | class FileConfig: 32 | def __init__(self, input_path, output_path=None, complete=False, lines_processed=0, error_lines=0, count_file_path=None): 33 | self.input_path = input_path 34 | self.output_path = output_path 35 | self.count_file_path = count_file_path 36 | self.file_size = os.stat(input_path).st_size 37 | self.complete = complete 38 | self.bytes_processed = self.file_size if complete else 0 39 | self.lines_processed = lines_processed if complete else 0 40 | self.error_message = None 41 | self.error_lines = error_lines 42 | 43 | def __str__(self): 44 | return f"{self.input_path} : {self.output_path} : {self.file_size} : {self.complete} : {self.bytes_processed} : {self.lines_processed}" 45 | 46 | 47 | # used for calculating running average of read speed 48 | class Queue: 49 | def __init__(self, max_size): 50 | self.list = [] 51 | self.max_size = max_size 52 | 53 | def put(self, item): 54 | if len(self.list) >= self.max_size: 55 | self.list.pop(0) 56 | self.list.append(item) 57 | 58 | def peek(self): 59 | return self.list[0] if len(self.list) > 0 else None 60 | 61 | 62 | # save file information and progress to a json file 63 | # we don't want to save the whole FileConfig object, since some info resets if we restart 64 | def save_file_list(input_files, working_folder, status_json, script_type, stage): 65 | if not os.path.exists(working_folder): 66 | os.makedirs(working_folder) 67 | simple_file_list = [] 68 | for file in input_files: 69 | simple_file_list.append([file.input_path, file.output_path, file.complete, file.lines_processed, file.error_lines, file.monthly_count_file]) 70 | with open(status_json, 'w') as status_json_file: 71 | output_dict = { 72 | "files": simple_file_list, 73 | "type": script_type, 74 | "stage": stage, 75 | } 76 | status_json_file.write(json.dumps(output_dict, indent=4)) 77 | 78 | 79 | # load file information from the json file and recalculate file sizes 80 | def load_file_list(status_json): 81 | if os.path.exists(status_json): 82 | with open(status_json, 'r') as status_json_file: 83 | output_dict = json.load(status_json_file) 84 | input_files = [] 85 | for simple_file in output_dict["files"]: 86 | input_files.append( 87 | FileConfig(simple_file[0], simple_file[1], simple_file[2], simple_file[3], simple_file[4], simple_file[5] if len(simple_file) > 5 else None) 88 | ) 89 | return input_files, output_dict["type"], output_dict["stage"] 90 | else: 91 | return None, None, "count" 92 | 93 | 94 | # recursively decompress and decode a chunk of bytes. 
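# A hedged illustration of the status.json written by save_file_list above (paths are
# placeholders); each entry mirrors the saved FileConfig fields
# [input_path, output_path, complete, lines_processed, error_lines, monthly count file]:
#
#   {
#       "files": [
#           ["/data/reddit/comments/RC_2023-06.zst", "pushshift_working/RC_2023-06", false, 0, 0, null]
#       ],
#       "type": "count",
#       "stage": "count"
#   }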
If there's a decode error then read another chunk and try with that, up to a limit of max_window_size bytes 95 | def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0): 96 | chunk = reader.read(chunk_size) 97 | bytes_read += chunk_size 98 | if previous_chunk is not None: 99 | chunk = previous_chunk + chunk 100 | try: 101 | return chunk.decode() 102 | except UnicodeDecodeError: 103 | if bytes_read > max_window_size: 104 | raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes") 105 | return read_and_decode(reader, chunk_size, max_window_size, chunk, bytes_read) 106 | 107 | 108 | # open a zst compressed ndjson file and yield lines one at a time 109 | # also passes back file progress 110 | def read_lines_zst(file_name): 111 | with open(file_name, 'rb') as file_handle: 112 | buffer = '' 113 | reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle) 114 | while True: 115 | chunk = read_and_decode(reader, 2**27, (2**29) * 2) 116 | if not chunk: 117 | break 118 | lines = (buffer + chunk).split("\n") 119 | 120 | for line in lines[:-1]: 121 | yield line, file_handle.tell() 122 | 123 | buffer = lines[-1] 124 | reader.close() 125 | 126 | 127 | # base of each separate process. Loads a file, iterates through lines and writes out 128 | # the ones where the `field` of the object matches `value`. Also passes status 129 | # information back to the parent via a queue 130 | def process_file(file, queue, field): 131 | output_file = None 132 | try: 133 | for line, file_bytes_processed in read_lines_zst(file.input_path): 134 | try: 135 | obj = json.loads(line) 136 | observed = obj[field].lower() 137 | if output_file is None: 138 | output_file = open(file.output_path, 'w', encoding="utf-8") 139 | output_file.write(observed) 140 | output_file.write("\n") 141 | except (KeyError, json.JSONDecodeError) as err: 142 | file.error_lines += 1 143 | file.lines_processed += 1 144 | if file.lines_processed % 1000000 == 0: 145 | file.bytes_processed = file_bytes_processed 146 | queue.put(file) 147 | 148 | if output_file is not None: 149 | output_file.close() 150 | 151 | file.complete = True 152 | file.bytes_processed = file.file_size 153 | except Exception as err: 154 | file.error_message = str(err) 155 | queue.put(file) 156 | 157 | 158 | if __name__ == '__main__': 159 | parser = argparse.ArgumentParser(description="Use multiple processes to decompress and iterate over pushshift dump files") 160 | parser.add_argument("input", help="The input folder to recursively read files from") 161 | parser.add_argument("--output", help="Name of the output file", default="field_counts") 162 | parser.add_argument("--working", help="The folder to store temporary files in", default="pushshift_working") 163 | parser.add_argument("--monthly_count_folder", help="The folder to store monthly count files in", default="pushshift_counts") 164 | parser.add_argument("--field", help="Which field to count", default="subreddit") 165 | parser.add_argument("--min_count", help="Dont write any counts below this number", default=1000, type=int) 166 | parser.add_argument("--processes", help="Number of processes to use", default=10, type=int) 167 | parser.add_argument("--file_filter", help="Regex filenames have to match to be processed", default="^rc_|rs_") 168 | parser.add_argument( 169 | "--error_rate", help= 170 | "Percentage as an integer from 0 to 100 of the lines where the field can be missing. 
For the subreddit field especially, " 171 | "there are a number of posts that simply don't have a subreddit attached", default=1, type=int) 172 | parser.add_argument("--debug", help="Enable debug logging", action='store_const', const=True, default=False) 173 | script_type = "count" 174 | 175 | args = parser.parse_args() 176 | 177 | if args.debug: 178 | log.setLevel(logging.DEBUG) 179 | 180 | log.info(f"Loading files from: {args.input}") 181 | if args.output: 182 | log.info(f"Writing output to: {args.output}") 183 | else: 184 | log.info(f"Writing output to working folder") 185 | 186 | multiprocessing.set_start_method('spawn') 187 | queue = multiprocessing.Manager().Queue() 188 | status_json = os.path.join(args.working, "status.json") 189 | input_files, saved_type, stage = load_file_list(status_json) 190 | 191 | if saved_type and saved_type != script_type: 192 | log.warning(f"Script type doesn't match type from json file. Delete working folder") 193 | sys.exit(0) 194 | 195 | if stage == "count": 196 | # if the file list wasn't loaded from the json, this is the first run, find what files we need to process 197 | if input_files is None: 198 | input_files = [] 199 | for subdir, dirs, files in os.walk(args.input): 200 | files.sort() 201 | for file_name in files: 202 | if file_name.endswith(".zst") and re.search(args.file_filter, file_name, re.IGNORECASE) is not None: 203 | input_path = os.path.join(subdir, file_name) 204 | output_path = os.path.join(args.working, file_name[:-4]) 205 | input_files.append(FileConfig(input_path, output_path=output_path)) 206 | 207 | save_file_list(input_files, args.working, status_json, script_type, "count") 208 | else: 209 | log.info(f"Existing input file was read, if this is not correct you should delete the {args.working} folder and run this script again") 210 | 211 | files_processed = 0 212 | total_bytes = 0 213 | total_bytes_processed = 0 214 | total_lines_processed = 0 215 | total_lines_errored = 0 216 | files_to_process = [] 217 | # calculate the total file size for progress reports, build a list of incomplete files to process 218 | # do this largest to smallest by file size so that we aren't processing a few really big files with only a few threads at the end 219 | for file in sorted(input_files, key=lambda item: item.file_size, reverse=True): 220 | total_bytes += file.file_size 221 | if file.complete: 222 | files_processed += 1 223 | total_lines_processed += file.lines_processed 224 | total_bytes_processed += file.file_size 225 | total_lines_errored += file.error_lines 226 | else: 227 | files_to_process.append(file) 228 | 229 | log.info(f"Processed {files_processed} of {len(input_files)} files with {(total_bytes_processed / (2**30)):.2f} of {(total_bytes / (2**30)):.2f} gigabytes") 230 | 231 | start_time = time.time() 232 | if len(files_to_process): 233 | progress_queue = Queue(40) 234 | progress_queue.put([start_time, total_lines_processed, total_bytes_processed]) 235 | speed_queue = Queue(40) 236 | for file in files_to_process: 237 | log.info(f"Processing file: {file.input_path}") 238 | # start the workers 239 | with multiprocessing.Pool(processes=min(args.processes, len(files_to_process))) as pool: 240 | workers = pool.starmap_async(process_file, [(file, queue, args.field) for file in files_to_process], chunksize=1, error_callback=log.info) 241 | while not workers.ready(): 242 | # loop until the workers are all done, pulling in status messages as they are sent 243 | file_update = queue.get() 244 | if file_update.error_message is not None: 245 | 
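# Hedged usage sketch for count_subreddits_multiprocess.py (the input folder is a
# placeholder). With the defaults it counts the "subreddit" field across every RC_*/RS_*
# zst file under the input folder, writes per-month counts to pushshift_counts/, and sums
# anything seen at least --min_count times into field_counts.txt:
#
#   python3 personal/diagnostic/count_subreddits_multiprocess.py /data/reddit \
#       --field subreddit --min_count 1000 --processes 10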
log.warning(f"File failed {file_update.input_path}: {file_update.error_message}") 246 | # I'm going to assume that the list of files is short enough that it's no 247 | # big deal to just iterate each time since that saves a bunch of work 248 | total_lines_processed = 0 249 | total_bytes_processed = 0 250 | total_lines_errored = 0 251 | files_processed = 0 252 | files_errored = 0 253 | i = 0 254 | for file in input_files: 255 | if file.input_path == file_update.input_path: 256 | input_files[i] = file_update 257 | file = file_update 258 | total_lines_processed += file.lines_processed 259 | total_bytes_processed += file.bytes_processed 260 | total_lines_errored += file.error_lines 261 | files_processed += 1 if file.complete or file.error_message is not None else 0 262 | files_errored += 1 if file.error_message is not None else 0 263 | i += 1 264 | if file_update.complete or file_update.error_message is not None: 265 | save_file_list(input_files, args.working, status_json, script_type) 266 | current_time = time.time() 267 | progress_queue.put([current_time, total_lines_processed, total_bytes_processed]) 268 | 269 | first_time, first_lines, first_bytes = progress_queue.peek() 270 | bytes_per_second = int((total_bytes_processed - first_bytes)/(current_time - first_time)) 271 | speed_queue.put(bytes_per_second) 272 | seconds_left = int((total_bytes - total_bytes_processed) / int(sum(speed_queue.list) / len(speed_queue.list))) 273 | minutes_left = int(seconds_left / 60) 274 | hours_left = int(minutes_left / 60) 275 | days_left = int(hours_left / 24) 276 | 277 | log.info( 278 | f"{total_lines_processed:,} lines at {(total_lines_processed - first_lines)/(current_time - first_time):,.0f}/s, {total_lines_errored:,} errored : " 279 | f"{(total_bytes_processed / (2**30)):.2f} gb at {(bytes_per_second / (2**20)):,.0f} mb/s, {(total_bytes_processed / total_bytes) * 100:.0f}% : " 280 | f"{files_processed}({files_errored})/{len(input_files)} files : " 281 | f"{(str(days_left) + 'd ' if days_left > 0 else '')}{hours_left - (days_left * 24)}:{minutes_left - (hours_left * 60):02}:{seconds_left - (minutes_left * 60):02} remaining") 282 | 283 | log.info(f"{total_lines_processed:,}, {total_lines_errored} errored : {(total_bytes_processed / (2**30)):.2f} gb, {(total_bytes_processed / total_bytes) * 100:.0f}% : {files_processed}/{len(input_files)}") 284 | stage = "sum" 285 | save_file_list(input_files, args.working, status_json, script_type, stage) 286 | 287 | if stage == "sum": 288 | #working_file_paths = [] 289 | count_incomplete = 0 290 | # build a list of output files to combine 291 | input_files = sorted(input_files, key=lambda item: os.path.split(item.output_path)[1]) 292 | for file in input_files: 293 | if not file.complete: 294 | if file.error_message is not None: 295 | log.info(f"File {file.input_path} errored {file.error_message}") 296 | else: 297 | log.info(f"File {file.input_path} is not marked as complete") 298 | count_incomplete += 1 299 | else: 300 | if file.error_lines > file.lines_processed * (args.error_rate * 0.01): 301 | log.info( 302 | f"File {file.input_path} has {file.error_lines:,} errored lines out of {file.lines_processed:,}, " 303 | f"{(file.error_lines / file.lines_processed) * (args.error_rate * 0.01):.2f}% which is above the limit of {args.error_rate}%") 304 | count_incomplete += 1 305 | 306 | if count_incomplete > 0: 307 | log.info(f"{count_incomplete} files were not completed, errored or don't exist, something went wrong. 
Aborting") 308 | sys.exit() 309 | 310 | log.info(f"Processing complete, combining {len(input_files)} result files") 311 | 312 | if not os.path.exists(args.monthly_count_folder): 313 | os.makedirs(args.monthly_count_folder) 314 | input_lines = 0 315 | files_counted = 0 316 | monthly_count_folder_paths = [] 317 | for file in input_files: 318 | files_counted += 1 319 | if not os.path.exists(file.output_path): 320 | log.info(f"Output file {file.output_path} does not exist, skipping") 321 | continue 322 | monthly_counts = defaultdict(int) 323 | log.info(f"Reading {files_counted}/{len(input_files)} : {input_lines:,} : {os.path.split(file.output_path)[1]}") 324 | with open(file.output_path, 'r') as input_file: 325 | for line in input_file: 326 | input_lines += 1 327 | monthly_counts[line.strip()] += 1 328 | 329 | file.monthly_count_file = os.path.join(args.monthly_count_folder, os.path.basename(file.output_path)) 330 | with open(file.monthly_count_file, 'w') as output_handle: 331 | for field, count in sorted(monthly_counts.items(), key=lambda item: item[1], reverse=True): 332 | output_handle.write(f"{field} {count}\n") 333 | 334 | log.info(f"Finished combining files into monthlies, {input_lines:,} lines read. Combining into result output") 335 | stage = "agg" 336 | save_file_list(input_files, args.working, status_json, script_type, stage) 337 | 338 | if stage == "agg": 339 | field_counts = defaultdict(int) 340 | for file in input_files: 341 | with open(file.monthly_count_file, 'r') as input_handle: 342 | for line in input_handle: 343 | field, count = line.strip().split("\t") 344 | field_counts[field] = count 345 | 346 | sorted_counts = sorted(field_counts.items(), key=lambda item: item[1], reverse=True) 347 | 348 | output_counts = 0 349 | with open(f"{args.output}.txt", 'w') as output_handle: 350 | for field, count in sorted_counts: 351 | if count >= args.min_count: 352 | output_counts += 1 353 | output_handle.write(f"{field} {count}\n") 354 | 355 | log.info(f"Finished combining files, {output_counts:,} field counts written") 356 | -------------------------------------------------------------------------------- /personal/diagnostic/get_zst_details.py: -------------------------------------------------------------------------------- 1 | import zstandard 2 | import os 3 | import json 4 | import sys 5 | import time 6 | import argparse 7 | import re 8 | from collections import defaultdict 9 | import logging.handlers 10 | import multiprocessing 11 | import utils 12 | 13 | 14 | # sets up logging to the console as well as a file 15 | log = logging.getLogger("bot") 16 | log.setLevel(logging.INFO) 17 | log_formatter = logging.Formatter('%(asctime)s - %(levelname)s: %(message)s') 18 | 19 | log_str_handler = logging.StreamHandler() 20 | log_str_handler.setFormatter(log_formatter) 21 | log.addHandler(log_str_handler) 22 | if not os.path.exists("logs"): 23 | os.makedirs("logs") 24 | log_file_handler = logging.handlers.RotatingFileHandler( 25 | os.path.join("logs", "bot.log"), maxBytes=1024*1024*16, backupCount=5) 26 | log_file_handler.setFormatter(log_formatter) 27 | log.addHandler(log_file_handler) 28 | 29 | 30 | def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0): 31 | chunk = reader.read(chunk_size) 32 | bytes_read += len(chunk) 33 | if previous_chunk is not None: 34 | chunk = previous_chunk + chunk 35 | try: 36 | return chunk.decode(), bytes_read 37 | except UnicodeDecodeError: 38 | if bytes_read > max_window_size: 39 | raise UnicodeError(f"Unable to decode frame 
after reading {bytes_read:,} bytes") 40 | return read_and_decode(reader, chunk_size, max_window_size, chunk, bytes_read) 41 | 42 | 43 | def count_lines_bytes(file_name): 44 | count_lines = 0 45 | uncompressed_bytes = 0 46 | with open(file_name, 'rb') as file_handle: 47 | buffer = '' 48 | reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle) 49 | 50 | while True: 51 | chunk, chunk_bytes = read_and_decode(reader, 2**27, (2**29) * 2) 52 | uncompressed_bytes += chunk_bytes 53 | if not chunk: 54 | break 55 | lines = (buffer + chunk).split("\n") 56 | count_lines += len(lines) - 1 57 | 58 | buffer = lines[-1] 59 | reader.close() 60 | return count_lines, uncompressed_bytes 61 | 62 | 63 | if __name__ == '__main__': 64 | input_path = r"\\MYCLOUDPR4100\Public\reddit\comments\RC_2008-03.zst" 65 | compressed_size = os.stat(input_path).st_size 66 | count_lines, uncompressed_bytes = count_lines_bytes(input_path) 67 | log.info(f"Compressed size: {compressed_size:,} : {(compressed_size / (2**30)):.2f} gb") 68 | log.info(f"Uncompressed size: {uncompressed_bytes:,} : {(uncompressed_bytes / (2**30)):.2f} gb") 69 | log.info(f"Ratio: {(uncompressed_bytes / compressed_size):.2f}") 70 | log.info(f"Lines: {count_lines:,}") 71 | -------------------------------------------------------------------------------- /personal/diagnostic/sum_subreddit_counts.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging.handlers 3 | from collections import defaultdict 4 | 5 | 6 | log = logging.getLogger("bot") 7 | log.setLevel(logging.DEBUG) 8 | log.addHandler(logging.StreamHandler()) 9 | 10 | if __name__ == '__main__': 11 | input_folder = r"\\MYCLOUDPR4100\Public\pushshift_counts_summed" 12 | output_file = r"\\MYCLOUDPR4100\Public\subreddit_counts_total.txt" 13 | subreddits = defaultdict(int) 14 | 15 | for subdir, dirs, files in os.walk(input_folder): 16 | for filename in files: 17 | log.info(f"Processing file: {filename}") 18 | input_path = os.path.join(subdir, filename) 19 | with open(input_path, 'r') as input_handle: 20 | line_count = 0 21 | for line in input_handle: 22 | subreddit, count_string = line.strip().split("\t") 23 | count = int(count_string) 24 | subreddits[subreddit] += count 25 | line_count += 1 26 | 27 | log.info(f"Total subreddits: {len(subreddits):,}") 28 | 29 | count_written = 0 30 | with open(output_file, 'w') as output_handle: 31 | for subreddit, count in sorted(subreddits.items(), key=lambda item: item[1], reverse=True): 32 | output_handle.write(f"{subreddit} {count}\n") 33 | count_written += 1 34 | if count_written % 1000000 == 0: 35 | log.info(f"Written: {count_written:,}/{len(subreddits):,}") 36 | 37 | log.info(f"Written: {count_written:,}/{len(subreddits):,}") 38 | -------------------------------------------------------------------------------- /personal/diagnostic/test_file.py: -------------------------------------------------------------------------------- 1 | import utils 2 | import discord_logging 3 | import os 4 | import sys 5 | from datetime import datetime 6 | 7 | log = discord_logging.init_logging() 8 | 9 | 10 | if __name__ == "__main__": 11 | input_path = r"\\MYCLOUDPR4100\Public\reddit\submissions\RS_2023-04.zst" 12 | 13 | input_file_paths = [] 14 | if os.path.isdir(input_path): 15 | for subdir, dirs, files in os.walk(input_path): 16 | files.sort() 17 | for file_name in files: 18 | if file_name.endswith(".zst"): 19 | input_file_paths.append(os.path.join(subdir, file_name)) 20 | else: 21 | 
input_file_paths.append(input_path) 22 | 23 | files_processed = 0 24 | for file_path in input_file_paths: 25 | file_name = os.path.basename(file_path) 26 | file_size = os.stat(file_path).st_size 27 | file_lines = 0 28 | file_bytes_processed = 0 29 | created = None 30 | previous_timestamp = None 31 | inserts = [] 32 | for obj, line, file_bytes_processed in utils.read_obj_zst_meta(file_path): 33 | new_timestamp = int(obj['created_utc']) 34 | created = datetime.utcfromtimestamp(new_timestamp) 35 | if previous_timestamp is not None and previous_timestamp - (2) > new_timestamp: 36 | log.warning(f"Out of order timestamps {datetime.utcfromtimestamp(previous_timestamp).strftime('%Y-%m-%d %H:%M:%S')} - 4 hours > {created.strftime('%Y-%m-%d %H:%M:%S')}") 37 | previous_timestamp = new_timestamp 38 | file_lines += 1 39 | if file_lines % 100000 == 0: 40 | log.info(f"{files_processed}/{len(input_file_paths)}: {file_name} : {created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines:,} : {(file_bytes_processed / file_size) * 100:.0f}%") 41 | 42 | files_processed += 1 43 | log.info(f"{files_processed}/{len(input_file_paths)}: {file_name} : {created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines:,} : 100%") 44 | -------------------------------------------------------------------------------- /personal/diagnostic/test_files_multiprocess.py: -------------------------------------------------------------------------------- 1 | import zstandard 2 | import os 3 | import json 4 | import sys 5 | import time 6 | import argparse 7 | import re 8 | from collections import defaultdict 9 | from datetime import datetime 10 | import logging.handlers 11 | import multiprocessing 12 | 13 | 14 | # sets up logging to the console as well as a file 15 | log = logging.getLogger("bot") 16 | log.setLevel(logging.INFO) 17 | log_formatter = logging.Formatter('%(asctime)s - %(levelname)s: %(message)s') 18 | 19 | log_stderr_handler = logging.StreamHandler() 20 | log_stderr_handler.setFormatter(log_formatter) 21 | log.addHandler(log_stderr_handler) 22 | if not os.path.exists("logs"): 23 | os.makedirs("logs") 24 | log_file_handler = logging.handlers.RotatingFileHandler( 25 | os.path.join("logs", "bot.log"), maxBytes=1024*1024*16, backupCount=5) 26 | log_file_handler.setFormatter(log_formatter) 27 | log.addHandler(log_file_handler) 28 | 29 | 30 | # convenience object used to pass status information between processes 31 | class FileConfig: 32 | def __init__(self, input_path, output_path=None, complete=False, lines_processed=0, error_lines=0): 33 | self.input_path = input_path 34 | self.output_path = output_path 35 | self.file_size = os.stat(input_path).st_size 36 | self.complete = complete 37 | self.bytes_processed = self.file_size if complete else 0 38 | self.lines_processed = lines_processed if complete else 0 39 | self.error_message = None 40 | self.error_lines = error_lines 41 | 42 | def __str__(self): 43 | return f"{self.input_path} : {self.output_path} : {self.file_size} : {self.complete} : {self.bytes_processed} : {self.lines_processed}" 44 | 45 | 46 | # used for calculating running average of read speed 47 | class Queue: 48 | def __init__(self, max_size): 49 | self.list = [] 50 | self.max_size = max_size 51 | 52 | def put(self, item): 53 | if len(self.list) >= self.max_size: 54 | self.list.pop(0) 55 | self.list.append(item) 56 | 57 | def peek(self): 58 | return self.list[0] if len(self.list) > 0 else None 59 | 60 | 61 | # save file information and progress to a json file 62 | # we don't want to save the whole FileConfig object, since some 
info resets if we restart 63 | def save_file_list(input_files, status_json, script_type): 64 | simple_file_list = [] 65 | for file in input_files.values(): 66 | simple_file_list.append([file.input_path, file.output_path, file.complete, file.lines_processed, file.error_lines]) 67 | with open(status_json, 'w') as status_json_file: 68 | output_dict = { 69 | "files": simple_file_list, 70 | "type": script_type, 71 | } 72 | status_json_file.write(json.dumps(output_dict, indent=4)) 73 | 74 | 75 | # load file information from the json file and recalculate file sizes 76 | def load_file_list(status_json): 77 | if os.path.exists(status_json): 78 | with open(status_json, 'r') as status_json_file: 79 | output_dict = json.load(status_json_file) 80 | input_files = {} 81 | for simple_file in output_dict["files"]: 82 | input_files[simple_file[0]] = FileConfig(simple_file[0], simple_file[1], simple_file[2], simple_file[3], simple_file[4]) 83 | return input_files, output_dict["type"] 84 | else: 85 | return None, None 86 | 87 | 88 | # recursively decompress and decode a chunk of bytes. If there's a decode error then read another chunk and try with that, up to a limit of max_window_size bytes 89 | def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0): 90 | chunk = reader.read(chunk_size) 91 | bytes_read += chunk_size 92 | if previous_chunk is not None: 93 | chunk = previous_chunk + chunk 94 | try: 95 | return chunk.decode() 96 | except UnicodeDecodeError: 97 | if bytes_read > max_window_size: 98 | raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes") 99 | return read_and_decode(reader, chunk_size, max_window_size, chunk, bytes_read) 100 | 101 | 102 | # open a zst compressed ndjson file and yield lines one at a time 103 | # also passes back file progress 104 | def read_lines_zst(file_name): 105 | with open(file_name, 'rb') as file_handle: 106 | buffer = '' 107 | reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle) 108 | while True: 109 | chunk = read_and_decode(reader, 2**27, (2**29) * 2) 110 | if not chunk: 111 | break 112 | lines = (buffer + chunk).split("\n") 113 | 114 | for line in lines[:-1]: 115 | yield line, file_handle.tell() 116 | 117 | buffer = lines[-1] 118 | reader.close() 119 | 120 | 121 | def process_file(file, queue): 122 | try: 123 | for line, file_bytes_processed in read_lines_zst(file.input_path): 124 | try: 125 | obj = json.loads(line) 126 | observed = obj["created_utc"] 127 | # just load the json and try to access a field to make sure it works 128 | except (KeyError, json.JSONDecodeError) as err: 129 | file.error_lines += 1 130 | file.lines_processed += 1 131 | if file.lines_processed % 1000000 == 0: 132 | file.bytes_processed = file_bytes_processed 133 | queue.put(file) 134 | 135 | file.complete = True 136 | file.bytes_processed = file.file_size 137 | except Exception as err: 138 | file.error_message = str(err) 139 | queue.put(file) 140 | 141 | 142 | def process_update(input_files, queue, last_log_time, force_write): 143 | file_update = queue.get() 144 | if file_update.error_message is not None: 145 | log.warning(f"File failed {file_update.input_path}: {file_update.error_message}") 146 | current_time = time.time() 147 | 148 | input_files[file_update.input_path] = file_update 149 | if force_write or last_log_time is None or (current_time - last_log_time) > 5 or queue.empty(): 150 | total_lines_processed = 0 151 | total_bytes_processed = 0 152 | total_lines_errored = 0 153 | files_processed = 0 
154 | files_errored = 0 155 | i = 0 156 | for file in input_files.values(): 157 | total_lines_processed += file.lines_processed 158 | total_bytes_processed += file.bytes_processed 159 | total_lines_errored += file.error_lines 160 | files_processed += 1 if file.complete or file.error_message is not None else 0 161 | files_errored += 1 if file.error_message is not None else 0 162 | i += 1 163 | if file_update.complete or file_update.error_message is not None: 164 | save_file_list(input_files, status_json, script_type) 165 | progress_queue.put([current_time, total_lines_processed, total_bytes_processed]) 166 | 167 | first_time, first_lines, first_bytes = progress_queue.peek() 168 | bytes_per_second = int((total_bytes_processed - first_bytes)/(current_time - first_time)) 169 | speed_queue.put(bytes_per_second) 170 | seconds_left = int((total_bytes - total_bytes_processed) / int(sum(speed_queue.list) / len(speed_queue.list))) 171 | minutes_left = int(seconds_left / 60) 172 | hours_left = int(minutes_left / 60) 173 | days_left = int(hours_left / 24) 174 | 175 | log.info( 176 | f"{total_lines_processed:,} lines at {(total_lines_processed - first_lines)/(current_time - first_time):,.0f}/s, {total_lines_errored:,} errored : " 177 | f"{(total_bytes_processed / (2**30)):.2f} gb at {(bytes_per_second / (2**20)):,.0f} mb/s, {(total_bytes_processed / total_bytes) * 100:.0f}% : " 178 | f"{files_processed}({files_errored})/{len(input_files)} files : " 179 | f"{(str(days_left) + 'd ' if days_left > 0 else '')}{hours_left - (days_left * 24)}:{minutes_left - (hours_left * 60):02}:{seconds_left - (minutes_left * 60):02} remaining : " 180 | f"{queue.qsize()} files in queue : {current_time} : {last_log_time} : {current_time - last_log_time if last_log_time is not None else 0} : " 181 | f"{(current_time - last_log_time) > 5 if last_log_time is not None else 0} : {int(current_time - last_log_time) > 5 if last_log_time is not None else 0} : " 182 | f"{last_log_time is None or (current_time - last_log_time) > 5 or queue.empty()} : {queue.empty()}") 183 | last_log_time = time.time() 184 | return last_log_time 185 | 186 | 187 | if __name__ == '__main__': 188 | parser = argparse.ArgumentParser(description="Use multiple processes to decompress and iterate over pushshift dump files") 189 | parser.add_argument("input", help="The input folder to recursively read files from") 190 | parser.add_argument("--processes", help="Number of processes to use", default=10, type=int) 191 | parser.add_argument("--debug", help="Enable debug logging", action='store_const', const=True, default=False) 192 | script_type = "test" 193 | 194 | args = parser.parse_args() 195 | 196 | if args.debug: 197 | log.setLevel(logging.DEBUG) 198 | 199 | log.info(f"Loading files from: {args.input}") 200 | 201 | multiprocessing.set_start_method('spawn') 202 | queue = multiprocessing.Manager().Queue() 203 | status_json = "status.json" 204 | input_files, saved_type = load_file_list(status_json) 205 | 206 | if saved_type and saved_type != script_type: 207 | log.warning(f"Script type doesn't match type from json file. 
Delete working folder") 208 | sys.exit(0) 209 | 210 | # if the file list wasn't loaded from the json, this is the first run, find what files we need to process 211 | if input_files is None: 212 | input_files = {} 213 | for subdir, dirs, files in os.walk(args.input): 214 | files.sort() 215 | for file_name in files: 216 | if file_name.endswith(".zst"): 217 | input_path = os.path.join(subdir, file_name) 218 | input_files[input_path] = FileConfig(input_path) 219 | 220 | save_file_list(input_files, status_json, script_type) 221 | else: 222 | log.info(f"Existing input file was read, if this is not correct you should delete the {status_json} folder and run this script again") 223 | 224 | files_processed = 0 225 | total_bytes = 0 226 | total_bytes_processed = 0 227 | total_lines_processed = 0 228 | total_lines_errored = 0 229 | files_to_process = [] 230 | # calculate the total file size for progress reports, build a list of incomplete files to process 231 | # do this largest to smallest by file size so that we aren't processing a few really big files with only a few threads at the end 232 | for file in sorted(input_files.values(), key=lambda item: item.file_size, reverse=True): 233 | total_bytes += file.file_size 234 | if file.complete: 235 | files_processed += 1 236 | total_lines_processed += file.lines_processed 237 | total_bytes_processed += file.file_size 238 | total_lines_errored += file.error_lines 239 | else: 240 | files_to_process.append(file) 241 | 242 | log.info(f"Processed {files_processed} of {len(input_files)} files with {(total_bytes_processed / (2**30)):.2f} of {(total_bytes / (2**30)):.2f} gigabytes") 243 | 244 | start_time = time.time() 245 | last_log_time = None 246 | if len(files_to_process): 247 | progress_queue = Queue(40) 248 | progress_queue.put([start_time, total_lines_processed, total_bytes_processed]) 249 | speed_queue = Queue(40) 250 | for file in files_to_process: 251 | log.debug(f"Processing file: {file.input_path}") 252 | # start the workers 253 | with multiprocessing.Pool(processes=min(args.processes, len(files_to_process))) as pool: 254 | workers = pool.starmap_async(process_file, [(file, queue) for file in files_to_process], chunksize=1, error_callback=log.info) 255 | while not workers.ready(): 256 | # loop until the workers are all done, pulling in status messages as they are sent 257 | last_log_time = process_update(input_files, queue, last_log_time, False) 258 | 259 | while not queue.empty(): 260 | 261 | 262 | log.info(f"{total_lines_processed:,}, {total_lines_errored} errored : {(total_bytes_processed / (2**30)):.2f} gb, {(total_bytes_processed / total_bytes) * 100:.0f}% : {files_processed}/{len(input_files)}") 263 | 264 | count_complete = 0 265 | count_incomplete = 0 266 | # build a list of output files to combine 267 | for file in input_files.values(): 268 | if not file.complete: 269 | if file.error_message is not None: 270 | log.info(f"File {file.input_path} errored {file.error_message}") 271 | else: 272 | log.info(f"File {file.input_path} is not marked as complete") 273 | count_incomplete += 1 274 | else: 275 | if file.error_lines > 0: 276 | log.info(f"File {file.input_path} has {file.error_lines:,} errored lines out of {file.lines_processed:,}") 277 | count_incomplete += 1 278 | else: 279 | count_complete += 1 280 | 281 | if count_incomplete > 0: 282 | log.info(f"{count_incomplete} files were not completed, errored or don't exist, something went wrong. 
Aborting") 283 | else: 284 | log.info(f"Processing complete, {count_complete} successful files") 285 | -------------------------------------------------------------------------------- /personal/mongo/export_mongo.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import utils 4 | import discord_logging 5 | import pymongo 6 | import time 7 | import sys 8 | from datetime import datetime 9 | 10 | log = discord_logging.init_logging() 11 | 12 | 13 | if __name__ == "__main__": 14 | mongo_address = sys.argv[1] # 192.168.1.131 15 | client = pymongo.MongoClient(f"mongodb://{mongo_address}:27017", serverSelectionTimeoutMS=5000) 16 | log.info(f"Database connected at {mongo_address} on {client.admin.command('serverStatus')['host']}") 17 | 18 | subreddits = [ 19 | "PersonalFinanceCanada" 20 | ] 21 | start_date = datetime(2020, 1, 1) 22 | end_date = datetime(2021, 1, 1) 23 | 24 | for subreddit in subreddits: 25 | count = 0 26 | start_time = time.time() 27 | cursor = client.reddit_database.comments.find( 28 | filter={"subreddit": subreddit, "created_utc": {"$gte": int(start_date.timestamp()), "$lt": int(end_date.timestamp())}}, 29 | projection={'_id': False}, 30 | sort=[('created_utc', pymongo.ASCENDING)] 31 | ) 32 | log.info(f"Got cursor in {int(time.time() - start_time)} seconds") 33 | 34 | output_writer = utils.OutputZst(r"\\MYCLOUDPR4100\Public\reddit_final\{0}_comments.zst".format(subreddit)) 35 | start_time = time.time() 36 | for comment in cursor: 37 | count += 1 38 | output_writer.write(json.dumps(comment, separators=(',', ':'))) 39 | output_writer.write("\n") 40 | if count % 10000 == 0: 41 | log.info(f"{count:,} through {datetime.utcfromtimestamp(int(comment['created_utc'])).strftime('%Y-%m-%d %H:%M:%S')} in {int(time.time() - start_time)} seconds r/{subreddit}") 42 | 43 | output_writer.close() 44 | log.info(f"{count:,} in {int(time.time() - start_time)} seconds r/{subreddit}") 45 | 46 | 47 | # db.comments.createIndex({subreddit:1}) // remove 48 | # db.comments.createIndex({subreddit:1, created_utc:1}) 49 | # db.comments.createIndex({author:1, created_utc:1}) 50 | # db.comments.createIndex({id:1}) 51 | # db.submissions.createIndex({subreddit:1, created_utc:1}) 52 | # db.submissions.createIndex({author:1, created_utc:1}) 53 | # db.submissions.createIndex({id:1}) 54 | # db.submissions.createIndex({created_utc:1}) 55 | # db.comments.createIndex({created_utc:1}) 56 | -------------------------------------------------------------------------------- /personal/mongo/group_subs.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import datetime 3 | import utils 4 | import discord_logging 5 | import pymongo 6 | import time 7 | import sys 8 | 9 | log = discord_logging.init_logging() 10 | 11 | 12 | if __name__ == "__main__": 13 | mongo_address = sys.argv[1] # 192.168.1.131 14 | client = pymongo.MongoClient(f"mongodb://{mongo_address}:27017", serverSelectionTimeoutMS=5000) 15 | log.info(f"Database connected at {mongo_address} on {client.admin.command('serverStatus')['host']}") 16 | 17 | count = 0 18 | start_time = time.time() 19 | start_date = int(datetime(2021, 6, 1).timestamp()) 20 | cursor = client.reddit_database.submissions.aggregate( 21 | [ 22 | {"$match": {"created_utc": {"$gt": start_date}}}, 23 | {"$project": {"subreddit": 1, "over_18": {"$cond": ["$over_18", 1, 0]}}}, 24 | {"$group": {"_id": "$subreddit", "countTotal": {"$count": {}}, "countNsfw": {"$sum": "$over_18"}}}, 25 | 
{"$match": {"countTotal": {"$gt": 100}}}, 26 | ], 27 | allowDiskUse=True 28 | ) 29 | log.info(f"Got cursor in {int(time.time() - start_time)} seconds") 30 | 31 | start_time = time.time() 32 | subreddits = [] 33 | for subreddit in cursor: 34 | subreddit['percent'] = int((subreddit['countNsfw']/subreddit['countTotal'])*100) 35 | if subreddit['percent'] >= 10: 36 | subreddits.append(subreddit) 37 | count += 1 38 | if count % 100000 == 0: 39 | log.info(f"{count:,} in {int(time.time() - start_time)} seconds") 40 | 41 | log.info(f"{count:,} in {int(time.time() - start_time)} seconds") 42 | 43 | file_out = open(r"\\MYCLOUDPR4100\Public\reddit_final\subreddits.txt", 'w') 44 | for subreddit in sorted(subreddits, key=lambda item: (item['percent'], item['countTotal']), reverse=True): 45 | file_out.write(f"{subreddit['_id']: <22}{subreddit['countTotal']: <8}{subreddit['countNsfw']: <8}{subreddit['percent']}%\n") 46 | file_out.close() 47 | 48 | 49 | # db.comments.createIndex({subreddit:1}) // remove 50 | # db.comments.createIndex({subreddit:1, created_utc:1}) 51 | # db.comments.createIndex({author:1, created_utc:1}) 52 | # db.comments.createIndex({id:1}) 53 | # db.submissions.createIndex({subreddit:1, created_utc:1}) 54 | # db.submissions.createIndex({author:1, created_utc:1}) 55 | # db.submissions.createIndex({id:1}) 56 | # db.submissions.createIndex({created_utc:1}) 57 | # db.comments.createIndex({created_utc:1}) 58 | -------------------------------------------------------------------------------- /personal/mongo/insert_mongo.py: -------------------------------------------------------------------------------- 1 | import utils 2 | import discord_logging 3 | import os 4 | import pymongo 5 | import sys 6 | from datetime import datetime 7 | 8 | log = discord_logging.init_logging() 9 | 10 | 11 | if __name__ == "__main__": 12 | mongo_address = sys.argv[1] # 192.168.1.131 13 | client = pymongo.MongoClient(f"mongodb://{mongo_address}:27017", serverSelectionTimeoutMS=5000) 14 | 15 | log.info(f"Database connected at {mongo_address} on {client.admin.command('serverStatus')['host']}") 16 | 17 | object_type = sys.argv[2] 18 | input_folder = sys.argv[3] 19 | input_files = [] 20 | total_size = 0 21 | for subdir, dirs, files in os.walk(input_folder + os.sep + object_type): 22 | files.sort() 23 | for filename in files: 24 | input_path = os.path.join(subdir, filename) 25 | if input_path.endswith(".zst"): 26 | file_size = os.stat(input_path).st_size 27 | total_size += file_size 28 | input_files.append([input_path, file_size]) 29 | 30 | log.info(f"Processing {len(input_files)} files of {(total_size / (2 ** 30)):.2f} gigabytes") 31 | 32 | collection = client.reddit_database[object_type] 33 | 34 | log.info(f"Using collection {object_type} which has {collection.estimated_document_count()} objects already") 35 | 36 | total_lines = 0 37 | total_bytes_processed = 0 38 | for input_file in input_files: 39 | file_lines = 0 40 | file_bytes_processed = 0 41 | created = None 42 | inserts = [] 43 | for obj, line, file_bytes_processed in utils.read_obj_zst_meta(input_file[0]): 44 | inserts.append(obj) 45 | if len(inserts) >= 10000: 46 | collection.insert_many(inserts) 47 | inserts = [] 48 | 49 | created = datetime.utcfromtimestamp(int(obj['created_utc'])) 50 | file_lines += 1 51 | if file_lines == 1: 52 | log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines + total_lines:,} : 0% : {(total_bytes_processed / total_size) * 100:.0f}%") 53 | if file_lines % 100000 == 0: 54 | log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : 
{file_lines + total_lines:,} : {(file_bytes_processed / input_file[1]) * 100:.0f}% : {(total_bytes_processed / total_size) * 100:.0f}%") 55 | 56 | if len(inserts) >= 0: 57 | collection.insert_many(inserts) 58 | total_lines += file_lines 59 | total_bytes_processed += input_file[1] 60 | log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {total_lines:,} : 100% : {(total_bytes_processed / total_size) * 100:.0f}%") 61 | 62 | log.info(f"Total: {total_lines}") 63 | -------------------------------------------------------------------------------- /personal/move/copy_listed_files.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import os 3 | import logging.handlers 4 | import re 5 | 6 | log = logging.getLogger("bot") 7 | log.setLevel(logging.DEBUG) 8 | log.addHandler(logging.StreamHandler()) 9 | 10 | if __name__ == '__main__': 11 | input_folder = r"\\MYCLOUDPR4100\Public\pushshift_output" 12 | output_folder = r"\\MYCLOUDPR4100\Public\request" 13 | subs = ['PoliticalDiscussion', 'worldnews', 'science'] 14 | overwrite = False 15 | 16 | lower_subs = set() 17 | for sub in subs: 18 | lower_subs.add(sub.lower()) 19 | 20 | matched_subs = set() 21 | total_size = 0 22 | for file_name in os.listdir(input_folder): 23 | file_path = os.path.join(input_folder, file_name) 24 | if file_name.endswith(".zst") and os.path.isfile(file_path): 25 | match = re.match(r"(\w+)(?:_(?:comments|submissions).zst)", file_name) 26 | if match: 27 | sub_cased = match.group(1) 28 | if sub_cased.lower() in lower_subs: 29 | matched_subs.add(sub_cased) 30 | file_size = os.stat(file_path).st_size 31 | total_size += file_size 32 | log.info(f"Copying {file_name} : {(file_size / (2**20)):,.0f} mb : {(total_size / (2**20)):,.0f} mb") 33 | output_path = os.path.join(output_folder, file_name) 34 | if overwrite or not os.path.exists(output_path): 35 | shutil.copy(file_path, output_path) 36 | 37 | log.info(f"Copied {len(matched_subs)}/{len(subs)} subs of total size {(total_size / (2**20)):,.0f} mb") 38 | if len(matched_subs) != len(lower_subs): 39 | lower_matched_subs = [sub.lower() for sub in matched_subs] 40 | for sub in lower_subs: 41 | if sub not in lower_matched_subs: 42 | log.info(f"Missing r/{sub}") 43 | 44 | sorted_case_subs = sorted(matched_subs) 45 | bldr = ['torrenttools create -a "https://academictorrents.com/announce.php" -c "Comments and submissions from r/'] 46 | bldr.append(', r/'.join(sorted_case_subs)) 47 | bldr.append(' through the end of 2022" --include ".*') 48 | bldr.append('.*zst" --include ".*'.join(sorted_case_subs)) 49 | bldr.append('.*zst" -o username.torrent reddit') 50 | log.info(''.join(bldr)) 51 | -------------------------------------------------------------------------------- /personal/move/move_files.py: -------------------------------------------------------------------------------- 1 | import os 2 | import discord_logging 3 | import re 4 | from datetime import datetime 5 | 6 | log = discord_logging.init_logging() 7 | 8 | 9 | if __name__ == "__main__": 10 | parent_folder = r"\\MYCLOUDPR4100\Public\ingest" 11 | folders = [r"ingest\comments",r"ingest\submissions",r"rescan\comments",r"rescan\submissions"] 12 | reg = re.compile(r"\d\d-\d\d-\d\d_\d\d-\d\d") 13 | for folder in folders: 14 | files = [] 15 | created_date_folders = set() 16 | folder_path = os.path.join(parent_folder, folder) 17 | for file in os.listdir(folder_path): 18 | file_path = os.path.join(folder_path, file) 19 | if file.endswith(".zst"): 20 | files.append(file) 21 | log.info(f"{folder}: 
{len(files):,}") 22 | 23 | count_moved = 0 24 | for file in files: 25 | match = reg.search(file) 26 | if not match: 27 | log.info(f"File doesn't match regex: {file}") 28 | continue 29 | file_date = datetime.strptime(match.group(), '%y-%m-%d_%H-%M') 30 | date_folder_name = file_date.strftime('%y-%m-%d') 31 | date_folder_path = os.path.join(folder_path, date_folder_name) 32 | if date_folder_name not in created_date_folders: 33 | log.info(f"Creating folder: {date_folder_path}") 34 | if not os.path.exists(date_folder_path): 35 | os.makedirs(date_folder_path) 36 | created_date_folders.add(date_folder_name) 37 | old_file_path = os.path.join(folder_path, file) 38 | new_file_path = os.path.join(date_folder_path, file) 39 | os.rename(old_file_path, new_file_path) 40 | count_moved += 1 41 | if count_moved % 100 == 0: 42 | log.info(f"{count_moved:,}/{len(files):,}: {folder}") 43 | log.info(f"{count_moved:,}/{len(files):,}: {folder}") 44 | -------------------------------------------------------------------------------- /personal/move/rename_files.py: -------------------------------------------------------------------------------- 1 | import os 2 | import discord_logging 3 | import re 4 | from datetime import datetime 5 | 6 | log = discord_logging.init_logging() 7 | 8 | 9 | if __name__ == "__main__": 10 | parent_folder = r"\\MYCLOUDPR4100\Public\ingest\combined\comments" 11 | files = [] 12 | for folder_name in os.listdir(parent_folder): 13 | folder = os.path.join(parent_folder, folder_name) 14 | for file in os.listdir(folder): 15 | file_path = os.path.join(parent_folder, folder, file) 16 | if file.endswith(".zst"): 17 | files.append((folder, file)) 18 | log.info(f"{parent_folder}: {len(files):,}") 19 | 20 | count_moved = 0 21 | for folder, old_file in files: 22 | old_path = os.path.join(folder, old_file) 23 | new_file = old_file.replace("RS_", "RC_") 24 | new_path = os.path.join(folder, new_file) 25 | 26 | os.rename(old_path, new_path) 27 | count_moved += 1 28 | if count_moved % 100 == 0: 29 | log.info(f"{count_moved:,}/{len(files):,}: {folder}") 30 | log.info(f"{count_moved:,}/{len(files):,}") 31 | -------------------------------------------------------------------------------- /personal/opt_in_quarantined.py: -------------------------------------------------------------------------------- 1 | import asyncpraw 2 | import requests 3 | import asyncio 4 | 5 | 6 | async def opt_in(reddit, subreddit_name): 7 | subreddit = await reddit.subreddit(subreddit_name) 8 | await subreddit.quaran.opt_in() 9 | 10 | 11 | async def main(subreddits): 12 | reddit = asyncpraw.Reddit("Watchful12") 13 | for subreddit_name in subreddits: 14 | print(f"r/{subreddit_name}") 15 | try: 16 | subreddit = await reddit.subreddit(subreddit_name) 17 | await subreddit.quaran.opt_in() 18 | except Exception as err: 19 | print(f"Error opting into r/{subreddit_name} : {err}") 20 | await reddit.close() 21 | 22 | 23 | if __name__ == "__main__": 24 | subreddits = requests.get("https://pastebin.com/raw/WKi36t1w").text.split("\r\n") 25 | asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) 26 | asyncio.run(main(subreddits)) 27 | -------------------------------------------------------------------------------- /personal/process_month.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('personal') 3 | sys.path.append('combine') 4 | sys.path.append('personal/combine') 5 | 6 | import os 7 | import argparse 8 | import json 9 | import time 10 | import 
logging.handlers 11 | import requests 12 | import praw 13 | import traceback 14 | from datetime import datetime, timedelta 15 | import multiprocessing_logging 16 | 17 | import discord_logging 18 | import multiprocessing 19 | 20 | log = discord_logging.init_logging() 21 | discord_logging.init_discord_logging( 22 | section_name="Watchful12", 23 | log_level=logging.WARNING, 24 | ) 25 | multiprocessing_logging.install_mp_handler(log) 26 | 27 | import utils 28 | from transform import split_blocks_by_minutes 29 | from combine.merge_and_backfill import build_day, IngestType, ObjectType 30 | from combine import build_month 31 | 32 | 33 | def get_pushshift_token(old_token): 34 | global pushshift_lock 35 | pushshift_lock.acquire() 36 | saved_token = load_pushshift_token() 37 | if saved_token is None or saved_token == "" or old_token == saved_token: 38 | if old_token is None: 39 | log.warning("No saved or passed in token") 40 | save_pushshift_token("") 41 | raise ValueError("No saved or passed in token") 42 | 43 | log.info(f"Requesting new token") 44 | result_token = re_auth_pushshift(old_token) 45 | save_pushshift_token(result_token) 46 | else: 47 | result_token = saved_token 48 | 49 | pushshift_lock.release() 50 | return result_token 51 | 52 | 53 | def save_pushshift_token(token): 54 | with open("pushshift.txt", 'w') as file: 55 | file.write(token) 56 | 57 | 58 | def load_pushshift_token(): 59 | if not os.path.exists("pushshift.txt"): 60 | return None 61 | with open("pushshift.txt", 'r') as file: 62 | token = file.read().strip() 63 | return token 64 | 65 | 66 | def re_auth_pushshift(old_token): 67 | url = f"https://auth.pushshift.io/refresh?access_token={old_token}" 68 | log.warning(f"Reauth request: {url}") 69 | response = requests.post(url) 70 | result = response.json() 71 | log.warning(f"Reauth response: {str(result)}") 72 | discord_logging.flush_discord() 73 | if 'access_token' in result: 74 | new_token = result['access_token'] 75 | log.warning(f"New pushshift token: {new_token}") 76 | save_pushshift_token(new_token) 77 | discord_logging.flush_discord() 78 | return new_token 79 | elif 'detail' in result: 80 | if result['detail'] == 'Access token is still active and can not be refreshed.': 81 | log.warning(f"Access token still active, trying request again") 82 | time.sleep(5) 83 | return old_token 84 | 85 | log.warning(f"Reauth failed: {result['detail']}") 86 | discord_logging.flush_discord() 87 | return old_token 88 | else: 89 | log.warning(f"Something went wrong re-authing") 90 | discord_logging.flush_discord() 91 | return old_token 92 | 93 | 94 | def init(p_lock): 95 | global pushshift_lock 96 | pushshift_lock = p_lock 97 | 98 | 99 | def save_status(status_json, stages, month): 100 | log.debug(f"Saving status: {stages}") 101 | output_dict = { 102 | "stages": stages, 103 | "month": month, 104 | } 105 | json_string = json.dumps(output_dict, indent=4, default=str) 106 | with open(status_json, 'w') as status_json_file: 107 | status_json_file.write(json_string) 108 | 109 | 110 | def load_status(status_json): 111 | if os.path.exists(status_json): 112 | with open(status_json, 'r') as status_json_file: 113 | output_dict = json.load(status_json_file) 114 | for stage_type, stage in output_dict["stages"].items(): 115 | if stage["merge"] is not None: 116 | stage["merge"] = datetime.strptime(stage["merge"], "%Y-%m-%d %H:%M:%S") 117 | return output_dict["stages"], output_dict["month"] 118 | else: 119 | stages = { 120 | "comment": { 121 | "split": False, 122 | "merge": None, # 24-02-01 123 | "build": False, 
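# per-type stage tracker persisted to the status json: "split" and "build" are booleans, while "merge" holds the last merged day as a datetime (e.g. serialized as "2024-02-02 00:00:00" by save_status, or None if merging hasn't started)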
124 | }, 125 | "submission": { 126 | "split": False, 127 | "merge": None, # 24-02-01 128 | "build": False, 129 | } 130 | } 131 | return stages, None 132 | 133 | 134 | def end_of_day(input_minute): 135 | return input_minute.replace(hour=0, minute=0, second=0) + timedelta(days=1) 136 | 137 | 138 | def process(queue, base_folder, month, file_type, type_stages, reddit_username, compression_level, ignore_ids): 139 | try: 140 | # for stage, status in type_stages.items(): 141 | # log.info(f"{file_type} {stage}: {status}") 142 | file_prefix = "RC" if file_type == "comment" else "RS" 143 | if not type_stages["split"]: 144 | original_split_file = os.path.join(base_folder, "reddit", "blocks", f"{file_prefix}_20{month}.zst") 145 | split_file = os.path.join(base_folder, "reddit", "blocks", f"{file_prefix}B_20{month}.zst") 146 | if os.path.exists(original_split_file): 147 | os.rename(original_split_file, split_file) 148 | 149 | if not os.path.exists(split_file): 150 | log.info(f"{file_type}: File {split_file} doesn't exist, checking for blocks") 151 | split_file = os.path.join(base_folder, "reddit", "blocks", f"{file_prefix}_20{month}.zst_blocks") 152 | if not os.path.exists(split_file): 153 | log.error(f"{file_type}: File {split_file} doesn't exist, aborting") 154 | return False 155 | 156 | split_folder = os.path.join(base_folder, "ingest", "download") 157 | 158 | log.info(f"{file_type}: Starting {file_type} split") 159 | log.info(f"{file_type}: Reading from: {split_file}") 160 | log.info(f"{file_type}: Writing to: {split_folder}") 161 | split_blocks_by_minutes.split_by_minutes(split_file, split_folder) 162 | 163 | log.warning(f"{file_type}: {file_type} split complete") 164 | discord_logging.flush_discord() 165 | queue.put((file_type, "split", True)) 166 | 167 | start_date = datetime.strptime(month, "%y-%m") 168 | if start_date.month == 12: 169 | end_date = start_date.replace(year=start_date.year + 1, month=1) 170 | else: 171 | end_date = start_date.replace(month=start_date.month + 1) 172 | if type_stages["merge"] is None or type_stages["merge"] < end_date: 173 | if type_stages["merge"] is not None: 174 | start_date = type_stages["merge"] 175 | 176 | log.info(f"{file_type}: Starting {file_type} merge from {start_date}") 177 | 178 | reddit = praw.Reddit(reddit_username) 179 | 180 | input_folders = [ 181 | (os.path.join(base_folder, "ingest", "ingest"), IngestType.INGEST), 182 | (os.path.join(base_folder, "ingest", "rescan"), IngestType.RESCAN), 183 | (os.path.join(base_folder, "ingest", "download"), IngestType.DOWNLOAD), 184 | ] 185 | for input_folder in input_folders: 186 | log.info(f"{file_type}: Reading from: {input_folder[0]} : {input_folder[1]}") 187 | combined_folder = os.path.join(base_folder, "ingest", "combined") 188 | log.info(f"{file_type}: Writing to: {combined_folder}") 189 | while start_date < end_date: 190 | build_day( 191 | start_date, 192 | input_folders, 193 | combined_folder, 194 | ObjectType.COMMENT if file_type == "comment" else ObjectType.SUBMISSION, 195 | reddit, 196 | ignore_ids, 197 | get_pushshift_token 198 | ) 199 | start_date = end_of_day(start_date) 200 | queue.put((file_type, "merge", start_date)) 201 | log.warning(f"{file_type}: {file_type} merge complete") 202 | discord_logging.flush_discord() 203 | 204 | if not type_stages["build"]: 205 | log.info(f"{file_type}: Starting {file_type} build") 206 | start_date = datetime.strptime(month, "%y-%m") 207 | 208 | input_folder = os.path.join(base_folder, "ingest", "combined") 209 | output_folder = os.path.join(base_folder, 
"reddit") 210 | log.info(f"{file_type}: Reading from: {input_folder}") 211 | log.info(f"{file_type}: Writing to: {output_folder}") 212 | build_month.build_month( 213 | start_date, 214 | input_folder, 215 | output_folder, 216 | file_type+"s", 217 | compression_level 218 | ) 219 | queue.put((file_type, "build", True)) 220 | log.warning(f"{file_type}: {file_type} build complete") 221 | discord_logging.flush_discord() 222 | 223 | log.warning(f"{file_type}: {file_type} all steps complete") 224 | 225 | log.info(f'torrenttools create -a "https://academictorrents.com/announce.php" -c "Reddit comments and submissions from 20{month}" --include ".*(comments|submissions).*R._20{month}.zst$" -o reddit_20{month}.torrent reddit') 226 | 227 | discord_logging.flush_discord() 228 | 229 | # for stage, status in type_stages.items(): 230 | # log.info(f"{file_type} {stage}: {status}") 231 | except Exception as err: 232 | log.warning(f"Error in {type}: {err}") 233 | log.warning(traceback.format_exc()) 234 | queue.put((file_type, "error", str(err))) 235 | discord_logging.flush_discord() 236 | # for stage, status in type_stages.items(): 237 | # log.info(f"{file_type} {stage}: {status}") 238 | 239 | 240 | if __name__ == "__main__": 241 | parser = argparse.ArgumentParser(description="") 242 | parser.add_argument('month', help='Month to process') 243 | parser.add_argument('folder', help='Folder under which all the files are stored') 244 | parser.add_argument("--ignore_ids", help="Ignore ids between the id ranges listed", default=None) 245 | parser.add_argument("--level", help="The compression ratio to output at", default="22") 246 | args = parser.parse_args() 247 | 248 | ignore_ids = [] 249 | if args.ignore_ids is not None: 250 | for id_range in args.ignore_ids.split(","): 251 | start_id, end_id = id_range.split("-") 252 | ignore_ids.append((utils.base36decode(start_id), utils.base36decode(end_id))) 253 | 254 | log.warning(f"Processing {args.month}") 255 | discord_logging.flush_discord() 256 | 257 | status_file = "process.json" 258 | stages, month = load_status(status_file) 259 | 260 | if month is not None and args.month != month: 261 | log.error(f"Month does not match saved month, aborting: {month} : {args.month}") 262 | sys.exit(0) 263 | month = args.month 264 | log.info(f"Processing {month}") 265 | level = int(args.level) 266 | log.info(f"Compression level: {level}") 267 | 268 | multiprocessing.set_start_method('spawn', force=True) 269 | queue = multiprocessing.Manager().Queue() 270 | p_lock = multiprocessing.Lock() 271 | with multiprocessing.Pool(processes=2, initializer=init, initargs=(p_lock,)) as pool: 272 | arguments = [] 273 | for file_type, type_stages in stages.items(): 274 | arguments.append((queue, args.folder, month, file_type, type_stages, "Watchful12", level, ignore_ids)) 275 | workers = pool.starmap_async(process, arguments, chunksize=1, error_callback=log.info) 276 | while not workers.ready() or not queue.empty(): 277 | file_type, stage, status = queue.get() 278 | if stage == "error": 279 | log.error(f"Error in {file_type}: {status}") 280 | stages[file_type][stage] = status 281 | save_status(status_file, stages, month) 282 | discord_logging.flush_discord() 283 | #log.info(f"workers {workers.ready()} : queue {queue.empty()}") 284 | discord_logging.flush_discord() 285 | -------------------------------------------------------------------------------- /personal/transform/split_blocks_by_minutes.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 
sys.path.append('personal') 3 | 4 | import discord_logging 5 | import os 6 | import zstandard 7 | from datetime import datetime 8 | import json 9 | import argparse 10 | 11 | log = discord_logging.get_logger(init=True) 12 | 13 | import utils 14 | 15 | NEWLINE_ENCODED = "\n".encode('utf-8') 16 | 17 | 18 | def split_by_minutes(input_file, output_file): 19 | file_type = "comments" if "RC" in input_file else "submissions" 20 | 21 | log.info(f"{file_type}: Input file: {input_file}") 22 | log.info(f"{file_type}: Output folder: {output_file}") 23 | previous_minute, output_handle, created_utc = None, None, None 24 | count_objects, count_minute = 0, 0 25 | if input_file.endswith(".zst"): 26 | reader = utils.read_obj_zst(input_file) 27 | elif input_file.endswith(".zst_blocks"): 28 | reader = utils.read_obj_zst_blocks(input_file) 29 | else: 30 | log.error(f"{file_type}: Unsupported file type: {input_file}") 31 | return 32 | for obj in reader: 33 | created_utc = datetime.utcfromtimestamp(obj["created_utc"]) 34 | current_minute = created_utc.replace(second=0) 35 | 36 | if previous_minute is None or current_minute > previous_minute: 37 | log.info(f"{file_type}: {created_utc.strftime('%y-%m-%d_%H-%M')}: {count_objects:,} : {count_minute: ,}") 38 | previous_minute = current_minute 39 | count_minute = 0 40 | if output_handle is not None: 41 | output_handle.close() 42 | 43 | output_path = os.path.join(output_file, file_type, created_utc.strftime('%y-%m-%d')) 44 | if not os.path.exists(output_path): 45 | os.makedirs(output_path) 46 | output_path = os.path.join(output_path, f"{('RC' if file_type == 'comments' else 'RS')}_{created_utc.strftime('%y-%m-%d_%H-%M')}.zst") 47 | output_handle = zstandard.ZstdCompressor().stream_writer(open(output_path, 'wb')) 48 | 49 | count_objects += 1 50 | count_minute += 1 51 | output_handle.write(json.dumps(obj, sort_keys=True).encode('utf-8')) 52 | output_handle.write(NEWLINE_ENCODED) 53 | 54 | if created_utc is None: 55 | log.error(f"{file_type}: {input_file} appears to be empty") 56 | sys.exit(1) 57 | log.info(f"{file_type}: {created_utc.strftime('%y-%m-%d_%H-%M')}: {count_objects:,} : {count_minute: ,}") 58 | if output_handle is not None: 59 | output_handle.close() 60 | 61 | 62 | if __name__ == "__main__": 63 | parser = argparse.ArgumentParser(description="Take a zst_blocks file and split it by minute chunks") 64 | parser.add_argument('--input', help='Input file', required=True) 65 | parser.add_argument('--output', help='Output folder', required=True) 66 | args = parser.parse_args() 67 | 68 | split_by_minutes(args.input, args.output) 69 | -------------------------------------------------------------------------------- /personal/transform/split_by_minutes.py: -------------------------------------------------------------------------------- 1 | import discord_logging 2 | import os 3 | import zstandard 4 | from datetime import datetime 5 | import json 6 | 7 | log = discord_logging.init_logging() 8 | 9 | import utils 10 | 11 | NEWLINE_ENCODED = "\n".encode('utf-8') 12 | 13 | 14 | if __name__ == "__main__": 15 | input_file = r"\\MYCLOUDPR4100\Public\RS_2023-09.zst" 16 | output_folder = r"\\MYCLOUDPR4100\Public\ingest\download" 17 | file_type = "comments" if "RC" in input_file else "submissions" 18 | 19 | log.info(f"Input: {input_file} - Output: {output_folder}") 20 | previous_minute, output_handle, created_utc = None, None, None 21 | count_objects, count_minute = 0, 0 22 | for obj in utils.read_obj_zst(input_file): 23 | created_utc = datetime.utcfromtimestamp(obj["created_utc"]) 
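# truncate the timestamp to the start of its minute; whenever the minute rolls over, the current output file is closed and a fresh one is opened for the new minute
# e.g. a submission created at 2023-09-01 00:05 UTC is written to <output_folder>/submissions/23-09-01/RS_23-09-01_00-05.zst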
24 | current_minute = created_utc.replace(second=0) 25 | 26 | if previous_minute is None or current_minute > previous_minute: 27 | log.info(f"{created_utc.strftime('%y-%m-%d_%H-%M')}: {count_objects:,} : {count_minute: ,}") 28 | previous_minute = current_minute 29 | count_minute = 0 30 | if output_handle is not None: 31 | output_handle.close() 32 | 33 | output_path = os.path.join(output_folder, file_type, created_utc.strftime('%y-%m-%d')) 34 | if not os.path.exists(output_path): 35 | os.makedirs(output_path) 36 | output_path = os.path.join(output_path, f"{('RC' if file_type == 'comments' else 'RS')}_{created_utc.strftime('%y-%m-%d_%H-%M')}.zst") 37 | output_handle = zstandard.ZstdCompressor().stream_writer(open(output_path, 'wb')) 38 | 39 | count_objects += 1 40 | count_minute += 1 41 | output_handle.write(json.dumps(obj, sort_keys=True).encode('utf-8')) 42 | output_handle.write(NEWLINE_ENCODED) 43 | 44 | log.info(f"{created_utc.strftime('%y-%m-%d_%H-%M')}: {count_objects:,} : {count_minute: ,}") 45 | if output_handle is not None: 46 | output_handle.close() 47 | -------------------------------------------------------------------------------- /personal/transform/split_by_subreddit.py: -------------------------------------------------------------------------------- 1 | import utils 2 | import discord_logging 3 | import os 4 | from datetime import datetime 5 | 6 | log = discord_logging.init_logging() 7 | 8 | 9 | if __name__ == "__main__": 10 | subreddits = {} 11 | field = 'subreddit' 12 | object_type = "submissions" 13 | folder = f"\\\\MYCLOUDPR4100\\Public\\reddit_final\\multisub_{object_type}" 14 | if not os.path.exists(folder): 15 | os.makedirs(folder) 16 | input_file = f"\\\\MYCLOUDPR4100\\Public\\reddit_final\\multisub_{object_type}.zst" 17 | input_file_size = os.stat(input_file).st_size 18 | total_lines = 0 19 | for comment, line, file_bytes_processed in utils.read_obj_zst_meta(input_file): 20 | if comment[field] not in subreddits: 21 | subreddits[comment[field]] = {'writer': utils.OutputZst(os.path.join(folder, comment[field] + f"_{object_type}.zst")), 'lines': 0} 22 | subreddit = subreddits[comment[field]] 23 | subreddit['writer'].write(line) 24 | subreddit['writer'].write("\n") 25 | subreddit['lines'] += 1 26 | total_lines += 1 27 | if total_lines % 100000 == 0: 28 | log.info(f"{total_lines:,} lines, {(file_bytes_processed / input_file_size) * 100:.0f}%") 29 | 30 | log.info(f"{total_lines:,} lines, 100%") 31 | 32 | for name, subreddit in subreddits.items(): 33 | log.info(f"r/{name}: {subreddit['lines']:,} lines") 34 | subreddit['writer'].close() 35 | -------------------------------------------------------------------------------- /personal/utils.py: -------------------------------------------------------------------------------- 1 | import zstandard 2 | import json 3 | import os 4 | from zst_blocks import ZstBlocksFile 5 | 6 | 7 | def read_obj_zst(file_name): 8 | with open(file_name, 'rb') as file_handle: 9 | buffer = '' 10 | reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle) 11 | while True: 12 | chunk = read_and_decode(reader, 2**27, (2**29) * 2) 13 | if not chunk: 14 | break 15 | lines = (buffer + chunk).split("\n") 16 | for line in lines[:-1]: 17 | if line == "": 18 | continue 19 | yield json.loads(line.strip()) 20 | 21 | buffer = lines[-1] 22 | reader.close() 23 | 24 | 25 | def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0): 26 | chunk = reader.read(chunk_size) 27 | bytes_read += chunk_size 28 | if 
previous_chunk is not None: 29 | chunk = previous_chunk + chunk 30 | try: 31 | return chunk.decode() 32 | except UnicodeDecodeError: 33 | if bytes_read > max_window_size: 34 | raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes") 35 | return read_and_decode(reader, chunk_size, max_window_size, chunk, bytes_read) 36 | 37 | 38 | def read_obj_zst_meta(file_name): 39 | with open(file_name, 'rb') as file_handle: 40 | buffer = '' 41 | reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle) 42 | while True: 43 | chunk = read_and_decode(reader, 2**27, (2**29) * 2) 44 | if not chunk: 45 | break 46 | lines = (buffer + chunk).split("\n") 47 | 48 | for line in lines[:-1]: 49 | line = line.strip() 50 | try: 51 | json_object = json.loads(line) 52 | except (KeyError, json.JSONDecodeError) as err: 53 | continue 54 | yield json_object, line, file_handle.tell() 55 | 56 | buffer = lines[-1] 57 | reader.close() 58 | 59 | 60 | class OutputZst: 61 | def __init__(self, file_name): 62 | output_file = open(file_name, 'wb') 63 | self.writer = zstandard.ZstdCompressor().stream_writer(output_file) 64 | 65 | def write(self, line): 66 | encoded_line = line.encode('utf-8') 67 | self.writer.write(encoded_line) 68 | 69 | def close(self): 70 | self.writer.close() 71 | 72 | def __enter__(self): 73 | return self 74 | 75 | def __exit__(self, exc_type, exc_value, exc_traceback): 76 | self.close() 77 | return True 78 | 79 | 80 | # copied from https://github.com/ArthurHeitmann/zst_blocks_format 81 | def read_obj_zst_blocks(file_name): 82 | with open(file_name, "rb") as file: 83 | for row in ZstBlocksFile.streamRows(file): 84 | line = row.decode() 85 | yield json.loads(line.strip()) 86 | 87 | 88 | def base36encode(integer: int) -> str: 89 | chars = '0123456789abcdefghijklmnopqrstuvwxyz' 90 | sign = '-' if integer < 0 else '' 91 | integer = abs(integer) 92 | result = '' 93 | while integer > 0: 94 | integer, remainder = divmod(integer, 36) 95 | result = chars[remainder] + result 96 | return sign + result 97 | 98 | 99 | def base36decode(base36: str) -> int: 100 | return int(base36, 36) 101 | 102 | 103 | def merge_lowest_highest_id(str_id, lowest_id, highest_id): 104 | int_id = base36decode(str_id) 105 | if lowest_id is None or int_id < lowest_id: 106 | lowest_id = int_id 107 | if highest_id is None or int_id > highest_id: 108 | highest_id = int_id 109 | return lowest_id, highest_id 110 | 111 | 112 | def chunk_list(items, chunk_size): 113 | for i in range(0, len(items), chunk_size): 114 | yield items[i:i + chunk_size] 115 | -------------------------------------------------------------------------------- /personal/zst_blocks.py: -------------------------------------------------------------------------------- 1 | # copied from https://github.com/ArthurHeitmann/zst_blocks_format 2 | 3 | from __future__ import annotations 4 | from dataclasses import dataclass 5 | import os 6 | import time 7 | import struct 8 | from typing import BinaryIO, Callable, Iterable, Literal 9 | from zstandard import ZstdDecompressor, ZstdCompressor 10 | 11 | _endian: Literal["little", "big"] = "little" 12 | 13 | _uint32Struct = struct.Struct(" bytes: 27 | file.seek(rowPosition.blockOffset) 28 | return ZstBlock.readRow(file, rowPosition.rowIndex) 29 | 30 | @staticmethod 31 | def readMultipleBlocks(file: BinaryIO, rowPositions: list[RowPosition]) -> \ 32 | list[bytes]: 33 | blockGroupsDict: dict[int, RowPositionGroup] = {} 34 | for i, rowPosition in enumerate(rowPositions): 35 | if rowPosition.blockOffset 
not in blockGroupsDict: 36 | blockGroupsDict[rowPosition.blockOffset] = RowPositionGroup( 37 | rowPosition.blockOffset, []) 38 | blockGroupsDict[rowPosition.blockOffset].rowIndices.append( 39 | RowIndex(rowPosition.rowIndex, i)) 40 | blockGroups = list(blockGroupsDict.values()) 41 | 42 | rows: list = [None] * len(rowPositions) 43 | for blockGroup in blockGroups: 44 | file.seek(blockGroup.blockOffset) 45 | blockRows = ZstBlock.readSpecificRows(file, map(lambda 46 | pair: pair.withinBlockIndex, 47 | blockGroup.rowIndices)) 48 | for originalPosition, row in zip(blockGroup.rowIndices, blockRows): 49 | rows[originalPosition.originalRowIndex] = row 50 | 51 | return rows 52 | 53 | @staticmethod 54 | def streamRows(file: BinaryIO, blockIndexProgressCallback: Callable[[ 55 | int], None] | None = None) -> Iterable[bytes]: 56 | fileSize = os.path.getsize(file.name) 57 | blockIndex = 0 58 | while file.tell() < fileSize: 59 | yield from ZstBlock.streamRows(file) 60 | blockIndex += 1 61 | if blockIndexProgressCallback is not None: 62 | blockIndexProgressCallback(blockIndex) 63 | 64 | @staticmethod 65 | def appendBlock(file: BinaryIO, rows: list[bytes], 66 | compressionLevel=_defaultCompressionLevel) -> None: 67 | file.seek(file.tell()) 68 | ZstBlock(rows).write(file, compressionLevel=compressionLevel) 69 | 70 | @staticmethod 71 | def writeStream(file: BinaryIO, rowStream: Iterable[bytes], blockSize: int, 72 | rowPositions: list[RowPosition] | None = None, 73 | compressionLevel=_defaultCompressionLevel) -> None: 74 | pendingRows = [] 75 | for row in rowStream: 76 | pendingRows.append(row) 77 | if len(pendingRows) >= blockSize: 78 | ZstBlock(pendingRows).write(file, rowPositions, 79 | compressionLevel=compressionLevel) 80 | pendingRows = [] 81 | if len(pendingRows) > 0: 82 | ZstBlock(pendingRows).write(file, rowPositions, 83 | compressionLevel=compressionLevel) 84 | 85 | @staticmethod 86 | def writeBlocksStream(file: BinaryIO, blocksStream: Iterable[list[bytes]], 87 | rowPositions: list[RowPosition] | None = None, 88 | compressionLevel=_defaultCompressionLevel) -> None: 89 | for rows in blocksStream: 90 | ZstBlock(rows).write(file, rowPositions, 91 | compressionLevel=compressionLevel) 92 | 93 | @staticmethod 94 | def countBlocks(file: BinaryIO) -> int: 95 | fileSize = os.path.getsize(file.name) 96 | blockCount = 0 97 | initialPos = file.tell() 98 | pos = initialPos 99 | while pos < fileSize: 100 | blockCount += 1 101 | blockSize = _uint32Struct.unpack(file.read(4))[0] 102 | pos += 4 + blockSize 103 | file.seek(pos) 104 | file.seek(initialPos) 105 | return blockCount 106 | 107 | @staticmethod 108 | def generateRowPositions(file: BinaryIO) -> Iterable[RowPosition]: 109 | fileSize = os.path.getsize(file.name) 110 | while file.tell() < fileSize: 111 | yield from ZstBlock.generateRowPositions(file) 112 | 113 | 114 | class ZstBlock: 115 | rows: list[bytes] 116 | 117 | def __init__(self, rows: list[bytes]): 118 | self.rows = rows 119 | 120 | @classmethod 121 | def streamRows(cls, file: BinaryIO) -> Iterable[bytes]: 122 | compressedSize = _uint32Struct.unpack(file.read(4))[0] 123 | compressedData = file.read(compressedSize) 124 | decompressedData = ZstdDecompressor().decompress(compressedData) 125 | 126 | memoryView = memoryview(decompressedData) 127 | count = _uint32Struct.unpack(memoryView[0:4])[0] 128 | rows: list[ZstRowInfo] = [None] * count 129 | for i in range(count): 130 | rows[i] = ZstRowInfo.read(memoryView, 4 + i * ZstRowInfo.structSize) 131 | 132 | dataStart = 4 + count * ZstRowInfo.structSize 133 | 
for row in rows: 134 | yield decompressedData[ 135 | dataStart + row.offset: dataStart + row.offset + row.size] 136 | 137 | @classmethod 138 | def readSpecificRows(cls, file: BinaryIO, rowIndices: Iterable[int]) -> \ 139 | list[bytes]: 140 | compressedSize = _uint32Struct.unpack(file.read(4))[0] 141 | compressedData = file.read(compressedSize) 142 | decompressedData = ZstdDecompressor().decompress(compressedData) 143 | 144 | memoryView = memoryview(decompressedData) 145 | count = _uint32Struct.unpack(memoryView[0:4])[0] 146 | rows: list[ZstRowInfo] = [None] * count 147 | for i in range(count): 148 | rows[i] = ZstRowInfo.read(memoryView, 4 + i * ZstRowInfo.structSize) 149 | 150 | dataStart = 4 + count * ZstRowInfo.structSize 151 | return [ 152 | decompressedData[ 153 | dataStart + rows[rowIndex].offset: dataStart + rows[ 154 | rowIndex].offset + rows[rowIndex].size] 155 | for rowIndex in rowIndices 156 | ] 157 | 158 | @classmethod 159 | def readRow(cls, file: BinaryIO, rowIndex: int) -> bytes: 160 | compressedSize = _uint32Struct.unpack(file.read(4))[0] 161 | compressedData = file.read(compressedSize) 162 | decompressedData = ZstdDecompressor().decompress(compressedData) 163 | 164 | memoryView = memoryview(decompressedData) 165 | count = _uint32Struct.unpack(memoryView[0:4])[0] 166 | if rowIndex >= count: 167 | raise Exception("Row index out of range") 168 | row = ZstRowInfo.read(memoryView, 4 + rowIndex * ZstRowInfo.structSize) 169 | 170 | dataStart = 4 + count * ZstRowInfo.structSize 171 | return decompressedData[ 172 | dataStart + row.offset: dataStart + row.offset + row.size] 173 | 174 | def write(self, file: BinaryIO, 175 | rowPositions: list[RowPosition] | None = None, 176 | compressionLevel=_defaultCompressionLevel) -> None: 177 | uncompressedSize = \ 178 | 4 + \ 179 | len(self.rows) * ZstRowInfo.structSize + \ 180 | sum(len(row) for row in self.rows) 181 | uncompressedBytes = bytearray(uncompressedSize) 182 | uncompressedBytes[0:4] = len(self.rows).to_bytes(4, _endian) 183 | 184 | dataOffset = 4 + len(self.rows) * ZstRowInfo.structSize 185 | blockOffset = file.tell() 186 | currentDataLocalOffset = 0 187 | for i in range(len(self.rows)): 188 | row = self.rows[i] 189 | rowInfo = ZstRowInfo(currentDataLocalOffset, len(row)) 190 | rowInfo.write(uncompressedBytes, 4 + i * ZstRowInfo.structSize) 191 | uncompressedBytes[ 192 | dataOffset + currentDataLocalOffset: dataOffset + currentDataLocalOffset + len( 193 | row)] = row 194 | currentDataLocalOffset += len(row) 195 | if rowPositions is not None: 196 | rowPositions.append(RowPosition(blockOffset, i)) 197 | uncompressedData = bytes(uncompressedBytes) 198 | compressedData = ZstdCompressor(compressionLevel).compress( 199 | uncompressedData) 200 | compressedSize = len(compressedData) 201 | blockBytes = bytearray(4 + compressedSize) 202 | blockBytes[0:4] = compressedSize.to_bytes(4, _endian) 203 | blockBytes[4:4 + compressedSize] = compressedData 204 | file.write(blockBytes) 205 | 206 | @staticmethod 207 | def generateRowPositions(file: BinaryIO) -> Iterable[RowPosition]: 208 | blockOffset = file.tell() 209 | compressedSize = _uint32Struct.unpack(file.read(4))[0] 210 | compressedData = file.read(compressedSize) 211 | decompressedData = ZstdDecompressor().decompress(compressedData) 212 | 213 | memoryView = memoryview(decompressedData) 214 | count = _uint32Struct.unpack(memoryView[0:4])[0] 215 | for i in range(count): 216 | yield RowPosition(blockOffset, i) 217 | 218 | 219 | class ZstRowInfo: 220 | structSize = 8 221 | offset: int 222 | size: 
int 223 | 224 | def __init__(self, offset: int, size: int): 225 | self.offset = offset 226 | self.size = size 227 | 228 | @staticmethod 229 | def read(bytes: bytes, position: int) -> ZstRowInfo: 230 | offset, size = _uint32X2Struct.unpack( 231 | bytes[position: position + ZstRowInfo.structSize]) 232 | return ZstRowInfo(offset, size) 233 | 234 | def write(self, bytes: bytearray, position: int) -> None: 235 | bytes[position + 0: position + 4] = self.offset.to_bytes(4, _endian) 236 | bytes[position + 4: position + 8] = self.size.to_bytes(4, _endian) 237 | 238 | 239 | @dataclass 240 | class RowPosition: 241 | blockOffset: int 242 | rowIndex: int 243 | 244 | 245 | @dataclass 246 | class RowIndex: 247 | withinBlockIndex: int 248 | originalRowIndex: int 249 | 250 | 251 | @dataclass 252 | class RowPositionGroup: 253 | blockOffset: int 254 | rowIndices: list[RowIndex] 255 | -------------------------------------------------------------------------------- /scripts/combine_folder_multiprocess.py: -------------------------------------------------------------------------------- 1 | # this script iterates through zst compressed ndjson files, like the pushshift reddit dumps, loads each line 2 | # and if it matches the criteria in the command line arguments, it's written out into a separate file for 3 | # that month. After all the ndjson files are processed, it iterates through the resulting files and combines 4 | # them into a final file. 5 | 6 | # this script assumes the files are named in chronological order and prefixed with RS_ or RC_, like the pushshift dumps 7 | 8 | # features: 9 | # - multiple processes in parallel to maximize drive read and decompression 10 | # - saves state as it completes each file and picks up where it stopped 11 | # - detailed progress indicators 12 | 13 | # examples: 14 | # - get all comments that have a subreddit field (subreddit is the default) of "wallstreetbets". This will create a single output file "wallstreetbets_comments.zst" in the folder the script is run in 15 | # python3 combine_folder_multiprocess.py reddit/comments --value wallstreetbets 16 | # - get all comments and submissions (assuming both types of dump files are under the reddit folder) that have an author field of Watchful1 or spez and output the results to a folder called pushshift. 
17 | # This will result in four files, pushshift/Watchful1_comments, pushshift/Watchful1_submissions, pushshift/spez_comments, pushshift/spez_submissions 18 | # python3 combine_folder_multiprocess.py reddit --field author --value Watchful1,spez --output pushshift 19 | 20 | import zstandard 21 | import os 22 | import json 23 | import sys 24 | import time 25 | import argparse 26 | import re 27 | from collections import defaultdict 28 | import logging.handlers 29 | import multiprocessing 30 | from enum import Enum 31 | 32 | 33 | # sets up logging to the console as well as a file 34 | log = logging.getLogger("bot") 35 | log.setLevel(logging.INFO) 36 | log_formatter = logging.Formatter('%(asctime)s - %(levelname)s: %(message)s') 37 | 38 | log_str_handler = logging.StreamHandler() 39 | log_str_handler.setFormatter(log_formatter) 40 | log.addHandler(log_str_handler) 41 | if not os.path.exists("logs"): 42 | os.makedirs("logs") 43 | log_file_handler = logging.handlers.RotatingFileHandler( 44 | os.path.join("logs", "bot.log"), maxBytes=1024*1024*16, backupCount=5) 45 | log_file_handler.setFormatter(log_formatter) 46 | log.addHandler(log_file_handler) 47 | 48 | 49 | class FileType(Enum): 50 | COMMENT = 1 51 | SUBMISSION = 2 52 | 53 | @staticmethod 54 | def to_str(file_type): 55 | if file_type == FileType.COMMENT: 56 | return "comments" 57 | elif file_type == FileType.SUBMISSION: 58 | return "submissions" 59 | return "other" 60 | 61 | 62 | # convenience object used to pass status information between processes 63 | class FileConfig: 64 | def __init__(self, input_path, output_path=None, complete=False, lines_processed=0, error_lines=0, lines_matched=0): 65 | self.input_path = input_path 66 | self.output_path = output_path 67 | self.file_size = os.stat(input_path).st_size 68 | self.complete = complete 69 | self.bytes_processed = self.file_size if complete else 0 70 | self.lines_processed = lines_processed if complete else 0 71 | self.error_message = None 72 | self.error_lines = error_lines 73 | self.lines_matched = lines_matched 74 | file_name = os.path.split(input_path)[1] 75 | if file_name.startswith("RS"): 76 | self.file_type = FileType.SUBMISSION 77 | elif file_name.startswith("RC"): 78 | self.file_type = FileType.COMMENT 79 | else: 80 | raise ValueError(f"Unknown working file type: {file_name}") 81 | 82 | def __str__(self): 83 | return f"{self.input_path} : {self.output_path} : {self.file_size} : {self.complete} : {self.bytes_processed} : {self.lines_processed}" 84 | 85 | 86 | # another convenience object to read and write from both zst files and ndjson files 87 | class FileHandle: 88 | newline_encoded = "\n".encode('utf-8') 89 | ext_len = len(".zst") 90 | 91 | def __init__(self, path, is_split=False): 92 | self.path = path 93 | self.is_split = is_split 94 | self.handles = {} 95 | 96 | def get_paths(self, character_filter=None): 97 | if self.is_split: 98 | paths = [] 99 | for file in os.listdir(self.path): 100 | if not file.endswith(".zst"): 101 | continue 102 | if character_filter is not None and character_filter != file[-FileHandle.ext_len - 1:-FileHandle.ext_len]: 103 | continue 104 | paths.append(os.path.join(self.path, file)) 105 | return paths 106 | else: 107 | return [self.path] 108 | 109 | def get_count_files(self): 110 | return len(self.get_paths()) 111 | 112 | # recursively decompress and decode a chunk of bytes. 
If there's a decode error then read another chunk and try with that, up to a limit of max_window_size bytes 113 | @staticmethod 114 | def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0): 115 | chunk = reader.read(chunk_size) 116 | bytes_read += chunk_size 117 | if previous_chunk is not None: 118 | chunk = previous_chunk + chunk 119 | try: 120 | return chunk.decode() 121 | except UnicodeDecodeError: 122 | if bytes_read > max_window_size: 123 | raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes") 124 | return FileHandle.read_and_decode(reader, chunk_size, max_window_size, chunk, bytes_read) 125 | 126 | # open a zst compressed ndjson file, or a regular uncompressed ndjson file and yield lines one at a time 127 | # also passes back file progress 128 | def yield_lines(self, character_filter=None): 129 | if self.is_split: 130 | if character_filter is not None: 131 | path = os.path.join(self.path, f"{character_filter}.zst") 132 | else: 133 | raise ValueError(f"{self.path} is split but no filter passed") 134 | else: 135 | path = self.path 136 | if os.path.exists(path): 137 | with open(path, 'rb') as file_handle: 138 | buffer = '' 139 | reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle) 140 | while True: 141 | chunk = FileHandle.read_and_decode(reader, 2**27, (2**29) * 2) 142 | if not chunk: 143 | break 144 | lines = (buffer + chunk).split("\n") 145 | 146 | for line in lines[:-1]: 147 | yield line, file_handle.tell() 148 | 149 | buffer = lines[-1] 150 | reader.close() 151 | 152 | # get either the main write handle or the character filter one, opening a new handle as needed 153 | def get_write_handle(self, character_filter=None): 154 | if character_filter is None: 155 | character_filter = 1 # use 1 as the default name since ints hash quickly 156 | handle = self.handles.get(character_filter) 157 | if handle is None: 158 | if character_filter == 1: 159 | path = self.path 160 | else: 161 | if not os.path.exists(self.path): 162 | os.makedirs(self.path) 163 | path = os.path.join(self.path, f"{character_filter}.zst") 164 | handle = zstandard.ZstdCompressor().stream_writer(open(path, 'wb')) 165 | self.handles[character_filter] = handle 166 | return handle 167 | 168 | # write a line, opening the appropriate handle 169 | def write_line(self, line, value=None): 170 | if self.is_split: 171 | if value is None: 172 | raise ValueError(f"{self.path} is split but no value passed") 173 | character_filter = value[:1] 174 | handle = self.get_write_handle(character_filter) 175 | else: 176 | handle = self.get_write_handle() 177 | 178 | handle.write(line.encode('utf-8')) 179 | handle.write(FileHandle.newline_encoded) 180 | 181 | def close(self): 182 | for handle in self.handles.values(): 183 | handle.close() 184 | 185 | 186 | # used for calculating running average of read speed 187 | class Queue: 188 | def __init__(self, max_size): 189 | self.list = [] 190 | self.max_size = max_size 191 | 192 | def put(self, item): 193 | if len(self.list) >= self.max_size: 194 | self.list.pop(0) 195 | self.list.append(item) 196 | 197 | def peek(self): 198 | return self.list[0] if len(self.list) > 0 else None 199 | 200 | 201 | # save file information and progress to a json file 202 | # we don't want to save the whole FileConfig object, since some info resets if we restart 203 | def save_file_list(input_files, working_folder, status_json, arg_string, script_type, completed_prefixes=None): 204 | if not os.path.exists(working_folder): 205 | 
os.makedirs(working_folder) 206 | simple_file_list = [] 207 | for file in input_files: 208 | simple_file_list.append([file.input_path, file.output_path, file.complete, file.lines_processed, file.error_lines, file.lines_matched]) 209 | if completed_prefixes is None: 210 | completed_prefixes = [] 211 | else: 212 | completed_prefixes = sorted([prefix for prefix in completed_prefixes]) 213 | with open(status_json, 'w') as status_json_file: 214 | output_dict = { 215 | "args": arg_string, 216 | "type": script_type, 217 | "completed_prefixes": completed_prefixes, 218 | "files": simple_file_list, 219 | } 220 | status_json_file.write(json.dumps(output_dict, indent=4)) 221 | 222 | 223 | # load file information from the json file and recalculate file sizes 224 | def load_file_list(status_json): 225 | if os.path.exists(status_json): 226 | with open(status_json, 'r') as status_json_file: 227 | output_dict = json.load(status_json_file) 228 | input_files = [] 229 | for simple_file in output_dict["files"]: 230 | input_files.append( 231 | FileConfig(simple_file[0], simple_file[1], simple_file[2], simple_file[3], simple_file[4], simple_file[5]) 232 | ) 233 | completed_prefixes = set() 234 | for prefix in output_dict["completed_prefixes"]: 235 | completed_prefixes.add(prefix) 236 | return input_files, output_dict["args"], output_dict["type"], completed_prefixes 237 | else: 238 | return None, None, None, set() 239 | 240 | 241 | # base of each separate process. Loads a file, iterates through lines and writes out 242 | # the ones where the `field` of the object matches `value`. Also passes status 243 | # information back to the parent via a queue 244 | def process_file(file, queue, field, values, partial, regex, split_intermediate): 245 | queue.put(file) 246 | input_handle = FileHandle(file.input_path) 247 | output_handle = FileHandle(file.output_path, is_split=split_intermediate) 248 | 249 | value = None 250 | if len(values) == 1: 251 | value = min(values) 252 | 253 | try: 254 | for line, file_bytes_processed in input_handle.yield_lines(): 255 | try: 256 | obj = json.loads(line) 257 | matched = False 258 | observed = obj[field].lower() 259 | if regex: 260 | for reg in values: 261 | if reg.search(observed): 262 | matched = True 263 | break 264 | elif partial: 265 | for val in values: 266 | if val in observed: 267 | matched = True 268 | break 269 | else: 270 | if value is not None: 271 | if observed == value: 272 | matched = True 273 | elif observed in values: 274 | matched = True 275 | 276 | if matched: 277 | output_handle.write_line(line, observed) 278 | file.lines_matched += 1 279 | except (KeyError, json.JSONDecodeError, AttributeError) as err: 280 | file.error_lines += 1 281 | file.lines_processed += 1 282 | if file.lines_processed % 1000000 == 0: 283 | file.bytes_processed = file_bytes_processed 284 | queue.put(file) 285 | 286 | output_handle.close() 287 | file.complete = True 288 | file.bytes_processed = file.file_size 289 | except Exception as err: 290 | file.error_message = str(err) 291 | queue.put(file) 292 | 293 | 294 | if __name__ == '__main__': 295 | parser = argparse.ArgumentParser(description="Use multiple processes to decompress and iterate over pushshift dump files") 296 | parser.add_argument("input", help="The input folder to recursively read files from") 297 | parser.add_argument("--output", help="Put the output files in this folder", default="") 298 | parser.add_argument("--working", help="The folder to store temporary files in", default="pushshift_working") 299 | 
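	# A rough, hypothetical illustration of how these arguments fit together (the folder and
	# subreddit names below are invented for the example, not taken from the repo):
	#   python scripts/combine_folder_multiprocess.py reddit/comments --value wallstreetbets,askreddit --processes 8
	# This would filter every RC_*/RS_* dump found under reddit/comments down to the two listed
	# subreddits, writing intermediate files to the working folder before combining them.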
parser.add_argument("--field", help="When deciding what lines to keep, use this field for comparisons", default="subreddit") 300 | parser.add_argument("--value", help="When deciding what lines to keep, compare the field to this value. Supports a comma separated list. This is case sensitive", default="pushshift") 301 | parser.add_argument("--value_list", help="A file of newline separated values to use. Overrides the value param if it is set", default=None) 302 | parser.add_argument("--processes", help="Number of processes to use", default=10, type=int) 303 | parser.add_argument("--file_filter", help="Regex filenames have to match to be processed", default="^RC_|^RS_") 304 | parser.add_argument( 305 | "--split_intermediate", 306 | help="Split the intermediate files by the first letter of the matched field, use if the filter will result in a large number of separate files", 307 | action="store_true") 308 | parser.add_argument( 309 | "--single_output", 310 | help="Output a single combined file instead of splitting by the search term", 311 | action="store_true") 312 | parser.add_argument( 313 | "--error_rate", help= 314 | "Percentage as an integer from 0 to 100 of the lines where the field can be missing. For the subreddit field especially, " 315 | "there are a number of posts that simply don't have a subreddit attached", default=1, type=int) 316 | parser.add_argument("--debug", help="Enable debug logging", action='store_const', const=True, default=False) 317 | parser.add_argument( 318 | "--partial", help="The values only have to be contained in the field, not match exactly. If this is set, " 319 | "the output files are not split by value. WARNING: This can severely slow down the script, especially if searching the " 320 | "body.", action='store_const', const=True, default=False) 321 | parser.add_argument( 322 | "--regex", help="The values are treated as regular expressions. If this is set, " 323 | "the output files are not split by value. WARNING: This can severely slow down the script, especially if searching the " 324 | "body. 
If set, ignores the --partial flag", action='store_const', const=True, default=False) 325 | script_type = "split" 326 | 327 | args = parser.parse_args() 328 | arg_string = f"{args.field}:{(args.value if args.value else args.value_list)}" 329 | 330 | if args.debug: 331 | log.setLevel(logging.DEBUG) 332 | 333 | log.info(f"Loading files from: {args.input}") 334 | if args.output: 335 | log.info(f"Writing output to: {args.output}") 336 | else: 337 | log.info(f"Writing output to working folder") 338 | 339 | if (args.partial or args.regex or args.single_output) and args.split_intermediate: 340 | log.info("The partial, regex and single_output flags are not compatible with the split_intermediate flag") 341 | sys.exit(1) 342 | 343 | values = set() 344 | if args.value_list: 345 | log.info(f"Reading {args.value_list} for values to compare") 346 | with open(args.value_list, 'r') as value_list_handle: 347 | for line in value_list_handle: 348 | values.add(line) 349 | 350 | else: 351 | values = set(args.value.split(",")) 352 | 353 | if args.regex: 354 | regexes = [] 355 | for reg in values: 356 | regexes.append(re.compile(reg)) 357 | values = regexes 358 | if len(values) > 1: 359 | log.info(f"Checking field {args.field} against {len(values)} regexes") 360 | else: 361 | log.info(f"Checking field {args.field} against regex {values[0]}") 362 | else: 363 | lower_values = set() 364 | for value_inner in values: 365 | lower_values.add(value_inner.strip().lower()) 366 | values = lower_values 367 | if len(values) > 5: 368 | val_string = f"any of {len(values)} values" 369 | elif len(values) == 1: 370 | val_string = f"the value {(','.join(values))}" 371 | else: 372 | val_string = f"any of the values {(','.join(values))}" 373 | if args.partial: 374 | log.info(f"Checking if any of {val_string} are contained in field {args.field}") 375 | else: 376 | log.info(f"Checking if any of {val_string} exactly match field {args.field}") 377 | 378 | if args.partial or args.regex or args.single_output: 379 | log.info(f"Outputing to a single combined file") 380 | 381 | multiprocessing.set_start_method('spawn') 382 | queue = multiprocessing.Manager().Queue() 383 | status_json = os.path.join(args.working, "status.json") 384 | input_files, saved_arg_string, saved_type, completed_prefixes = load_file_list(status_json) 385 | if saved_arg_string and saved_arg_string != arg_string: 386 | log.warning(f"Args don't match args from json file. Delete working folder") 387 | sys.exit(0) 388 | 389 | if saved_type and saved_type != script_type: 390 | log.warning(f"Script type doesn't match type from json file. 
Delete working folder") 391 | sys.exit(0) 392 | 393 | # if the file list wasn't loaded from the json, this is the first run, find what files we need to process 394 | if input_files is None: 395 | input_files = [] 396 | for subdir, dirs, files in os.walk(args.input): 397 | files.sort() 398 | for file_name in files: 399 | if file_name.endswith(".zst") and re.search(args.file_filter, file_name) is not None: 400 | input_path = os.path.join(subdir, file_name) 401 | if args.split_intermediate: 402 | output_extension = "" 403 | else: 404 | output_extension = ".zst" 405 | output_path = os.path.join(args.working, f"{file_name[:-4]}{output_extension}") 406 | input_files.append(FileConfig(input_path, output_path=output_path)) 407 | 408 | save_file_list(input_files, args.working, status_json, arg_string, script_type) 409 | else: 410 | log.info(f"Existing input file was read, if this is not correct you should delete the {args.working} folder and run this script again") 411 | 412 | files_processed, total_bytes, total_bytes_processed, total_lines_processed, total_lines_matched, total_lines_errored = 0, 0, 0, 0, 0, 0 413 | files_to_process = [] 414 | # calculate the total file size for progress reports, build a list of incomplete files to process 415 | # do this largest to smallest by file size so that we aren't processing a few really big files with only a few threads at the end 416 | for file in sorted(input_files, key=lambda item: item.file_size, reverse=True): 417 | total_bytes += file.file_size 418 | if file.complete: 419 | files_processed += 1 420 | total_lines_processed += file.lines_processed 421 | total_lines_matched += file.lines_matched 422 | total_bytes_processed += file.file_size 423 | total_lines_errored += file.error_lines 424 | else: 425 | files_to_process.append(file) 426 | 427 | log.info(f"Processed {files_processed} of {len(input_files)} files with {(total_bytes_processed / (2**30)):.2f} of {(total_bytes / (2**30)):.2f} gigabytes") 428 | 429 | start_time = time.time() 430 | if len(files_to_process): 431 | progress_queue = Queue(40) 432 | progress_queue.put([start_time, total_lines_processed, total_bytes_processed]) 433 | speed_queue = Queue(40) 434 | for file in files_to_process: 435 | log.info(f"Processing file: {file.input_path}") 436 | # start the workers 437 | with multiprocessing.Pool(processes=min(args.processes, len(files_to_process))) as pool: 438 | workers = pool.starmap_async(process_file, [(file, queue, args.field, values, args.partial, args.regex, args.split_intermediate) for file in files_to_process], chunksize=1, error_callback=log.info) 439 | while not workers.ready() or not queue.empty(): 440 | # loop until the workers are all done, pulling in status messages as they are sent 441 | file_update = queue.get() 442 | if file_update.error_message is not None: 443 | log.warning(f"File failed {file_update.input_path}: {file_update.error_message}") 444 | 445 | # this is the workers telling us they are starting a new file, print the debug message but nothing else 446 | if file_update.lines_processed == 0: 447 | log.debug(f"Starting file: {file_update.input_path} : {file_update.file_size:,}") 448 | continue 449 | 450 | # I'm going to assume that the list of files is short enough that it's no 451 | # big deal to just iterate each time since that saves a bunch of work 452 | total_lines_processed, total_lines_matched, total_bytes_processed, total_lines_errored, files_processed, files_errored, i = 0, 0, 0, 0, 0, 0, 0 453 | for file in input_files: 454 | if file.input_path == 
file_update.input_path: 455 | input_files[i] = file_update 456 | file = file_update 457 | total_lines_processed += file.lines_processed 458 | total_lines_matched += file.lines_matched 459 | total_bytes_processed += file.bytes_processed 460 | total_lines_errored += file.error_lines 461 | files_processed += 1 if file.complete or file.error_message is not None else 0 462 | files_errored += 1 if file.error_message is not None else 0 463 | i += 1 464 | if file_update.complete or file_update.error_message is not None: 465 | save_file_list(input_files, args.working, status_json, arg_string, script_type) 466 | log.debug(f"Finished file: {file_update.input_path} : {file_update.file_size:,}") 467 | current_time = time.time() 468 | progress_queue.put([current_time, total_lines_processed, total_bytes_processed]) 469 | 470 | first_time, first_lines, first_bytes = progress_queue.peek() 471 | bytes_per_second = int((total_bytes_processed - first_bytes)/(current_time - first_time)) 472 | speed_queue.put(bytes_per_second) 473 | seconds_left = int((total_bytes - total_bytes_processed) / int(sum(speed_queue.list) / len(speed_queue.list))) 474 | minutes_left = int(seconds_left / 60) 475 | hours_left = int(minutes_left / 60) 476 | days_left = int(hours_left / 24) 477 | 478 | log.info( 479 | f"{total_lines_processed:,} lines at {(total_lines_processed - first_lines)/(current_time - first_time):,.0f}/s, {total_lines_errored:,} errored, {total_lines_matched:,} matched : " 480 | f"{(total_bytes_processed / (2**30)):.2f} gb at {(bytes_per_second / (2**20)):,.0f} mb/s, {(total_bytes_processed / total_bytes) * 100:.0f}% : " 481 | f"{files_processed}({files_errored})/{len(input_files)} files : " 482 | f"{(str(days_left) + 'd ' if days_left > 0 else '')}{hours_left - (days_left * 24)}:{minutes_left - (hours_left * 60):02}:{seconds_left - (minutes_left * 60):02} remaining") 483 | 484 | log.info(f"{total_lines_processed:,}, {total_lines_errored} errored : {(total_bytes_processed / (2**30)):.2f} gb, {(total_bytes_processed / total_bytes) * 100:.0f}% : {files_processed}/{len(input_files)}") 485 | 486 | type_handles = defaultdict(list) 487 | prefixes = set() 488 | count_incomplete = 0 489 | count_intermediate_files = 0 490 | # build a list of output files to combine 491 | for file in sorted(input_files, key=lambda item: os.path.split(item.output_path)[1]): 492 | if not file.complete: 493 | if file.error_message is not None: 494 | log.info(f"File {file.input_path} errored {file.error_message}") 495 | else: 496 | log.info(f"File {file.input_path} is not marked as complete") 497 | count_incomplete += 1 498 | else: 499 | if file.error_lines > file.lines_processed * (args.error_rate * 0.01): 500 | log.info( 501 | f"File {file.input_path} has {file.error_lines:,} errored lines out of {file.lines_processed:,}, " 502 | f"{(file.error_lines / file.lines_processed) * (args.error_rate * 0.01):.2f}% which is above the limit of {args.error_rate}%") 503 | count_incomplete += 1 504 | elif file.output_path is not None and os.path.exists(file.output_path): 505 | input_handle = FileHandle(file.output_path, is_split=args.split_intermediate) 506 | for path in input_handle.get_paths(): 507 | prefixes.add(path[-FileHandle.ext_len - 1:-FileHandle.ext_len]) 508 | count_intermediate_files += 1 509 | type_handles[file.file_type].append(input_handle) 510 | 511 | if count_incomplete > 0: 512 | log.info(f"{count_incomplete} files were not completed, errored or don't exist, something went wrong. 
Aborting") 513 | sys.exit() 514 | 515 | log.info(f"Processing complete, combining {count_intermediate_files} result files") 516 | 517 | for completed_prefix in completed_prefixes: 518 | if completed_prefix in prefixes: 519 | prefixes.remove(completed_prefix) 520 | 521 | output_lines = 0 522 | output_handles = {} 523 | files_combined = 0 524 | if values: 525 | split = True 526 | else: 527 | split = False 528 | if args.split_intermediate: 529 | for prefix in sorted(prefixes): 530 | log.info(f"From {files_combined}/{count_intermediate_files} files to {len(output_handles):,} output handles : {output_lines:,}/{total_lines_matched:,} lines") 531 | for file_type, input_handles in type_handles.items(): 532 | for input_handle in input_handles: 533 | has_lines = False 534 | for line, file_bytes_processed in input_handle.yield_lines(character_filter=prefix): 535 | if not has_lines: 536 | has_lines = True 537 | files_combined += 1 538 | output_lines += 1 539 | obj = json.loads(line) 540 | observed_case = obj[args.field] 541 | observed = observed_case.lower() 542 | if observed not in output_handles: 543 | if args.output: 544 | if not os.path.exists(args.output): 545 | os.makedirs(args.output) 546 | output_file_path = os.path.join(args.output, f"{observed_case}_{FileType.to_str(file_type)}.zst") 547 | else: 548 | output_file_path = f"{observed_case}_{FileType.to_str(file_type)}.zst" 549 | log.debug(f"Writing to file {output_file_path}") 550 | output_handle = FileHandle(output_file_path) 551 | output_handles[observed] = output_handle 552 | else: 553 | output_handle = output_handles[observed] 554 | 555 | output_handle.write_line(line) 556 | if output_lines % 1000000 == 0: 557 | log.info(f"From {files_combined}/{count_intermediate_files} files to {len(output_handles):,} output handles : {output_lines:,}/{total_lines_matched:,} lines : {input_handle.path} / {prefix}") 558 | for handle in output_handles.values(): 559 | handle.close() 560 | output_handles = {} 561 | completed_prefixes.add(prefix) 562 | save_file_list(input_files, args.working, status_json, arg_string, script_type, completed_prefixes) 563 | 564 | else: 565 | log.info(f"From {files_combined}/{count_intermediate_files} files to {len(output_handles):,} output handles : {output_lines:,}/{total_lines_matched:,} lines") 566 | for file_type, input_handles in type_handles.items(): 567 | for input_handle in input_handles: 568 | files_combined += 1 569 | for line, file_bytes_processed in input_handle.yield_lines(): 570 | output_lines += 1 571 | obj = json.loads(line) 572 | if args.partial or args.regex or args.single_output: 573 | observed_case = "output" 574 | else: 575 | observed_case = obj[args.field] 576 | observed = observed_case.lower() 577 | if observed not in output_handles: 578 | if args.output: 579 | if not os.path.exists(args.output): 580 | os.makedirs(args.output) 581 | output_file_path = os.path.join(args.output, f"{observed_case}_{FileType.to_str(file_type)}.zst") 582 | else: 583 | output_file_path = f"{observed_case}_{FileType.to_str(file_type)}.zst" 584 | log.debug(f"Writing to file {output_file_path}") 585 | output_handle = FileHandle(output_file_path) 586 | output_handles[observed] = output_handle 587 | else: 588 | output_handle = output_handles[observed] 589 | 590 | output_handle.write_line(line) 591 | if output_lines % 1000000 == 0: 592 | log.info(f"From {files_combined}/{count_intermediate_files} files to {len(output_handles):,} output handles : {output_lines:,}/{total_lines_matched:,} lines : {input_handle.path}") 593 | for handle 
in output_handles.values(): 594 | handle.close() 595 | output_handles = {} 596 | 597 | log.info(f"From {files_combined}/{count_intermediate_files} files to {len(output_handles):,} output handles : {output_lines:,}/{total_lines_matched:,} lines") 598 | -------------------------------------------------------------------------------- /scripts/count_words_single_file.py: -------------------------------------------------------------------------------- 1 | # this is an example of loading and iterating over a single file, doing some processing along the way to export a resulting csv 2 | 3 | import zstandard 4 | import os 5 | import json 6 | from collections import defaultdict 7 | from datetime import datetime 8 | import logging.handlers 9 | 10 | 11 | log = logging.getLogger("bot") 12 | log.setLevel(logging.DEBUG) 13 | log.addHandler(logging.StreamHandler()) 14 | 15 | 16 | def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0): 17 | chunk = reader.read(chunk_size) 18 | bytes_read += chunk_size 19 | if previous_chunk is not None: 20 | chunk = previous_chunk + chunk 21 | try: 22 | return chunk.decode() 23 | except UnicodeDecodeError: 24 | if bytes_read > max_window_size: 25 | raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes") 26 | log.info(f"Decoding error with {bytes_read:,} bytes, reading another chunk") 27 | return read_and_decode(reader, chunk_size, max_window_size, chunk, bytes_read) 28 | 29 | 30 | def read_lines_zst(file_name): 31 | with open(file_name, 'rb') as file_handle: 32 | buffer = '' 33 | reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle) 34 | while True: 35 | chunk = read_and_decode(reader, 2**27, (2**29) * 2) 36 | 37 | if not chunk: 38 | break 39 | lines = (buffer + chunk).split("\n") 40 | 41 | for line in lines[:-1]: 42 | yield line, file_handle.tell() 43 | 44 | buffer = lines[-1] 45 | 46 | reader.close() 47 | 48 | 49 | if __name__ == "__main__": 50 | # the path to the input comment file 51 | input_path = r"\\MYCLOUDPR4100\Public\reddit\requests\wallstreetbets_comments.zst" 52 | # the path to the output csv file of word counts 53 | output_path = r"\\MYCLOUDPR4100\Public\reddit\wallstreetbets_counts.csv" 54 | # skip everything before this date. The subreddit was created in 2012, so there's a lot of dates before it gets to the good stuff if you want to skip them 55 | start_date = datetime.strptime("2020-01-01", '%Y-%m-%d') 56 | # list of word phrases to search for. 
Make sure these are all lowercase 57 | phrases = [ 58 | "diamond hands", 59 | "sell", 60 | ] 61 | 62 | # bunch of initialization stuff 63 | word_counts = defaultdict(int) 64 | file_lines = 0 65 | file_bytes_processed = 0 66 | created = None 67 | bad_lines = 0 68 | current_day = None 69 | output_file = open(output_path, 'w') 70 | output_file.write(f"Date,{(','.join(phrases))}\n") 71 | input_size = os.stat(input_path).st_size 72 | try: 73 | # this is the main loop where we iterate over every single line in the zst file 74 | for line, file_bytes_processed in read_lines_zst(input_path): 75 | try: 76 | # load the line into a json object 77 | obj = json.loads(line) 78 | # turn the created timestamp into a date object 79 | created = datetime.utcfromtimestamp(int(obj['created_utc'])) 80 | # skip if we're before the start date defined above 81 | if created >= start_date: 82 | # if this is a different day than the previous line we looked at, save the word counts to the csv 83 | if current_day != created.replace(hour=0, minute=0, second=0, microsecond=0): 84 | # don't save the dates if this is the very first day, we're just starting 85 | if current_day is not None: 86 | # write out the date at the beginning of the line 87 | output_file.write(f"{current_day.strftime('%Y-%m-%d')}") 88 | # for each phrase in the list, look up the count associated with it and write it out 89 | for phrase in phrases: 90 | output_file.write(",") 91 | output_file.write(str(word_counts[phrase])) 92 | output_file.write("\n") 93 | # reset the dictionary so we can start counting up for the new day 94 | word_counts = defaultdict(int) 95 | # update the variable to the new day, so we can then tell when we get to the next day 96 | current_day = created.replace(hour=0, minute=0, second=0, microsecond=0) 97 | 98 | # get the lowercase of the object text 99 | body_lower = obj['body'].lower() 100 | # for each of the phrases in the list 101 | for phrase in phrases: 102 | # check if it's in the text 103 | if phrase in body_lower: 104 | word_counts[phrase] += 1 105 | 106 | # just in case there's corruption somewhere in the file 107 | except (KeyError, json.JSONDecodeError) as err: 108 | bad_lines += 1 109 | file_lines += 1 110 | if file_lines % 100000 == 0: 111 | log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines:,} : {bad_lines:,} : {(file_bytes_processed / input_size) * 100:.0f}%") 112 | except Exception as err: 113 | log.info(err) 114 | 115 | # write out the last day 116 | output_file.write(f"{current_day.strftime('%Y-%m-%d')}") 117 | for phrase in phrases: 118 | output_file.write(",") 119 | output_file.write(str(word_counts[phrase])) 120 | output_file.write("\n") 121 | 122 | output_file.close() 123 | log.info(f"Complete : {file_lines:,} : {bad_lines:,}") 124 | -------------------------------------------------------------------------------- /scripts/filter_file.py: -------------------------------------------------------------------------------- 1 | import zstandard 2 | import os 3 | import json 4 | import sys 5 | import csv 6 | from datetime import datetime 7 | import logging.handlers 8 | import traceback 9 | # put the path to the input file, or a folder of files to process all of 10 | input_file = r"\\MYCLOUDPR4100\Public\reddit\subreddits23\wallstreetbets_submissions.zst" 11 | # put the name or path to the output file. The file extension from below will be added automatically.
If the input file is a folder, the output will be treated as a folder as well 12 | output_file = r"\\MYCLOUDPR4100\Public\output" 13 | # the format to output in, pick from the following options 14 | # zst: same as the input, a zstandard compressed ndjson file. Can be read by the other scripts in the repo 15 | # txt: an ndjson file, which is a text file with a separate json object on each line. Can be opened by any text editor 16 | # csv: a comma separated value file. Can be opened by a text editor or excel 17 | # WARNING READ THIS: if you use txt or csv output on a large input file without filtering out most of the rows, the resulting file will be extremely large. Usually about 7 times as large as the compressed input file 18 | output_format = "csv" 19 | # override the above format and output only this field into a text file, one per line. Useful if you want to make a list of authors or ids. See the examples below 20 | # any field that's in the dump is supported, but useful ones are 21 | # author: the username of the author 22 | # id: the id of the submission or comment 23 | # link_id: only for comments, the fullname of the submission the comment is associated with 24 | # parent_id: only for comments, the fullname of the parent of the comment. Either another comment or the submission if it's top level 25 | single_field = None 26 | # the fields in the file are different depending on whether it has comments or submissions. If we're writing a csv, we need to know which fields to write. 27 | # set this to true to write out to the log every time there's a bad line, set to false if you're expecting only some of the lines to match the key 28 | write_bad_lines = True 29 | 30 | # only output items between these two dates 31 | from_date = datetime.strptime("2005-01-01", "%Y-%m-%d") 32 | to_date = datetime.strptime("2030-12-31", "%Y-%m-%d") 33 | 34 | # the field to filter on, the values to filter with and whether it should be an exact match 35 | # some examples: 36 | # 37 | # return only objects where the author is u/watchful1 or u/spez 38 | # field = "author" 39 | # values = ["watchful1","spez"] 40 | # exact_match = True 41 | # 42 | # return only objects where the title contains either "stonk" or "moon" 43 | # field = "title" 44 | # values = ["stonk","moon"] 45 | # exact_match = False 46 | # 47 | # return only objects where the body contains either "stonk" or "moon". For submissions the body is in the "selftext" field, for comments it's in the "body" field 48 | # field = "selftext" 49 | # values = ["stonk","moon"] 50 | # exact_match = False 51 | # 52 | # 53 | # filter a submission file and then get a file with all the comments only in those submissions. This is a multi step process 54 | # add your submission filters and set the output file name to something unique 55 | # input_file = "redditdev_submissions.zst" 56 | # output_file = "filtered_submissions" 57 | # output_format = "csv" 58 | # field = "author" 59 | # values = ["watchful1"] 60 | # 61 | # run the script, this will result in a file called "filtered_submissions.csv" that contains only submissions by u/watchful1 62 | # now we'll run the script again with the same input and filters, but set the output to single field. 
Be sure to change the output file to a new name, but don't change any of the other inputs 63 | # output_file = "submission_ids" 64 | # single_field = "id" 65 | # 66 | # run the script again, this will result in a file called "submission_ids.txt" that has an id on each line 67 | # now we'll remove all the other filters and update the script to input from the comments file, and use the submission ids list we created before. And change the output name again so we don't override anything 68 | # input_file = "redditdev_comments.zst" 69 | # output_file = "filtered_comments" 70 | # single_field = None # resetting this back so it's not used 71 | # field = "link_id" # in the comment object, this is the field that contains the submission id 72 | # values_file = "submission_ids.txt" 73 | # exact_match = False # the link_id field has a prefix on it, so we can't do an exact match 74 | # 75 | # run the script one last time and now you have a file called "filtered_comments.csv" that only has comments from your submissions above 76 | # if you want only top level comments instead of all comments, you can set field to "parent_id" instead of "link_id" 77 | 78 | # change this to field = None if you don't want to filter by anything 79 | field = "body" 80 | values = [''] 81 | # if you have a long list of values, you can put them in a file and put the filename here. If set this overrides the value list above 82 | # if this list is very large, it could greatly slow down the process 83 | values_file = None 84 | exact_match = False 85 | 86 | 87 | # sets up logging to the console as well as a file 88 | log = logging.getLogger("bot") 89 | log.setLevel(logging.INFO) 90 | log_formatter = logging.Formatter('%(asctime)s - %(levelname)s: %(message)s') 91 | log_str_handler = logging.StreamHandler() 92 | log_str_handler.setFormatter(log_formatter) 93 | log.addHandler(log_str_handler) 94 | if not os.path.exists("logs"): 95 | os.makedirs("logs") 96 | log_file_handler = logging.handlers.RotatingFileHandler(os.path.join("logs", "bot.log"), maxBytes=1024*1024*16, backupCount=5) 97 | log_file_handler.setFormatter(log_formatter) 98 | log.addHandler(log_file_handler) 99 | 100 | 101 | def write_line_zst(handle, line): 102 | handle.write(line.encode('utf-8')) 103 | handle.write("\n".encode('utf-8')) 104 | 105 | 106 | def write_line_json(handle, obj): 107 | handle.write(json.dumps(obj)) 108 | handle.write("\n") 109 | 110 | 111 | def write_line_single(handle, obj, field): 112 | if field in obj: 113 | handle.write(obj[field]) 114 | else: 115 | log.info(f"{field} not in object {obj['id']}") 116 | handle.write("\n") 117 | 118 | 119 | def write_line_csv(writer, obj, is_submission): 120 | output_list = [] 121 | output_list.append(str(obj['score'])) 122 | output_list.append(datetime.fromtimestamp(int(obj['created_utc'])).strftime("%Y-%m-%d")) 123 | if is_submission: 124 | output_list.append(obj['title']) 125 | output_list.append(f"u/{obj['author']}") 126 | if 'permalink' in obj: 127 | output_list.append(f"https://www.reddit.com{obj['permalink']}") 128 | else: 129 | output_list.append(f"https://www.reddit.com/r/{obj['subreddit']}/comments/{obj['link_id'][3:]}/_/{obj['id']}") 130 | if is_submission: 131 | if obj['is_self']: 132 | if 'selftext' in obj: 133 | output_list.append(obj['selftext']) 134 | else: 135 | output_list.append("") 136 | else: 137 | output_list.append(obj['url']) 138 | else: 139 | output_list.append(obj['body']) 140 | writer.writerow(output_list) 141 | 142 | 143 | def read_and_decode(reader, chunk_size, max_window_size, 
previous_chunk=None, bytes_read=0): 144 | chunk = reader.read(chunk_size) 145 | bytes_read += chunk_size 146 | if previous_chunk is not None: 147 | chunk = previous_chunk + chunk 148 | try: 149 | return chunk.decode() 150 | except UnicodeDecodeError: 151 | if bytes_read > max_window_size: 152 | raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes") 153 | log.info(f"Decoding error with {bytes_read:,} bytes, reading another chunk") 154 | return read_and_decode(reader, chunk_size, max_window_size, chunk, bytes_read) 155 | 156 | 157 | def read_lines_zst(file_name): 158 | with open(file_name, 'rb') as file_handle: 159 | buffer = '' 160 | reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle) 161 | while True: 162 | chunk = read_and_decode(reader, 2**27, (2**29) * 2) 163 | 164 | if not chunk: 165 | break 166 | lines = (buffer + chunk).split("\n") 167 | 168 | for line in lines[:-1]: 169 | yield line.strip(), file_handle.tell() 170 | 171 | buffer = lines[-1] 172 | 173 | reader.close() 174 | 175 | 176 | def process_file(input_file, output_file, output_format, field, values, from_date, to_date, single_field, exact_match): 177 | output_path = f"{output_file}.{output_format}" 178 | is_submission = "submission" in input_file 179 | log.info(f"Input: {input_file} : Output: {output_path} : Is submission {is_submission}") 180 | writer = None 181 | if output_format == "zst": 182 | handle = zstandard.ZstdCompressor().stream_writer(open(output_path, 'wb')) 183 | elif output_format == "txt": 184 | handle = open(output_path, 'w', encoding='UTF-8') 185 | elif output_format == "csv": 186 | handle = open(output_path, 'w', encoding='UTF-8', newline='') 187 | writer = csv.writer(handle) 188 | else: 189 | log.error(f"Unsupported output format {output_format}") 190 | sys.exit() 191 | 192 | file_size = os.stat(input_file).st_size 193 | created = None 194 | matched_lines = 0 195 | bad_lines = 0 196 | total_lines = 0 197 | for line, file_bytes_processed in read_lines_zst(input_file): 198 | total_lines += 1 199 | if total_lines % 100000 == 0: 200 | log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {total_lines:,} : {matched_lines:,} : {bad_lines:,} : {file_bytes_processed:,}:{(file_bytes_processed / file_size) * 100:.0f}%") 201 | 202 | try: 203 | obj = json.loads(line) 204 | created = datetime.utcfromtimestamp(int(obj['created_utc'])) 205 | 206 | if created < from_date: 207 | continue 208 | if created > to_date: 209 | continue 210 | 211 | if field is not None: 212 | field_value = obj[field].lower() 213 | matched = False 214 | for value in values: 215 | if exact_match: 216 | if value == field_value: 217 | matched = True 218 | break 219 | else: 220 | if value in field_value: 221 | matched = True 222 | break 223 | if not matched: 224 | continue 225 | 226 | matched_lines += 1 227 | if output_format == "zst": 228 | write_line_zst(handle, line) 229 | elif output_format == "csv": 230 | write_line_csv(writer, obj, is_submission) 231 | elif output_format == "txt": 232 | if single_field is not None: 233 | write_line_single(handle, obj, single_field) 234 | else: 235 | write_line_json(handle, obj) 236 | else: 237 | log.info(f"Something went wrong, invalid output format {output_format}") 238 | except (KeyError, json.JSONDecodeError) as err: 239 | bad_lines += 1 240 | if write_bad_lines: 241 | if isinstance(err, KeyError): 242 | log.warning(f"Key {field} is not in the object: {err}") 243 | elif isinstance(err, json.JSONDecodeError): 244 | log.warning(f"Line decoding failed: 
{err}") 245 | log.warning(line) 246 | 247 | handle.close() 248 | log.info(f"Complete : {total_lines:,} : {matched_lines:,} : {bad_lines:,}") 249 | 250 | 251 | if __name__ == "__main__": 252 | if single_field is not None: 253 | log.info("Single field output mode, changing output file format to txt") 254 | output_format = "txt" 255 | 256 | if values_file is not None: 257 | values = [] 258 | with open(values_file, 'r') as values_handle: 259 | for value in values_handle: 260 | values.append(value.strip().lower()) 261 | log.info(f"Loaded {len(values)} from values file {values_file}") 262 | else: 263 | values = [value.lower() for value in values] # convert to lowercase 264 | 265 | log.info(f"Filtering field: {field}") 266 | if len(values) <= 20: 267 | log.info(f"On values: {','.join(values)}") 268 | else: 269 | log.info(f"On values:") 270 | for value in values: 271 | log.info(value) 272 | log.info(f"Exact match {('on' if exact_match else 'off')}. Single field {single_field}.") 273 | log.info(f"From date {from_date.strftime('%Y-%m-%d')} to date {to_date.strftime('%Y-%m-%d')}") 274 | log.info(f"Output format set to {output_format}") 275 | 276 | input_files = [] 277 | if os.path.isdir(input_file): 278 | if not os.path.exists(output_file): 279 | os.makedirs(output_file) 280 | for file in os.listdir(input_file): 281 | if not os.path.isdir(file) and file.endswith(".zst"): 282 | input_name = os.path.splitext(os.path.splitext(os.path.basename(file))[0])[0] 283 | input_files.append((os.path.join(input_file, file), os.path.join(output_file, input_name))) 284 | else: 285 | input_files.append((input_file, output_file)) 286 | log.info(f"Processing {len(input_files)} files") 287 | for file_in, file_out in input_files: 288 | try: 289 | process_file(file_in, file_out, output_format, field, values, from_date, to_date, single_field, exact_match) 290 | except Exception as err: 291 | log.warning(f"Error processing {file_in}: {err}") 292 | log.warning(traceback.format_exc()) 293 | -------------------------------------------------------------------------------- /scripts/find_overlapping_users.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from collections import defaultdict 3 | from datetime import datetime, timedelta 4 | import time 5 | import os 6 | import logging.handlers 7 | import zstandard 8 | import json 9 | 10 | # IMPORTANT SETUP INSTRUCTIONS 11 | # get subreddit files from here https://www.reddit.com/r/pushshift/comments/1itme1k/separate_dump_files_for_the_top_40k_subreddits/ 12 | # change the folder line to the folder where the files are stored 13 | # change the subreddits to the list of subreddits, one per line. The case must exactly match, ie, for r/AskReddit, put "AskReddit" 14 | # the files in the folder must match the format from the torrent, subreddit_type.zst, like AskReddit_comments.zst 15 | # the script will look for both comments and submissions files for each subreddit 16 | folder = r"\\MYCLOUDPR4100\Public\reddit\subreddits24" 17 | subreddits_string = """ 18 | askcarsales 19 | Denton 20 | relationship_advice 21 | Dallas 22 | askdfw 23 | AskMen 24 | rolex 25 | lego 26 | """ 27 | ignored_users = {'[deleted]', 'automoderator'} 28 | # this is a list of users to ignore when doing the comparison. 
Most popular bots post in many subreddits and aren't the person you're looking for 29 | # here's a good start, but add bots to your list as you encounter them https://github.com/Watchful1/PushshiftDumps/blob/master/scripts/ignored.txt 30 | ignored_users_file = "ignored.txt" 31 | min_comments_per_sub = 1 32 | output_file_name = "users.txt" 33 | require_first_subreddit = False # if true, print users that occur in the first subreddit and any one of the following ones. Otherwise just find the most overlap between all subs 34 | from_date = datetime.strptime("2005-01-01", "%Y-%m-%d") 35 | to_date = datetime.strptime("2040-12-31", "%Y-%m-%d") 36 | 37 | 38 | # sets up logging to the console as well as a file 39 | log = logging.getLogger("bot") 40 | log.setLevel(logging.INFO) 41 | log_formatter = logging.Formatter('%(asctime)s - %(levelname)s: %(message)s') 42 | log_str_handler = logging.StreamHandler() 43 | log_str_handler.setFormatter(log_formatter) 44 | log.addHandler(log_str_handler) 45 | if not os.path.exists("logs"): 46 | os.makedirs("logs") 47 | log_file_handler = logging.handlers.RotatingFileHandler(os.path.join("logs", "bot.log"), maxBytes=1024*1024*16, backupCount=5) 48 | log_file_handler.setFormatter(log_formatter) 49 | log.addHandler(log_file_handler) 50 | 51 | 52 | def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0): 53 | chunk = reader.read(chunk_size) 54 | bytes_read += chunk_size 55 | if previous_chunk is not None: 56 | chunk = previous_chunk + chunk 57 | try: 58 | return chunk.decode() 59 | except UnicodeDecodeError: 60 | if bytes_read > max_window_size: 61 | raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes") 62 | log.info(f"Decoding error with {bytes_read:,} bytes, reading another chunk") 63 | return read_and_decode(reader, chunk_size, max_window_size, chunk, bytes_read) 64 | 65 | 66 | def read_lines_zst(file_name): 67 | with open(file_name, 'rb') as file_handle: 68 | buffer = '' 69 | reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle) 70 | while True: 71 | chunk = read_and_decode(reader, 2**27, (2**29) * 2) 72 | 73 | if not chunk: 74 | break 75 | lines = (buffer + chunk).split("\n") 76 | 77 | for line in lines[:-1]: 78 | yield line.strip(), file_handle.tell() 79 | 80 | buffer = lines[-1] 81 | 82 | reader.close() 83 | 84 | 85 | def get_commenters_from_file(subreddit, subreddit_file, subreddit_commenters, total_lines, files_status, from_date, to_date): 86 | file_lines = 0 87 | created = None 88 | file_size = os.stat(subreddit_file).st_size 89 | for line, file_bytes_processed in read_lines_zst(subreddit_file): 90 | total_lines += 1 91 | file_lines += 1 92 | if total_lines % 100000 == 0: 93 | log.info(f"{files_status}: {total_lines:,}: r/{subreddit}: {created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines:,} : {(file_bytes_processed / file_size) * 100:.0f}%") 94 | 95 | try: 96 | obj = json.loads(line) 97 | created = datetime.utcfromtimestamp(int(obj['created_utc'])) 98 | if created < from_date or created > to_date: 99 | continue 100 | 101 | if obj['author'].lower() not in ignored_users: 102 | subreddit_commenters[obj['author']] += 1 103 | except (KeyError, json.JSONDecodeError) as err: 104 | pass 105 | log.info(f"{total_lines:,}: {subreddit_file}: {created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines:,} : 100%") 106 | return total_lines 107 | 108 | 109 | if __name__ == "__main__": 110 | log.info(f"Subreddit's folder: {folder}") 111 | if not os.path.exists(folder): 112 | 
log.error(f"Subreddit's folder either doesn't exist or the script doesn't have access to it: {folder}") 113 | sys.exit() 114 | subreddits = [] 115 | for line in subreddits_string.split("\n"): 116 | subreddit = line.strip() 117 | if subreddit == "": 118 | continue 119 | subreddits.append(subreddit) 120 | 121 | if len(subreddits) <= 10: 122 | log.info(f"Finding overlapping users in {', '.join(subreddits)}") 123 | else: 124 | log.info(f"Finding overlapping users in {len(subreddits)} subreddits") 125 | if require_first_subreddit: 126 | log.info(f"Finding users from the first subreddit that are in any of the other subreddits") 127 | log.info(f"Minimum comments per subreddit set to {min_comments_per_sub}") 128 | log.info(f"Outputting to {output_file_name}") 129 | log.info(f"From date {from_date.strftime('%Y-%m-%d')} to date {to_date.strftime('%Y-%m-%d')}") 130 | 131 | if os.path.exists(ignored_users_file): 132 | with open(ignored_users_file) as fh: 133 | for user in fh.readlines(): 134 | ignored_users.add(user.strip().lower()) 135 | log.info(f"Loaded {len(ignored_users)} ignored users from {ignored_users_file}") 136 | 137 | log.info(f"Checking that subreddit files are present") 138 | 139 | folder_files = {} 140 | for file in os.listdir(folder): 141 | folder_files[file.lower()] = file 142 | 143 | subreddit_stats = [] 144 | for subreddit in subreddits: 145 | subreddit_stat = {"files": 0, "bytes": 0, "name": subreddit} 146 | for file_type in ["submissions", "comments"]: 147 | file_ending = f"_{file_type}.zst" 148 | file_name = folder_files.get(f"{subreddit.lower()}{file_ending}") 149 | if file_name is None: 150 | continue 151 | subreddit_file = os.path.join(folder, file_name) 152 | 153 | subreddit_stat["name"] = file_name[0:-len(file_ending)] 154 | subreddit_stat[file_type] = subreddit_file 155 | subreddit_stat["files"] += 1 156 | subreddit_stat["bytes"] += os.stat(subreddit_file).st_size 157 | 158 | subreddit_stats.append(subreddit_stat) 159 | 160 | subreddit_stats.sort(key=lambda x: x["bytes"], reverse=True) 161 | abort = False 162 | for subreddit_stat in subreddit_stats: 163 | if subreddit_stat["files"] == 0: 164 | log.info(f"No files for {subreddit_stat['name']} exist") 165 | abort = True 166 | else: 167 | log.info(f"r/{subreddit_stat['name']} files total {(subreddit_stat['bytes'] / (2**30)):.2f} gb") 168 | 169 | if abort: 170 | log.error(f"The script can see {len(folder_files)} files in the folder, but not the ones requested: {folder}") 171 | sys.exit(0) 172 | 173 | commenterSubreddits = defaultdict(int) 174 | is_first = True 175 | total_lines = 0 176 | files_processed = 1 177 | for subreddit_stat in subreddit_stats: 178 | commenters = defaultdict(int) 179 | for file_type in ["submissions", "comments"]: 180 | total_lines = get_commenters_from_file( 181 | f"{subreddit_stat['name']}_{file_type}", 182 | subreddit_stat[file_type], 183 | commenters, 184 | total_lines, 185 | f"{files_processed}|{len(subreddit_stats)}", 186 | from_date, 187 | to_date 188 | ) 189 | for commenter in commenters: 190 | if require_first_subreddit and not is_first and commenter not in commenterSubreddits: 191 | continue 192 | if commenters[commenter] >= min_comments_per_sub: 193 | commenterSubreddits[commenter] += 1 194 | is_first = False 195 | files_processed += 1 196 | 197 | if require_first_subreddit: 198 | count_found = 0 199 | with open(output_file_name, 'w') as txt: 200 | txt.write(f"Commenters in r/{subreddits[0]} and at least one of {(', '.join(subreddits))}\n") 201 | for commenter, countSubreddits in 
commenterSubreddits.items(): 202 | if countSubreddits >= 2: 203 | count_found += 1 204 | txt.write(f"{commenter}\n") 205 | log.info(f"{count_found} commenters in r/{subreddits[0]} and at least one of {(', '.join(subreddits))}") 206 | 207 | else: 208 | sharedCommenters = defaultdict(list) 209 | for commenter, countSubreddits in commenterSubreddits.items(): 210 | if countSubreddits >= 2: 211 | sharedCommenters[countSubreddits].append(commenter) 212 | 213 | with open(output_file_name, 'w') as txt: 214 | log.info(f"Writing output to {output_file_name}") 215 | txt.write(f"Commenters in subreddits {(', '.join(subreddits))}\n") 216 | for i in range(len(subreddits)): 217 | commenters = len(sharedCommenters[len(subreddits) - i]) 218 | inner_str = f"but {i} " if i != 0 else "" 219 | log.info(f"{commenters} commenters in all {inner_str}subreddits") 220 | if commenters == 0: 221 | txt.write(f"No commenters in all {inner_str}subreddits\n") 222 | else: 223 | txt.write(f"{commenters} commenters in all {inner_str}subreddits\n") 224 | for user in sorted(sharedCommenters[len(subreddits) - i], key=str.lower): 225 | txt.write(f"{user}\n") 226 | txt.write("\n") 227 | if commenters > 3: 228 | break 229 | -------------------------------------------------------------------------------- /scripts/ignored.txt: -------------------------------------------------------------------------------- 1 | alphabet_order_bot 2 | AmputatorBot 3 | anti-gif-bot 4 | Anti-ThisBot-IB 5 | autotldr 6 | B0tRank 7 | converter-bot 8 | exclaim_bot 9 | GenderNeutralBot 10 | Good_Human_Bot_v2 11 | haikusbot 12 | LearnDifferenceBot 13 | LuckyNumber-Bot 14 | nice___bot 15 | of_patrol_bot 16 | Paid-Not-Payed-Bot 17 | Reddit-Book-Bot 18 | RemindMeBot 19 | SexPanther_Bot 20 | Shakespeare-Bot 21 | sneakpeekbot 22 | TheGratitudeBot 23 | TotesMessenger 24 | Upside_Down-Bot 25 | useles-converter-bot 26 | WaitingToBeTriggered 27 | WaterIsWetBot 28 | wikipedia_answer_bot 29 | WikiSummarizerBot 30 | TheDroidNextDoor 31 | agree-with-you 32 | BadDadBot 33 | Booty_Warrior_bot 34 | dadbot_2 35 | dadbot_3000 36 | FakespotAnalysisBot 37 | Grammar-Bot-Elite 38 | IamYodaBot 39 | LinkifyBot 40 | phonebatterylevelbot 41 | PORTMANTEAU-BOT 42 | queen_of_england_bot 43 | SpambotSwatter 44 | SpunkyDred 45 | tiny_smile_bot 46 | twitterInfo_bot 47 | YoMommaJokeBot 48 | FatFingerHelperBot 49 | WikiTextBot 50 | CommonMisspellingBot 51 | auto-xkcd37 52 | HelperBot_ 53 | imguralbumbot 54 | RepostSleuthBot 55 | wikipedia_text_bot 56 | TheSunflowerSeeds 57 | Bot_Metric 58 | CakeDay--Bot 59 | GoodBot_BadBot 60 | BigLebowskiBot 61 | jashxn 62 | nice-scores 63 | NoGoogleAMPBot 64 | sub_doesnt_exist_bot 65 | WikiMobileLinkBot 66 | EncouragementRobot 67 | AnimalFactsBot 68 | eazeaze 69 | epic_gamer_4268 70 | ghost_of_dongerbot 71 | LimbRetrieval-Bot 72 | LoneKharnivore 73 | video_descriptionbot 74 | WhyNotCollegeBoard 75 | youtubefactsbot 76 | _youtubot_ 77 | Alternative_Case_878 78 | botrickbateman 79 | ClickableLinkBot 80 | I-Am-Dad-Bot 81 | SEND_NUKES_PLZ 82 | UselessConversionBot 83 | Agrees_withyou 84 | AreYouDeaf 85 | autowikibot 86 | ectbot 87 | Generic_Reddit_Bot 88 | happy-cake-day-bot- 89 | Philip_Jeffries 90 | SokkaHaikuBot 91 | table_it_bot 92 | UkraineWithoutTheBot 93 | BotThatSaysBro 94 | ConceptMajestic9156 95 | kelvin_bot 96 | ReverseCaptioningBot 97 | ShelSilverstain 98 | SmileBot-2020 99 | Chuck_Norris_Jokebot 100 | ConvertsToMetric 101 | EmojifierBot 102 | haikubot-1911 103 | I-AM-PIRATE 104 | MaxImageBot 105 | nkid299 106 | resavr_bot 107 | serendipitybot 108 | 
SmallSubBot 109 | smile-bot-2019 110 | YodaOnReddit-Bot 111 | Anti_Fake_Yoda_Bot 112 | AntiObnoxiousBot 113 | bruh__bot 114 | LeEpicRedditor69 115 | lerobinbot 116 | not_so_magic_8_ball 117 | nwordcountbot 118 | oofed-bot 119 | RatedCommentBot 120 | same_post_bot 121 | same_subreddit_bot 122 | SuicideAwarenessBot 123 | thebenshapirobot 124 | these_days_bot 125 | totes_meta_bot 126 | aardBot 127 | gifv-bot 128 | I_Love_You-BOT 129 | imdad_bot 130 | metric_units 131 | YoUaReSoHiLaRiOuS 132 | HIPPAbot 133 | VettedBot 134 | ackchyually_bot 135 | amp-is-watching-you 136 | AntiLowEffortBot 137 | auddbot 138 | BananaFactBot 139 | BlogSpammr 140 | Chick-fil-A_spellbot 141 | CoolDownBot 142 | demonitize_bot 143 | EverySingleThread 144 | GANDHI-BOT 145 | HappyFriendlyBot 146 | icarebot 147 | kzreminderbot 148 | MCTerminologyBot 149 | Mentioned_Videos 150 | morejpeg_auto 151 | profanitycounter 152 | remindditbot 153 | SaveVideo 154 | savevideobot 155 | The-Worst-Bot 156 | theHelperdroid 157 | VredditDownloader 158 | YOUREABOT 159 | YTubeInfoBot 160 | URLfixerBot 161 | TweetsInCommentsBot 162 | SovietRussiaBot 163 | ShibeBot 164 | PressFBot 165 | LittleHelperRobot 166 | LinkFixerBot 167 | LinkFixerBotSnr 168 | Link_Demobilizer 169 | LazyLinkerBot 170 | Darnit_Bot 171 | checks_out_bot 172 | HippoBot9000 173 | could-of-bot 174 | mentionhelper 175 | RossGellerBot 176 | the_timezone_bot -------------------------------------------------------------------------------- /scripts/iterate_folder.py: -------------------------------------------------------------------------------- 1 | # this is an example of iterating over all zst files in a single folder, 2 | # decompressing them and reading the created_utc field to make sure the files 3 | # are intact. It has no output other than the number of lines 4 | 5 | import zstandard 6 | import os 7 | import json 8 | import sys 9 | from datetime import datetime 10 | import logging.handlers 11 | 12 | 13 | log = logging.getLogger("bot") 14 | log.setLevel(logging.DEBUG) 15 | log.addHandler(logging.StreamHandler()) 16 | 17 | 18 | def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0): 19 | chunk = reader.read(chunk_size) 20 | bytes_read += chunk_size 21 | if previous_chunk is not None: 22 | chunk = previous_chunk + chunk 23 | try: 24 | return chunk.decode() 25 | except UnicodeDecodeError: 26 | if bytes_read > max_window_size: 27 | raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes") 28 | log.info(f"Decoding error with {bytes_read:,} bytes, reading another chunk") 29 | return read_and_decode(reader, chunk_size, max_window_size, chunk, bytes_read) 30 | 31 | 32 | def read_lines_zst(file_name): 33 | with open(file_name, 'rb') as file_handle: 34 | buffer = '' 35 | reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle) 36 | while True: 37 | chunk = read_and_decode(reader, 2**27, (2**29) * 2) 38 | 39 | if not chunk: 40 | break 41 | lines = (buffer + chunk).split("\n") 42 | 43 | for line in lines[:-1]: 44 | yield line.strip(), file_handle.tell() 45 | 46 | buffer = lines[-1] 47 | 48 | reader.close() 49 | 50 | 51 | input_folder = sys.argv[1] 52 | input_files = [] 53 | total_size = 0 54 | for subdir, dirs, files in os.walk(input_folder): 55 | for filename in files: 56 | input_path = os.path.join(subdir, filename) 57 | if input_path.endswith(".zst"): 58 | file_size = os.stat(input_path).st_size 59 | total_size += file_size 60 | input_files.append([input_path, file_size]) 61 | 62 | 
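# the input folder comes from the first command line argument above; a hypothetical invocation
# (the path is only an example) would be: python scripts/iterate_folder.py /data/reddit/comments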
log.info(f"Processing {len(input_files)} files of {(total_size / (2**30)):.2f} gigabytes") 63 | 64 | total_lines = 0 65 | total_bytes_processed = 0 66 | for input_file in input_files: 67 | file_lines = 0 68 | file_bytes_processed = 0 69 | created = None 70 | for line, file_bytes_processed in read_lines_zst(input_file[0]): 71 | obj = json.loads(line) 72 | created = datetime.utcfromtimestamp(int(obj['created_utc'])) 73 | file_lines += 1 74 | if file_lines == 1: 75 | log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines + total_lines:,} : 0% : {(total_bytes_processed / total_size) * 100:.0f}%") 76 | if file_lines % 100000 == 0: 77 | log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines + total_lines:,} : {(file_bytes_processed / input_file[1]) * 100:.0f}% : {(total_bytes_processed / total_size) * 100:.0f}%") 78 | total_lines += file_lines 79 | total_bytes_processed += input_file[1] 80 | log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {total_lines:,} : 100% : {(total_bytes_processed / total_size) * 100:.0f}%") 81 | 82 | log.info(f"Total: {total_lines}") 83 | -------------------------------------------------------------------------------- /scripts/single_file.py: -------------------------------------------------------------------------------- 1 | # this is an example of loading and iterating over a single file 2 | 3 | import zstandard 4 | import os 5 | import json 6 | import sys 7 | from datetime import datetime 8 | import logging.handlers 9 | 10 | 11 | log = logging.getLogger("bot") 12 | log.setLevel(logging.DEBUG) 13 | log.addHandler(logging.StreamHandler()) 14 | 15 | 16 | def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0): 17 | chunk = reader.read(chunk_size) 18 | bytes_read += chunk_size 19 | if previous_chunk is not None: 20 | chunk = previous_chunk + chunk 21 | try: 22 | return chunk.decode() 23 | except UnicodeDecodeError: 24 | if bytes_read > max_window_size: 25 | raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes") 26 | log.info(f"Decoding error with {bytes_read:,} bytes, reading another chunk") 27 | return read_and_decode(reader, chunk_size, max_window_size, chunk, bytes_read) 28 | 29 | 30 | def read_lines_zst(file_name): 31 | with open(file_name, 'rb') as file_handle: 32 | buffer = '' 33 | reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle) 34 | while True: 35 | chunk = read_and_decode(reader, 2**27, (2**29) * 2) 36 | 37 | if not chunk: 38 | break 39 | lines = (buffer + chunk).split("\n") 40 | 41 | for line in lines[:-1]: 42 | yield line, file_handle.tell() 43 | 44 | buffer = lines[-1] 45 | 46 | reader.close() 47 | 48 | 49 | if __name__ == "__main__": 50 | file_path = sys.argv[1] 51 | file_size = os.stat(file_path).st_size 52 | file_lines = 0 53 | file_bytes_processed = 0 54 | created = None 55 | field = "subreddit" 56 | value = "wallstreetbets" 57 | bad_lines = 0 58 | # try: 59 | for line, file_bytes_processed in read_lines_zst(file_path): 60 | try: 61 | obj = json.loads(line) 62 | created = datetime.utcfromtimestamp(int(obj['created_utc'])) 63 | temp = obj[field] == value 64 | except (KeyError, json.JSONDecodeError) as err: 65 | bad_lines += 1 66 | file_lines += 1 67 | if file_lines % 100000 == 0: 68 | log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines:,} : {bad_lines:,} : {file_bytes_processed:,}:{(file_bytes_processed / file_size) * 100:.0f}%") 69 | 70 | # except Exception as err: 71 | # log.info(err) 72 | 73 | log.info(f"Complete : 
{file_lines:,} : {bad_lines:,}") 74 | 75 | -------------------------------------------------------------------------------- /scripts/to_csv.py: -------------------------------------------------------------------------------- 1 | # this converts a zst file to csv 2 | # 3 | # it's important to note that the resulting file will likely be quite large 4 | # and you probably won't be able to open it in excel or another csv reader 5 | # 6 | # arguments are inputfile, outputfile, fields 7 | # call this like 8 | # python to_csv.py wallstreetbets_submissions.zst wallstreetbets_submissions.csv author,selftext,title 9 | 10 | import zstandard 11 | import os 12 | import json 13 | import sys 14 | import csv 15 | from datetime import datetime 16 | import logging.handlers 17 | 18 | 19 | # put the path to the input file 20 | input_file_path = r"\\MYCLOUDPR4100\Public\tools\PushshiftDumps\Straight-Wrap-172_submissions.zst" 21 | # put the path to the output file, with the csv extension 22 | output_file_path = r"\\MYCLOUDPR4100\Public\Straight-Wrap-172_submissions.csv" 23 | # if you want a custom set of fields, put them in the following list. If you leave it empty the script will use a default set of fields 24 | fields = [] 25 | 26 | log = logging.getLogger("bot") 27 | log.setLevel(logging.DEBUG) 28 | log.addHandler(logging.StreamHandler()) 29 | 30 | 31 | def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0): 32 | chunk = reader.read(chunk_size) 33 | bytes_read += chunk_size 34 | if previous_chunk is not None: 35 | chunk = previous_chunk + chunk 36 | try: 37 | return chunk.decode() 38 | except UnicodeDecodeError: 39 | if bytes_read > max_window_size: 40 | raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes") 41 | return read_and_decode(reader, chunk_size, max_window_size, chunk, bytes_read) 42 | 43 | 44 | def read_lines_zst(file_name): 45 | with open(file_name, 'rb') as file_handle: 46 | buffer = '' 47 | reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle) 48 | while True: 49 | chunk = read_and_decode(reader, 2**27, (2**29) * 2) 50 | if not chunk: 51 | break 52 | lines = (buffer + chunk).split("\n") 53 | 54 | for line in lines[:-1]: 55 | yield line, file_handle.tell() 56 | 57 | buffer = lines[-1] 58 | reader.close() 59 | 60 | 61 | if __name__ == "__main__": 62 | if len(sys.argv) >= 3: 63 | input_file_path = sys.argv[1] 64 | output_file_path = sys.argv[2] 65 | fields = sys.argv[3].split(",") 66 | 67 | is_submission = "submission" in input_file_path 68 | if not len(fields): 69 | if is_submission: 70 | fields = ["author","title","score","created","link","text","url"] 71 | else: 72 | fields = ["author","score","created","link","body"] 73 | 74 | file_size = os.stat(input_file_path).st_size 75 | file_lines, bad_lines = 0, 0 76 | line, created = None, None 77 | output_file = open(output_file_path, "w", encoding='utf-8', newline="") 78 | writer = csv.writer(output_file) 79 | writer.writerow(fields) 80 | try: 81 | for line, file_bytes_processed in read_lines_zst(input_file_path): 82 | try: 83 | obj = json.loads(line) 84 | output_obj = [] 85 | for field in fields: 86 | if field == "created": 87 | value = datetime.fromtimestamp(int(obj['created_utc'])).strftime("%Y-%m-%d %H:%M") 88 | elif field == "link": 89 | if 'permalink' in obj: 90 | value = f"https://www.reddit.com{obj['permalink']}" 91 | else: 92 | value = f"https://www.reddit.com/r/{obj['subreddit']}/comments/{obj['link_id'][3:]}/_/{obj['id']}/" 93 | elif field == 
"author": 94 | value = f"u/{obj['author']}" 95 | elif field == "text": 96 | if 'selftext' in obj: 97 | value = obj['selftext'] 98 | else: 99 | value = "" 100 | else: 101 | value = obj[field] 102 | 103 | output_obj.append(str(value).encode("utf-8", errors='replace').decode()) 104 | writer.writerow(output_obj) 105 | 106 | created = datetime.utcfromtimestamp(int(obj['created_utc'])) 107 | except json.JSONDecodeError as err: 108 | bad_lines += 1 109 | file_lines += 1 110 | if file_lines % 100000 == 0: 111 | log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines:,} : {bad_lines:,} : {(file_bytes_processed / file_size) * 100:.0f}%") 112 | except KeyError as err: 113 | log.info(f"Object has no key: {err}") 114 | log.info(line) 115 | except Exception as err: 116 | log.info(err) 117 | log.info(line) 118 | 119 | output_file.close() 120 | log.info(f"Complete : {file_lines:,} : {bad_lines:,}") 121 | 122 | --------------------------------------------------------------------------------