├── .gitignore
├── CITATION.cff
├── LICENSE.md
├── Pipfile
├── Pipfile.lock
├── README.md
├── personal
│   ├── combine
│   │   ├── build_month.py
│   │   ├── classes.py
│   │   ├── merge.py
│   │   ├── merge_and_backfill.py
│   │   └── merge_minutes.py
│   ├── compression
│   │   ├── extract_file.py
│   │   ├── recompress_file.py
│   │   ├── recompress_folder.py
│   │   └── recompress_folder_multiprocess.py
│   ├── diagnostic
│   │   ├── comments_per_day.py
│   │   ├── comments_per_day_with_score.py
│   │   ├── compare_lines.py
│   │   ├── count_fields.py
│   │   ├── count_subreddits_multiprocess.py
│   │   ├── get_zst_details.py
│   │   ├── sum_subreddit_counts.py
│   │   ├── test_file.py
│   │   └── test_files_multiprocess.py
│   ├── mongo
│   │   ├── export_mongo.py
│   │   ├── group_subs.py
│   │   └── insert_mongo.py
│   ├── move
│   │   ├── copy_listed_files.py
│   │   ├── move_files.py
│   │   └── rename_files.py
│   ├── opt_in_quarantined.py
│   ├── process_month.py
│   ├── transform
│   │   ├── split_blocks_by_minutes.py
│   │   ├── split_by_minutes.py
│   │   └── split_by_subreddit.py
│   ├── utils.py
│   └── zst_blocks.py
└── scripts
    ├── combine_folder_multiprocess.py
    ├── count_words_single_file.py
    ├── filter_file.py
    ├── find_overlapping_users.py
    ├── ignored.txt
    ├── iterate_folder.py
    ├── single_file.py
    └── to_csv.py
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/*
2 | logs/*
3 | __pycache__/*
4 | *.db
5 | *.ini
6 | *.txt
--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
1 | # This CITATION.cff file was generated with cffinit.
2 | # Visit https://bit.ly/cffinit to generate yours today!
3 | 
4 | cff-version: 1.2.0
5 | title: Pushshift dump utils
6 | message: >-
7 |   If you use this software, please cite it using the
8 |   metadata from this file.
9 | type: software
10 | authors:
11 |   - given-names: Watchful1
12 | repository-code: 'https://github.com/Watchful1/PushshiftDumps'
13 | abstract: >-
14 |   Tools to help parse reddit data from zstandard compressed
15 |   ndjson files from the pushshift archives
16 | license: MIT
17 | 
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2023 Watchful1
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | zstandard = "*" 8 | discord-logging = {editable = true, git = "https://github.com/Watchful1/DiscordLogging.git"} 9 | requests = "*" 10 | pymongo = {extras = ["srv"], version = "*"} 11 | scipy = "*" 12 | sortedcontainers = "*" 13 | praw = "*" 14 | multiprocessing-logging = "*" 15 | 16 | [dev-packages] 17 | 18 | [requires] 19 | python_version = "3.9" 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This repo contains example python scripts for processing the reddit dump files created by pushshift. The files can be downloaded from [here](https://files.pushshift.io/reddit/) or torrented from [here](https://academictorrents.com/details/f37bb9c0abe350f0f1cbd4577d0fe413ed07724e). 2 | 3 | * `single_file.py` decompresses and iterates over a single zst compressed file 4 | * `iterate_folder.py` does the same, but for all files in a folder 5 | * `combine_folder_multiprocess.py` uses separate processes to iterate over multiple files in parallel, writing lines that match the criteria passed in to text files, then combining them into a final zst compressed file -------------------------------------------------------------------------------- /personal/combine/build_month.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import requests 3 | import time 4 | import discord_logging 5 | import argparse 6 | import os 7 | import re 8 | import zstandard 9 | from datetime import datetime, timedelta 10 | import json 11 | import praw 12 | from praw import endpoints 13 | import prawcore 14 | import logging.handlers 15 | 16 | sys.path.append('personal') 17 | 18 | log = discord_logging.get_logger(init=True) 19 | 20 | import utils 21 | import classes 22 | from classes import IngestType 23 | from merge import ObjectType 24 | 25 | 26 | NEWLINE_ENCODED = "\n".encode('utf-8') 27 | reg = re.compile(r"\d\d-\d\d-\d\d_\d\d-\d\d") 28 | 29 | 30 | def build_month(month, input_folder, output_folder, file_type, compression_level): 31 | if file_type == "comments": 32 | prefix = "RC" 33 | elif file_type == "submissions": 34 | prefix = "RS" 35 | else: 36 | log.error(f"Invalid type: {args.type}") 37 | sys.exit(2) 38 | 39 | total_objects = 0 40 | total_bytes = 0 41 | minute_iterator = month 42 | if month.month == 12: 43 | end_time = month.replace(year=month.year + 1, month=1) 44 | else: 45 | end_time = month.replace(month=month.month + 1) 46 | while minute_iterator < end_time: 47 | minute_file_path = os.path.join(input_folder, file_type, minute_iterator.strftime('%y-%m-%d'), f"{prefix}_{minute_iterator.strftime('%y-%m-%d_%H-%M')}.zst") 48 | for obj, line, _ in utils.read_obj_zst_meta(minute_file_path): 49 | total_bytes += len(line.encode('utf-8')) 50 | total_bytes += 1 51 | 52 | total_objects += 1 53 | if total_objects % 1000000 == 0: 54 | log.info(f"{file_type}: Counting: {minute_iterator.strftime('%y-%m-%d_%H-%M')} : {total_objects:,} : {total_bytes:,}") 55 | 56 | minute_iterator += timedelta(minutes=1) 57 | 58 | log.info(f"{file_type}: Counting: {minute_iterator.strftime('%y-%m-%d_%H-%M')} : {total_objects:,} : {total_bytes:,}") 59 | 60 | output_path = 
os.path.join(output_folder, file_type, f"{prefix}_{month.strftime('%Y-%m')}.zst") 61 | output_handle = zstandard.ZstdCompressor(level=compression_level, write_content_size=True, write_checksum=True, threads=-1).stream_writer(open(output_path, 'wb'), size=total_bytes) 62 | 63 | count_objects = 0 64 | count_bytes = 0 65 | minute_iterator = month 66 | if month.month == 12: 67 | end_time = month.replace(year=month.year + 1, month=1) 68 | else: 69 | end_time = month.replace(month=month.month + 1) 70 | while minute_iterator < end_time: 71 | minute_file_path = os.path.join(input_folder, file_type, minute_iterator.strftime('%y-%m-%d'), f"{prefix}_{minute_iterator.strftime('%y-%m-%d_%H-%M')}.zst") 72 | for obj, line, _ in utils.read_obj_zst_meta(minute_file_path): 73 | line_encoded = line.encode('utf-8') 74 | count_bytes += len(line_encoded) 75 | count_bytes += 1 76 | output_handle.write(line_encoded) 77 | output_handle.write(NEWLINE_ENCODED) 78 | 79 | count_objects += 1 80 | if count_objects % 100000 == 0: 81 | log.info(f"{file_type}: Writing: {minute_iterator.strftime('%y-%m-%d_%H-%M')} : {count_objects:,}/{total_objects:,} : {count_bytes:,}/{total_bytes:,}") 82 | 83 | minute_iterator += timedelta(minutes=1) 84 | 85 | log.info(f"{file_type}: Writing: {minute_iterator.strftime('%y-%m-%d_%H-%M')} : {count_objects:,}/{total_objects:,} : {count_bytes:,}/{total_bytes:,}") 86 | output_handle.close() 87 | 88 | 89 | if __name__ == "__main__": 90 | parser = argparse.ArgumentParser(description="Combine the minute files into a single month") 91 | parser.add_argument("--type", help="The object type, either comments or submissions", required=True) 92 | parser.add_argument("--month", help="The month to process, format YY-MM", required=True) 93 | parser.add_argument('--input', help='Input folder', required=True) 94 | parser.add_argument('--output', help='Output folder', required=True) 95 | parser.add_argument("--debug", help="Enable debug logging", action='store_const', const=True, default=False) 96 | parser.add_argument("--level", help="The compression ratio to output at", default="3") 97 | args = parser.parse_args() 98 | 99 | if args.debug: 100 | discord_logging.set_level(logging.DEBUG) 101 | 102 | month = datetime.strptime(args.month, '%y-%m') 103 | level = int(args.level) 104 | 105 | log.info(f"Input folder: {args.input}") 106 | log.info(f"Output folder: {args.output}") 107 | log.info(f"Month: {args.month}") 108 | log.info(f"Compression level: {level}") 109 | 110 | build_month( 111 | month, 112 | args.input, 113 | args.output, 114 | args.type, 115 | level 116 | ) 117 | -------------------------------------------------------------------------------- /personal/combine/classes.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import os 3 | import discord_logging 4 | import sys 5 | import zstandard 6 | import json 7 | from enum import Enum 8 | from sortedcontainers import SortedList 9 | from collections import defaultdict 10 | 11 | log = discord_logging.get_logger() 12 | 13 | import utils 14 | import merge 15 | 16 | NEWLINE_ENCODED = "\n".encode('utf-8') 17 | 18 | 19 | class ApiRequest: 20 | def __init__(self, ids, is_submission, ingest_name, estimated_datetime=None, missing_expected=False): 21 | self.ids = ids 22 | self.is_submission = is_submission 23 | self.ingest_name = ingest_name 24 | self.estimated_datetime = estimated_datetime 25 | self.missing_expected = missing_expected 26 | self.results = None 27 | self.complete = False 28 | 
self.tries = 0 29 | self.prev_lengths = [] 30 | 31 | def should_retry(self): 32 | if self.complete: 33 | return False # the request is complete, no need to retry 34 | if len(self.prev_lengths) <= 1: 35 | return True # we've only made one attempt and it didn't work, do retry 36 | if self.prev_lengths[-1] == 0: 37 | if len(self.prev_lengths) < (10 if self.missing_expected else 100): 38 | return True # the most recent result was 0 objects, retry up to 100 times 39 | else: 40 | log.info(f"Force finished request with retries: {self}") 41 | self.complete = True 42 | return False 43 | if self.prev_lengths[-1] == self.prev_lengths[-2]: 44 | if self.missing_expected: 45 | self.complete = True 46 | return False # the latest two requests were the same and we're expecting missing objects, mark as complete 47 | elif len(self.prev_lengths) >= 4 and \ 48 | self.prev_lengths[-1] == self.prev_lengths[-3] and \ 49 | self.prev_lengths[-1] == self.prev_lengths[-4]: 50 | log.info(f"Force finished request with retries: {self}") 51 | self.complete = True 52 | return False # the latest four requests were the same, go ahead and mark as complete 53 | return True # recent requests didn't match, and weren't 0, go ahead and retry 54 | 55 | def get_body_key(self): 56 | return "self_text" if self.is_submission else "body" 57 | 58 | def get_string_type(self): 59 | return "submission" if self.is_submission else "comment" 60 | 61 | def get_prefix(self): 62 | return "t3_" if self.is_submission else "t1_" 63 | 64 | def set_results(self, results): 65 | self.prev_lengths.append(len(results)) 66 | self.results = [] 67 | current_timestamp = int(datetime.utcnow().timestamp()) 68 | for result in results: 69 | obj = result['data'] 70 | if 'body_html' in obj: 71 | del obj['body_html'] 72 | if 'selftext_html' in obj: 73 | del obj['selftext_html'] 74 | obj['retrieved_on'] = current_timestamp 75 | self.results.append(obj) 76 | log.debug(f"Set result: {self}") 77 | 78 | def id_string(self): 79 | return f"{self.get_prefix()}{(f',{self.get_prefix()}'.join(self.ids))}" 80 | 81 | def __str__(self): 82 | return \ 83 | f"{self.ingest_name}: {self.ids[0]}-{self.ids[-1]} {self.get_string_type()}: " \ 84 | f"{len(self.results) if self.results else self.results} : {self.tries} : " \ 85 | f"{self.complete} : {','.join([str(val) for val in self.prev_lengths])}" 86 | 87 | def __gt__(self, other): 88 | if isinstance(other, ApiRequest): 89 | return False 90 | return True 91 | 92 | def __lt__(self, other): 93 | if isinstance(other, ApiRequest): 94 | return True 95 | return False 96 | 97 | def __eq__(self, other): 98 | if isinstance(other, ApiRequest): 99 | return True 100 | return False 101 | 102 | 103 | class Queue: 104 | def __init__(self, max_size): 105 | self.list = [] 106 | self.max_size = max_size 107 | 108 | def put(self, item): 109 | if len(self.list) >= self.max_size: 110 | self.list.pop(0) 111 | self.list.append(item) 112 | 113 | def peek(self): 114 | return self.list[0] if len(self.list) > 0 else None 115 | 116 | 117 | class OutputHandle: 118 | def __init__(self, is_submission, dump_folder): 119 | self.handle = None 120 | self.current_path = None 121 | self.current_minute = None 122 | self.is_submission = is_submission 123 | self.dump_folder = dump_folder 124 | 125 | if not os.path.exists(dump_folder): 126 | os.makedirs(dump_folder) 127 | 128 | def matched_minute(self, new_date_time): 129 | return self.current_minute is not None and new_date_time.minute == self.current_minute 130 | 131 | def get_path(self, date_folder, export_filename, 
increment=None): 132 | folder = f"{self.dump_folder}{os.path.sep}{date_folder}" 133 | if not os.path.exists(folder): 134 | os.makedirs(folder) 135 | 136 | bldr = [folder] 137 | bldr.append(os.path.sep) 138 | if self.is_submission: 139 | bldr.append("RS_") 140 | else: 141 | bldr.append("RC_") 142 | bldr.append(export_filename) 143 | if increment is not None: 144 | bldr.append("_") 145 | bldr.append(str(increment)) 146 | bldr.append(".zst") 147 | 148 | return ''.join(bldr) 149 | 150 | def rollover_to_minute(self, date_time): 151 | if self.handle is not None: 152 | self.handle.close() 153 | os.rename(self.current_path + ".tmp", self.current_path) 154 | date_folder = date_time.strftime('%y-%m-%d') 155 | export_filename = date_time.strftime('%y-%m-%d_%H-%M') 156 | export_path = self.get_path(date_folder, export_filename) 157 | if os.path.exists(export_path + ".tmp"): 158 | os.rename(export_path + ".tmp", export_path) 159 | i = 0 160 | while os.path.exists(export_path): 161 | log.info(f"Dump exists, incrementing: {export_path}") 162 | i += 1 163 | export_path = self.get_path(date_folder, export_filename, i) 164 | if i > 100: 165 | log.warning(f"Something went wrong, more than 100 dumps for minute, aborting") 166 | sys.exit(3) 167 | self.current_path = export_path 168 | self.handle = zstandard.ZstdCompressor().stream_writer(open(export_path + ".tmp", 'wb')) 169 | self.current_minute = date_time.minute 170 | 171 | def write_object(self, obj): 172 | self.handle.write(json.dumps(obj, sort_keys=True).encode('utf-8')) 173 | self.handle.write(NEWLINE_ENCODED) 174 | 175 | def flush(self): 176 | self.handle.flush() 177 | 178 | def close(self): 179 | if self.handle is not None: 180 | self.handle.close() 181 | 182 | 183 | class IngestType(Enum): 184 | INGEST = 1 185 | RESCAN = 2 186 | DOWNLOAD = 3 187 | PUSHSHIFT = 4 188 | BACKFILL = 5 189 | MISSING = 6 190 | 191 | 192 | class ObjectDict: 193 | def __init__(self, min_datetime, max_datetime, obj_type): 194 | self.min_datetime = min_datetime 195 | self.max_datetime = max_datetime 196 | self.obj_type = obj_type 197 | 198 | self.counts = defaultdict(lambda: defaultdict(lambda: defaultdict(int))) 199 | self.min_id = None 200 | self.max_id = None 201 | 202 | self.by_id = {} 203 | self.by_minute = defaultdict(ObjectMinuteList) 204 | 205 | def contains_id(self, str_id): 206 | return str_id in self.by_id 207 | 208 | def delete_object_id(self, str_id): 209 | del self.by_id[str_id] 210 | 211 | def delete_objects_below_minute(self, delete_below_minute): 212 | for minute, minute_list in self.by_minute.items(): 213 | if minute < delete_below_minute: 214 | for obj in minute_list.obj_list: 215 | self.delete_object_id(obj['id']) 216 | 217 | def rebuild_minute_dict(self): 218 | self.by_minute = defaultdict(ObjectMinuteList) 219 | for obj in self.by_id.values(): 220 | created_minute = datetime.utcfromtimestamp(obj["created_utc"]).replace(second=0, microsecond=0) 221 | self.by_minute[created_minute].add(obj) 222 | 223 | def count_minutes(self): 224 | return len(self.by_minute) 225 | 226 | @staticmethod 227 | def get_counts_string_from_dict(counts_dict, ingest_types): 228 | bldr = [] 229 | for ingest_type in ingest_types: 230 | if ingest_type in counts_dict: 231 | bldr.append(f"{counts_dict[ingest_type][True]}({counts_dict[ingest_type][False]})") 232 | else: 233 | bldr.append("0(0)") 234 | return "|".join(bldr) 235 | 236 | def get_counts_string_by_minute(self, minute, ingest_types): 237 | count_string = ObjectDict.get_counts_string_from_dict(self.counts[minute], ingest_types) 
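		# count_string holds one "added(duplicate)" pair per ingest type, joined with "|":
		# the True bucket in self.counts tracks objects seen for the first time in this minute,
		# the False bucket tracks re-ingested copies that were merged into an existing object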
238 | minute_dict = self.by_minute.get(minute) 239 | if minute_dict is None or minute_dict.max_id is None or minute_dict.min_id is None: 240 | range_string = "" 241 | else: 242 | range_string = f" - {len(minute_dict.obj_list)}({minute_dict.max_id - minute_dict.min_id}) ({utils.base36encode(minute_dict.min_id)}-{utils.base36encode(minute_dict.max_id)})" 243 | return count_string + range_string 244 | 245 | def get_counts_string(self): 246 | sum_dict = defaultdict(lambda: defaultdict(int)) 247 | for counts_dict in self.counts.values(): 248 | for ingest_type in IngestType: 249 | if ingest_type in counts_dict: 250 | sum_dict[ingest_type][True] += counts_dict[ingest_type][True] 251 | sum_dict[ingest_type][False] += counts_dict[ingest_type][False] 252 | 253 | return ObjectDict.get_counts_string_from_dict(sum_dict, IngestType) 254 | 255 | def get_missing_ids_by_minutes(self, start_minute, end_minute, ignore_ids): 256 | start_id = self.by_minute[start_minute].min_id 257 | end_id = self.by_minute[end_minute].max_id 258 | missing_ids = [] 259 | count_ignored_ids = 0 260 | for int_id in range(start_id, end_id + 1): 261 | ignored = False 262 | for ignore_start, ignore_end in ignore_ids: 263 | if ignore_start <= int_id <= ignore_end: 264 | count_ignored_ids += 1 265 | ignored = True 266 | break 267 | if ignored: 268 | continue 269 | 270 | string_id = utils.base36encode(int_id) 271 | if not self.contains_id(string_id): 272 | missing_ids.append(string_id) 273 | if count_ignored_ids > 0: 274 | log.warning(f"Ignored {count_ignored_ids} ids in range {utils.base36encode(start_id)}-{utils.base36encode(end_id)}") 275 | return missing_ids, start_id, end_id 276 | 277 | def add_object(self, obj, ingest_type): 278 | created_utc = datetime.utcfromtimestamp(obj["created_utc"]) 279 | created_minute = created_utc.replace(second=0, microsecond=0) 280 | if obj['id'] in self.by_id: 281 | existing_obj = self.by_id[obj['id']] 282 | unmatched_field = merge.merge_fields(existing_obj, obj, self.obj_type) 283 | self.counts[created_minute][ingest_type][False] += 1 284 | return unmatched_field 285 | if created_utc < self.min_datetime or created_utc > self.max_datetime: 286 | return False 287 | unmatched_field = merge.parse_fields(obj, self.obj_type) 288 | self.by_id[obj['id']] = obj 289 | self.by_minute[created_minute].add(obj) 290 | self.counts[created_minute][ingest_type][True] += 1 291 | self.min_id, self.max_id = utils.merge_lowest_highest_id(obj['id'], self.min_id, self.max_id) 292 | return unmatched_field 293 | 294 | def add_missing_object(self, obj_id): 295 | if obj_id in self.by_id: 296 | return 297 | int_id = utils.base36decode(obj_id) 298 | for minute, minute_dict in self.by_minute.items(): 299 | if minute_dict.min_id is None: 300 | continue 301 | if minute_dict.min_id < int_id < minute_dict.max_id: 302 | self.counts[minute][IngestType.MISSING][True] += 1 303 | return 304 | 305 | 306 | class ObjectMinuteList: 307 | def __init__(self): 308 | self.obj_list = SortedList(key=lambda x: f"{x['created_utc']}:{x['id']}") 309 | self.min_id = None 310 | self.max_id = None 311 | 312 | def add(self, obj): 313 | self.min_id, self.max_id = utils.merge_lowest_highest_id(obj['id'], self.min_id, self.max_id) 314 | self.obj_list.add(obj) 315 | -------------------------------------------------------------------------------- /personal/combine/merge.py: -------------------------------------------------------------------------------- 1 | import re 2 | import sys 3 | from enum import Enum 4 | from datetime import datetime 5 | import 
discord_logging 6 | 7 | log = discord_logging.get_logger() 8 | 9 | 10 | class FieldAction(Enum): 11 | OVERWRITE = 1 12 | OVERWRITE_NOT_NONE = 2 13 | OVERWRITE_IF_NONE = 3 14 | DONT_OVERWRITE = 4 15 | DELETE = 5 16 | SPECIAL = 6 17 | SPECIAL_NO_OVERWRITE = 7 18 | ALLOW = 8 19 | ALLOW_EMPTY = 9 20 | 21 | 22 | class ObjectType(Enum): 23 | COMMENT = 1 24 | SUBMISSION = 2 25 | 26 | 27 | field_actions = { 28 | ObjectType.COMMENT: { 29 | "_meta": FieldAction.OVERWRITE, 30 | "all_awardings": FieldAction.OVERWRITE_NOT_NONE, 31 | "approved": FieldAction.DELETE, 32 | "approved_at_utc": FieldAction.SPECIAL_NO_OVERWRITE, 33 | "approved_by": FieldAction.SPECIAL_NO_OVERWRITE, 34 | "archived": FieldAction.OVERWRITE, 35 | "associated_award": FieldAction.ALLOW_EMPTY, 36 | "author": FieldAction.OVERWRITE_IF_NONE, 37 | "author_cakeday": FieldAction.DONT_OVERWRITE, 38 | "author_flair_background_color": FieldAction.OVERWRITE_IF_NONE, 39 | "author_flair_css_class": FieldAction.OVERWRITE_IF_NONE, 40 | "author_flair_richtext": FieldAction.OVERWRITE_IF_NONE, 41 | "author_flair_template_id": FieldAction.OVERWRITE_IF_NONE, 42 | "author_flair_text": FieldAction.OVERWRITE_IF_NONE, 43 | "author_flair_text_color": FieldAction.OVERWRITE_IF_NONE, 44 | "author_flair_type": FieldAction.OVERWRITE_IF_NONE, 45 | "author_fullname": FieldAction.OVERWRITE_IF_NONE, 46 | "author_is_blocked": FieldAction.SPECIAL_NO_OVERWRITE, 47 | "author_patreon_flair": FieldAction.OVERWRITE, 48 | "author_premium": FieldAction.OVERWRITE, 49 | "awarders": FieldAction.OVERWRITE_IF_NONE, 50 | "ban_note": FieldAction.DELETE, 51 | "banned_at_utc": FieldAction.SPECIAL_NO_OVERWRITE, 52 | "banned_by": FieldAction.SPECIAL_NO_OVERWRITE, 53 | "body": FieldAction.SPECIAL, 54 | "body_html": FieldAction.DELETE, 55 | "body_sha1": FieldAction.OVERWRITE_NOT_NONE, 56 | "can_gild": FieldAction.OVERWRITE, 57 | "can_mod_post": FieldAction.SPECIAL_NO_OVERWRITE, 58 | "collapsed": FieldAction.OVERWRITE, 59 | "collapsed_because_crowd_control": FieldAction.ALLOW_EMPTY, 60 | "collapsed_reason": FieldAction.OVERWRITE, 61 | "collapsed_reason_code": FieldAction.OVERWRITE, 62 | "comment_type": FieldAction.OVERWRITE_NOT_NONE, 63 | "controversiality": FieldAction.OVERWRITE, 64 | "created": FieldAction.OVERWRITE_IF_NONE, 65 | "created_utc": FieldAction.OVERWRITE_IF_NONE, 66 | "distinguished": FieldAction.OVERWRITE, 67 | "downs": FieldAction.OVERWRITE_IF_NONE, 68 | "editable": FieldAction.OVERWRITE, 69 | "edited": FieldAction.OVERWRITE_NOT_NONE, 70 | "edited_on": FieldAction.ALLOW, 71 | "expression_asset_data": FieldAction.OVERWRITE_NOT_NONE, 72 | "gilded": FieldAction.OVERWRITE_NOT_NONE, 73 | "gildings": FieldAction.OVERWRITE_NOT_NONE, 74 | "id": FieldAction.ALLOW, 75 | "ignore_reports": FieldAction.DELETE, 76 | "is_submitter": FieldAction.DONT_OVERWRITE, 77 | "likes": FieldAction.OVERWRITE_NOT_NONE, 78 | "link_id": FieldAction.ALLOW, 79 | "locked": FieldAction.OVERWRITE, 80 | "media_metadata": FieldAction.OVERWRITE, 81 | "mod_note": FieldAction.ALLOW_EMPTY, 82 | "mod_reason_by": FieldAction.SPECIAL_NO_OVERWRITE, 83 | "mod_reason_title": FieldAction.SPECIAL_NO_OVERWRITE, 84 | "mod_reports": FieldAction.SPECIAL_NO_OVERWRITE, 85 | "mod_reports_dismissed": FieldAction.SPECIAL_NO_OVERWRITE, 86 | "name": FieldAction.OVERWRITE_IF_NONE, 87 | "nest_level": FieldAction.OVERWRITE_NOT_NONE, 88 | "no_follow": FieldAction.OVERWRITE, 89 | "num_reports": FieldAction.SPECIAL_NO_OVERWRITE, 90 | "parent_id": FieldAction.OVERWRITE_IF_NONE, 91 | "permalink": FieldAction.DONT_OVERWRITE, 92 | 
"removal_reason": FieldAction.SPECIAL, 93 | "removed": FieldAction.DELETE, 94 | "replies": FieldAction.OVERWRITE_IF_NONE, 95 | "report_reasons": FieldAction.SPECIAL_NO_OVERWRITE, 96 | "retrieved_on": FieldAction.SPECIAL, 97 | "retrieved_utc": FieldAction.SPECIAL, 98 | "rte_mode": FieldAction.OVERWRITE_NOT_NONE, 99 | "saved": FieldAction.SPECIAL_NO_OVERWRITE, 100 | "score": FieldAction.SPECIAL, 101 | "score_hidden": FieldAction.OVERWRITE, 102 | "send_replies": FieldAction.OVERWRITE, 103 | "spam": FieldAction.DELETE, 104 | "stickied": FieldAction.OVERWRITE, 105 | "subreddit": FieldAction.OVERWRITE_NOT_NONE, 106 | "subreddit_id": FieldAction.OVERWRITE_NOT_NONE, 107 | "subreddit_name_prefixed": FieldAction.OVERWRITE_NOT_NONE, 108 | "subreddit_type": FieldAction.DONT_OVERWRITE, 109 | "top_awarded_type": FieldAction.ALLOW_EMPTY, 110 | "total_awards_received": FieldAction.OVERWRITE_NOT_NONE, 111 | "treatment_tags": FieldAction.OVERWRITE_NOT_NONE, 112 | "unrepliable_reason": FieldAction.OVERWRITE_NOT_NONE, 113 | "ups": FieldAction.OVERWRITE_NOT_NONE, 114 | "user_reports": FieldAction.SPECIAL_NO_OVERWRITE, 115 | "user_reports_dismissed": FieldAction.SPECIAL_NO_OVERWRITE, 116 | "updated_on": FieldAction.SPECIAL, 117 | "updated_utc": FieldAction.SPECIAL, 118 | "utc_datetime_str": FieldAction.DELETE, 119 | }, 120 | ObjectType.SUBMISSION: { 121 | "_meta": FieldAction.OVERWRITE, 122 | "ad_business": FieldAction.OVERWRITE_NOT_NONE, 123 | "ad_promoted_user_posts": FieldAction.OVERWRITE_NOT_NONE, 124 | "ad_supplementary_text_md": FieldAction.OVERWRITE_NOT_NONE, 125 | "ad_user_targeting": FieldAction.OVERWRITE_NOT_NONE, 126 | "adserver_click_url": FieldAction.ALLOW_EMPTY, 127 | "adserver_imp_pixel": FieldAction.ALLOW_EMPTY, 128 | "all_awardings": FieldAction.OVERWRITE_NOT_NONE, 129 | "allow_live_comments": FieldAction.OVERWRITE, 130 | "app_store_data": FieldAction.ALLOW_EMPTY, 131 | "approved": FieldAction.DELETE, 132 | "approved_at_utc": FieldAction.SPECIAL_NO_OVERWRITE, 133 | "approved_by": FieldAction.SPECIAL_NO_OVERWRITE, 134 | "archived": FieldAction.ALLOW_EMPTY, 135 | "author": FieldAction.OVERWRITE_IF_NONE, 136 | "author_cakeday": FieldAction.DONT_OVERWRITE, 137 | "author_flair_background_color": FieldAction.OVERWRITE_NOT_NONE, 138 | "author_flair_css_class": FieldAction.OVERWRITE_NOT_NONE, 139 | "author_flair_richtext": FieldAction.OVERWRITE_NOT_NONE, 140 | "author_flair_template_id": FieldAction.OVERWRITE_NOT_NONE, 141 | "author_flair_text": FieldAction.OVERWRITE_NOT_NONE, 142 | "author_flair_text_color": FieldAction.OVERWRITE_NOT_NONE, 143 | "author_flair_type": FieldAction.OVERWRITE_NOT_NONE, 144 | "author_fullname": FieldAction.OVERWRITE_NOT_NONE, 145 | "author_id": FieldAction.OVERWRITE_NOT_NONE, 146 | "author_is_blocked": FieldAction.SPECIAL_NO_OVERWRITE, 147 | "author_patreon_flair": FieldAction.OVERWRITE, 148 | "author_premium": FieldAction.OVERWRITE, 149 | "awarders": FieldAction.ALLOW_EMPTY, 150 | "ban_note": FieldAction.DELETE, 151 | "banned_at_utc": FieldAction.SPECIAL_NO_OVERWRITE, 152 | "banned_by": FieldAction.SPECIAL_NO_OVERWRITE, 153 | "call_to_action": FieldAction.OVERWRITE, 154 | "campaign_id": FieldAction.ALLOW_EMPTY, 155 | "can_gild": FieldAction.OVERWRITE, 156 | "can_mod_post": FieldAction.SPECIAL_NO_OVERWRITE, 157 | "category": FieldAction.OVERWRITE_NOT_NONE, 158 | "clicked": FieldAction.SPECIAL_NO_OVERWRITE, 159 | "collections": FieldAction.OVERWRITE_NOT_NONE, 160 | "content_categories": FieldAction.ALLOW, 161 | "contest_mode": FieldAction.OVERWRITE, 162 | "created": 
FieldAction.OVERWRITE_IF_NONE, 163 | "created_utc": FieldAction.OVERWRITE_IF_NONE, 164 | "crosspost_parent": FieldAction.OVERWRITE_NOT_NONE, 165 | "crosspost_parent_list": FieldAction.OVERWRITE_NOT_NONE, 166 | "discussion_type": FieldAction.OVERWRITE_NOT_NONE, 167 | "distinguished": FieldAction.OVERWRITE, 168 | "domain": FieldAction.OVERWRITE_NOT_NONE, 169 | "domain_override": FieldAction.OVERWRITE_NOT_NONE, 170 | "downs": FieldAction.SPECIAL_NO_OVERWRITE, 171 | "edited": FieldAction.OVERWRITE, 172 | "edited_on": FieldAction.ALLOW, 173 | "embed_type": FieldAction.OVERWRITE_NOT_NONE, 174 | "embed_url": FieldAction.OVERWRITE_NOT_NONE, 175 | "event_end": FieldAction.OVERWRITE_NOT_NONE, 176 | "event_is_live": FieldAction.OVERWRITE_NOT_NONE, 177 | "event_start": FieldAction.OVERWRITE_NOT_NONE, 178 | "events": FieldAction.ALLOW_EMPTY, 179 | "eventsOnRender": FieldAction.ALLOW_EMPTY, 180 | "gallery_data": FieldAction.OVERWRITE_NOT_NONE, 181 | "gilded": FieldAction.OVERWRITE_NOT_NONE, 182 | "gildings": FieldAction.OVERWRITE_NOT_NONE, 183 | "hidden": FieldAction.SPECIAL_NO_OVERWRITE, 184 | "hide_score": FieldAction.OVERWRITE, 185 | "href_url": FieldAction.DONT_OVERWRITE, 186 | "id": FieldAction.ALLOW, 187 | "ignore_reports": FieldAction.DELETE, 188 | "impression_id": FieldAction.ALLOW_EMPTY, 189 | "impression_id_str": FieldAction.ALLOW_EMPTY, 190 | "is_blank": FieldAction.ALLOW_EMPTY, 191 | "is_created_from_ads_ui": FieldAction.OVERWRITE_NOT_NONE, 192 | "is_crosspostable": FieldAction.OVERWRITE, 193 | "is_gallery": FieldAction.OVERWRITE_NOT_NONE, 194 | "is_meta": FieldAction.OVERWRITE, 195 | "is_original_content": FieldAction.OVERWRITE, 196 | "is_reddit_media_domain": FieldAction.OVERWRITE, 197 | "is_robot_indexable": FieldAction.OVERWRITE, 198 | "is_self": FieldAction.DONT_OVERWRITE, 199 | "is_survey_ad": FieldAction.ALLOW_EMPTY, 200 | "is_video": FieldAction.OVERWRITE, 201 | "likes": FieldAction.OVERWRITE_NOT_NONE, 202 | "link_flair_background_color": FieldAction.OVERWRITE_NOT_NONE, 203 | "link_flair_css_class": FieldAction.OVERWRITE_NOT_NONE, 204 | "link_flair_richtext": FieldAction.OVERWRITE_NOT_NONE, 205 | "link_flair_template_id": FieldAction.OVERWRITE_NOT_NONE, 206 | "link_flair_text": FieldAction.OVERWRITE_NOT_NONE, 207 | "link_flair_text_color": FieldAction.OVERWRITE_NOT_NONE, 208 | "link_flair_type": FieldAction.OVERWRITE_NOT_NONE, 209 | "locked": FieldAction.OVERWRITE, 210 | "media": FieldAction.OVERWRITE_NOT_NONE, 211 | "media_embed": FieldAction.OVERWRITE_NOT_NONE, 212 | "media_metadata": FieldAction.OVERWRITE_NOT_NONE, 213 | "media_only": FieldAction.OVERWRITE, 214 | "mobile_ad_url": FieldAction.OVERWRITE_NOT_NONE, 215 | "mod_note": FieldAction.SPECIAL_NO_OVERWRITE, 216 | "mod_reason_by": FieldAction.SPECIAL_NO_OVERWRITE, 217 | "mod_reason_title": FieldAction.SPECIAL_NO_OVERWRITE, 218 | "mod_reports": FieldAction.SPECIAL_NO_OVERWRITE, 219 | "mod_reports_dismissed": FieldAction.SPECIAL_NO_OVERWRITE, 220 | "name": FieldAction.OVERWRITE_IF_NONE, 221 | "no_follow": FieldAction.OVERWRITE, 222 | "num_comments": FieldAction.OVERWRITE_NOT_NONE, 223 | "num_crossposts": FieldAction.OVERWRITE, 224 | "num_reports": FieldAction.SPECIAL_NO_OVERWRITE, 225 | "original_link": FieldAction.ALLOW_EMPTY, 226 | "outbound_link": FieldAction.ALLOW_EMPTY, 227 | "over_18": FieldAction.OVERWRITE, 228 | "parent_whitelist_status": FieldAction.OVERWRITE, 229 | "permalink": FieldAction.DONT_OVERWRITE, 230 | "pinned": FieldAction.ALLOW_EMPTY, 231 | "poll_data": FieldAction.OVERWRITE_NOT_NONE, 232 | "post_hint": 
FieldAction.OVERWRITE, 233 | "preview": FieldAction.OVERWRITE_NOT_NONE, 234 | "previous_selftext": FieldAction.ALLOW, 235 | "priority_id": FieldAction.ALLOW_EMPTY, 236 | "product_ids": FieldAction.ALLOW_EMPTY, 237 | "promo_layout": FieldAction.OVERWRITE, 238 | "promoted": FieldAction.ALLOW_EMPTY, 239 | "promoted_by": FieldAction.ALLOW_EMPTY, 240 | "promoted_display_name": FieldAction.ALLOW_EMPTY, 241 | "promoted_url": FieldAction.ALLOW_EMPTY, 242 | "pwls": FieldAction.OVERWRITE, 243 | "quarantine": FieldAction.DONT_OVERWRITE, 244 | "removal_reason": FieldAction.SPECIAL, 245 | "removed": FieldAction.DELETE, 246 | "removed_by": FieldAction.SPECIAL_NO_OVERWRITE, 247 | "removed_by_category": FieldAction.OVERWRITE, 248 | "report_reasons": FieldAction.SPECIAL_NO_OVERWRITE, 249 | "retrieved_on": FieldAction.SPECIAL, 250 | "retrieved_utc": FieldAction.SPECIAL, 251 | "rte_mode": FieldAction.OVERWRITE_NOT_NONE, 252 | "saved": FieldAction.SPECIAL_NO_OVERWRITE, 253 | "score": FieldAction.SPECIAL, 254 | "secure_media": FieldAction.OVERWRITE_NOT_NONE, 255 | "secure_media_embed": FieldAction.OVERWRITE_NOT_NONE, 256 | "selftext": FieldAction.SPECIAL, 257 | "selftext_html": FieldAction.DELETE, 258 | "send_replies": FieldAction.OVERWRITE, 259 | "show_media": FieldAction.ALLOW, 260 | "sk_ad_network_data": FieldAction.ALLOW_EMPTY, 261 | "spam": FieldAction.DELETE, 262 | "spoiler": FieldAction.OVERWRITE, 263 | "stickied": FieldAction.OVERWRITE, 264 | "subcaption": FieldAction.OVERWRITE, 265 | "subreddit": FieldAction.OVERWRITE_NOT_NONE, 266 | "subreddit_id": FieldAction.OVERWRITE_NOT_NONE, 267 | "subreddit_name_prefixed": FieldAction.OVERWRITE_NOT_NONE, 268 | "subreddit_subscribers": FieldAction.OVERWRITE_IF_NONE, 269 | "subreddit_type": FieldAction.DONT_OVERWRITE, 270 | "suggested_sort": FieldAction.OVERWRITE, 271 | "third_party_trackers": FieldAction.ALLOW_EMPTY, 272 | "third_party_tracking": FieldAction.ALLOW_EMPTY, 273 | "third_party_tracking_2": FieldAction.ALLOW_EMPTY, 274 | "thumbnail": FieldAction.OVERWRITE_NOT_NONE, 275 | "thumbnail_height": FieldAction.OVERWRITE_NOT_NONE, 276 | "thumbnail_width": FieldAction.OVERWRITE_NOT_NONE, 277 | "title": FieldAction.DONT_OVERWRITE, 278 | "top_awarded_type": FieldAction.OVERWRITE, 279 | "total_awards_received": FieldAction.OVERWRITE_NOT_NONE, 280 | "treatment_tags": FieldAction.OVERWRITE_NOT_NONE, 281 | "tournament_data": FieldAction.OVERWRITE_NOT_NONE, 282 | "unrepliable_reason": FieldAction.OVERWRITE_NOT_NONE, 283 | "updated_on": FieldAction.SPECIAL, 284 | "updated_utc": FieldAction.SPECIAL, 285 | "ups": FieldAction.OVERWRITE_NOT_NONE, 286 | "upvote_ratio": FieldAction.OVERWRITE, 287 | "url": FieldAction.OVERWRITE_NOT_NONE, 288 | "url_overridden_by_dest": FieldAction.OVERWRITE_NOT_NONE, 289 | "user_reports": FieldAction.SPECIAL_NO_OVERWRITE, 290 | "user_reports_dismissed": FieldAction.SPECIAL_NO_OVERWRITE, 291 | "utc_datetime_str": FieldAction.DELETE, 292 | "view_count": FieldAction.ALLOW_EMPTY, 293 | "visited": FieldAction.SPECIAL_NO_OVERWRITE, 294 | "whitelist_status": FieldAction.OVERWRITE, 295 | "wls": FieldAction.OVERWRITE, 296 | }, 297 | } 298 | 299 | 300 | def is_empty(value): 301 | return value is None \ 302 | or value == "" \ 303 | or value == "[deleted]" \ 304 | or value == "[removed]" \ 305 | or value == [] \ 306 | or value == {} \ 307 | or value is False \ 308 | or value == 0 309 | 310 | 311 | def replace(match): 312 | if match.group(0) == "amp;": return "" 313 | if match.group(0) == "<": return "<" 314 | if match.group(0) == ">": return ">" 315 | 
log.warning(f"Unknown group: {match}") 316 | sys.exit(2) 317 | 318 | 319 | unencode_regex = re.compile(r"amp;|<|>") 320 | 321 | 322 | def merge_fields(existing_obj, new_obj, obj_type): 323 | unmatched_field = False 324 | type_actions = field_actions[obj_type] 325 | for key, new_value in new_obj.items(): 326 | action = type_actions.get(key) 327 | 328 | original_value = existing_obj.get(key) 329 | if new_value != original_value: 330 | # if isinstance(new_value, str) and unencode_regex.search(new_value): 331 | # new_value_no_encode = unencode_regex.sub(replace, new_value) 332 | # if new_value_no_encode == original_value: 333 | # continue 334 | if action == FieldAction.OVERWRITE: 335 | existing_obj[key] = new_value 336 | elif action == FieldAction.OVERWRITE_NOT_NONE: 337 | if not is_empty(new_value): 338 | existing_obj[key] = new_value 339 | elif action == FieldAction.OVERWRITE_IF_NONE: 340 | if is_empty(original_value): 341 | existing_obj[key] = new_value 342 | elif action == FieldAction.SPECIAL: 343 | if key == "body": 344 | if not is_empty(new_value): 345 | if 'previous_body' in existing_obj: 346 | existing_obj['previous_body'] = original_value 347 | existing_obj['body'] = new_value 348 | elif key == "score": 349 | if not is_empty(new_value): 350 | if is_empty(original_value) or abs(new_value) > abs(original_value): 351 | existing_obj['score'] = new_value 352 | elif key == "selftext": 353 | if not is_empty(new_value): 354 | if 'previous_selftext' not in existing_obj: 355 | existing_obj['previous_selftext'] = original_value 356 | existing_obj['selftext'] = new_value 357 | elif key == "removal_reason" and new_value in ["legal", None]: 358 | existing_obj[key] = new_value 359 | elif key in ["retrieved_on", "retrieved_utc"]: 360 | prev_retrieved_on = existing_obj["retrieved_on"] 361 | if new_value < prev_retrieved_on: 362 | existing_obj["retrieved_on"] = new_value 363 | existing_obj["updated_on"] = prev_retrieved_on 364 | if new_value > prev_retrieved_on: 365 | existing_obj["updated_on"] = new_value 366 | elif key in ["updated_on", "updated_utc"]: 367 | if new_value > existing_obj["updated_on"]: 368 | existing_obj["updated_on"] = new_value 369 | else: 370 | log.info(f"{new_obj['id']} unmatched special: {key}: {original_value} != {new_value}") 371 | unmatched_field = True 372 | elif action == FieldAction.DELETE or action == FieldAction.DONT_OVERWRITE or action == FieldAction.SPECIAL_NO_OVERWRITE: 373 | pass 374 | else: 375 | log.info(f"{new_obj['id']} unmatched no action: {key}|{action}: {original_value} != {new_value}") 376 | unmatched_field = True 377 | elif action is None: 378 | log.info(f"{new_obj['id']} matched no action: {key}: {new_value}") 379 | unmatched_field = True 380 | 381 | return unmatched_field 382 | 383 | 384 | def parse_fields(new_obj, obj_type): 385 | keys_to_delete = [] 386 | keys_to_add = [] 387 | unmatched_field = False 388 | type_actions = field_actions[obj_type] 389 | for key, new_value in new_obj.items(): 390 | action = type_actions.get(key) 391 | if action is not None: 392 | if action == FieldAction.DELETE: 393 | keys_to_delete.append(key) 394 | elif action == FieldAction.ALLOW_EMPTY: 395 | if not is_empty(new_value): 396 | log.info(f"{new_obj['id']} not empty: {key}: {new_value}") 397 | unmatched_field = True 398 | keys_to_delete.append(key) 399 | elif action == FieldAction.SPECIAL: 400 | if key in ["retrieved_on", "body", "selftext", "updated_on", "score"]: 401 | pass 402 | elif key == "removal_reason" and new_value in ["legal", None]: 403 | pass 404 | elif key == 
"retrieved_utc": 405 | keys_to_add.append(("retrieved_on", new_value)) 406 | keys_to_delete.append(key) 407 | elif key == "updated_utc": 408 | keys_to_add.append(("updated_on", new_value)) 409 | keys_to_delete.append(key) 410 | else: 411 | log.info(f"{new_obj['id']} special no match: {key}: {new_value}") 412 | unmatched_field = True 413 | keys_to_delete.append(key) 414 | elif action == FieldAction.SPECIAL_NO_OVERWRITE: 415 | if key in ["can_mod_post", "saved", "clicked", "visited", "author_is_blocked", "hidden"]: 416 | new_obj[key] = False 417 | elif key in ["banned_at_utc", "banned_by", "approved_at_utc", "approved_by", "user_reports_dismissed", "mod_reports_dismissed", "removed_by", "mod_note", "mod_reason_by", "mod_reason_title"]: 418 | new_obj[key] = None 419 | elif key in ["num_reports", "downs"]: 420 | new_obj[key] = 0 421 | elif key in ["report_reasons", "user_reports", "mod_reports"]: 422 | new_obj[key] = [] 423 | else: 424 | log.info(f"{new_obj['id']} special no overwrite no match: {key}: {new_value}") 425 | unmatched_field = True 426 | keys_to_delete.append(key) 427 | else: 428 | log.info(f"{new_obj['id']} no action: {key}: {new_value}") 429 | unmatched_field = True 430 | 431 | for key in keys_to_delete: 432 | del new_obj[key] 433 | 434 | for key, value in keys_to_add: 435 | new_obj[key] = value 436 | 437 | if 'retrieved_on' not in new_obj: 438 | new_obj['retrieved_on'] = int(datetime.utcnow().timestamp()) 439 | 440 | return unmatched_field 441 | -------------------------------------------------------------------------------- /personal/combine/merge_and_backfill.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import requests 3 | import time 4 | import discord_logging 5 | import argparse 6 | import os 7 | import re 8 | import zstandard 9 | from datetime import datetime, timedelta 10 | import json 11 | import praw 12 | from praw import endpoints 13 | import prawcore 14 | import logging.handlers 15 | 16 | sys.path.append('personal') 17 | 18 | log = discord_logging.get_logger(init=True) 19 | 20 | import utils 21 | import classes 22 | from classes import IngestType 23 | from merge import ObjectType 24 | 25 | 26 | NEWLINE_ENCODED = "\n".encode('utf-8') 27 | reg = re.compile(r"\d\d-\d\d-\d\d_\d\d-\d\d") 28 | 29 | 30 | def get_pushshift_token(old_token): 31 | saved_token = load_pushshift_token() 32 | if saved_token is None or old_token == saved_token: 33 | log.info(f"Requesting new token") 34 | result_token = re_auth_pushshift(old_token) 35 | save_pushshift_token(result_token) 36 | else: 37 | result_token = saved_token 38 | 39 | return result_token 40 | 41 | 42 | def save_pushshift_token(token): 43 | with open("pushshift.txt", 'w') as file: 44 | file.write(token) 45 | 46 | 47 | def load_pushshift_token(): 48 | with open("pushshift.txt", 'r') as file: 49 | token = file.read().strip() 50 | return token 51 | 52 | 53 | def re_auth_pushshift(old_token): 54 | url = f"https://auth.pushshift.io/refresh?access_token={old_token}" 55 | log.warning(f"Reauth request: {url}") 56 | response = requests.post(url) 57 | result = response.json() 58 | log.warning(f"Reauth response: {str(result)}") 59 | discord_logging.flush_discord() 60 | if 'access_token' in result: 61 | new_token = result['access_token'] 62 | log.warning(f"New pushshift token: {new_token}") 63 | save_pushshift_token(new_token) 64 | discord_logging.flush_discord() 65 | return new_token 66 | elif 'detail' in result: 67 | if result['detail'] == 'Access token is still active and can not 
be refreshed.': 68 | log.warning(f"Access token still active, trying request again") 69 | time.sleep(5) 70 | return old_token 71 | 72 | log.warning(f"Reauth failed: {result['detail']}") 73 | discord_logging.flush_discord() 74 | sys.exit(1) 75 | else: 76 | log.warning(f"Something went wrong re-authing") 77 | discord_logging.flush_discord() 78 | sys.exit(1) 79 | 80 | 81 | def query_pushshift(ids, bearer, object_type, pushshift_token_function): 82 | object_name = "comment" if object_type == ObjectType.COMMENT else "submission" 83 | url = f"https://api.pushshift.io/reddit/{object_name}/search?limit=1000&ids={','.join(ids)}" 84 | log.debug(f"pushshift query: {url}") 85 | response = None 86 | total_attempts = 100 87 | current_attempt = 0 88 | sleep_per_attempt = 10 89 | for current_attempt in range(total_attempts): 90 | try: 91 | response = requests.get(url, headers={ 92 | 'User-Agent': "In script by /u/Watchful1", 93 | 'Authorization': f"Bearer {bearer}"}, timeout=20) 94 | except (requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout) as err: 95 | log.info(f"Pushshift failed, sleeping {current_attempt * sleep_per_attempt} : {err}") 96 | time.sleep(current_attempt * sleep_per_attempt) 97 | continue 98 | if response is None: 99 | log.info(f"Pushshift failed, sleeping {current_attempt * sleep_per_attempt} : no response") 100 | time.sleep(current_attempt * sleep_per_attempt) 101 | continue 102 | if response.status_code == 200: 103 | break 104 | if response.status_code == 403: 105 | log.warning(f"Pushshift 403, trying reauth: {response.json()}") 106 | log.warning(url) 107 | log.warning(f"'Authorization': Bearer {bearer}") 108 | bearer = pushshift_token_function(bearer) 109 | log.info(f"Pushshift failed, sleeping {current_attempt * sleep_per_attempt} : status {response.status_code}") 110 | time.sleep(current_attempt * sleep_per_attempt) 111 | if response is None: 112 | log.warning(f"{current_attempt + 1} requests failed with no response") 113 | log.warning(url) 114 | log.warning(f"'Authorization': Bearer {bearer}") 115 | discord_logging.flush_discord() 116 | sys.exit(1) 117 | if response.status_code != 200: 118 | log.warning(f"{current_attempt + 1} requests failed with status code {response.status_code}") 119 | log.warning(url) 120 | log.warning(f"'Authorization': Bearer {bearer}") 121 | discord_logging.flush_discord() 122 | sys.exit(1) 123 | if current_attempt > 0: 124 | log.info(f"Pushshift call succeeded after {current_attempt + 1} retries") 125 | return response.json()['data'], bearer 126 | 127 | 128 | def query_reddit(ids, reddit, object_type): 129 | id_prefix = 't1_' if object_type == ObjectType.COMMENT else 't3_' 130 | id_string = f"{id_prefix}{(f',{id_prefix}'.join(ids))}" 131 | response = None 132 | for i in range(20): 133 | try: 134 | response = reddit.request(method="GET", path=endpoints.API_PATH["info"], params={"id": id_string}) 135 | break 136 | except (prawcore.exceptions.ServerError, prawcore.exceptions.RequestException) as err: 137 | log.info(f"No response from reddit api for {object_type}, sleeping {i * 5} seconds: {err} : {id_string}") 138 | time.sleep(i * 5) 139 | if response is None: 140 | log.warning(f"Reddit api failed, aborting") 141 | return [] 142 | return response['data']['children'] 143 | 144 | 145 | def end_of_day(input_minute): 146 | return input_minute.replace(hour=0, minute=0, second=0) + timedelta(days=1) 147 | 148 | 149 | def build_day(day_to_process, input_folders, output_folder, object_type, reddit, ignore_ids, pushshift_token_function): 150 | 
file_type = "comments" if object_type == ObjectType.COMMENT else "submissions" 151 | 152 | pushshift_token = pushshift_token_function(None) 153 | log.info(f"{file_type}: Using pushshift token: {pushshift_token}") 154 | 155 | file_minutes = {} 156 | minute_iterator = day_to_process - timedelta(minutes=2) 157 | end_time = end_of_day(day_to_process) + timedelta(minutes=2) 158 | while minute_iterator <= end_time: 159 | file_minutes[minute_iterator] = [] 160 | minute_iterator += timedelta(minutes=1) 161 | 162 | for merge_folder, ingest_type in input_folders: 163 | merge_date_folder = os.path.join(merge_folder, file_type, day_to_process.strftime('%y-%m-%d')) 164 | if os.path.exists(merge_date_folder): 165 | for file in os.listdir(merge_date_folder): 166 | match = reg.search(file) 167 | if not match: 168 | log.info(f"{file_type}: File doesn't match regex: {file}") 169 | continue 170 | file_date = datetime.strptime(match.group(), '%y-%m-%d_%H-%M') 171 | if file_date in file_minutes: 172 | file_minutes[file_date].append((os.path.join(merge_date_folder, file), ingest_type)) 173 | 174 | objects = classes.ObjectDict(day_to_process, day_to_process + timedelta(days=1) - timedelta(seconds=1), object_type) 175 | unmatched_field = False 176 | minute_iterator = day_to_process - timedelta(minutes=2) 177 | working_lowest_minute = day_to_process 178 | last_minute_of_day = end_of_day(day_to_process) - timedelta(minutes=1) 179 | while minute_iterator <= end_time: 180 | for ingest_file, ingest_type in file_minutes[minute_iterator]: 181 | for obj in utils.read_obj_zst(ingest_file): 182 | if objects.add_object(obj, ingest_type): 183 | unmatched_field = True 184 | log.info(f"{file_type}: Loaded {minute_iterator.strftime('%y-%m-%d_%H-%M')} : {objects.get_counts_string_by_minute(minute_iterator, [IngestType.INGEST, IngestType.RESCAN, IngestType.DOWNLOAD])}") 185 | 186 | if minute_iterator >= end_time or objects.count_minutes() >= 11: 187 | if minute_iterator > last_minute_of_day: 188 | working_highest_minute = last_minute_of_day 189 | else: 190 | working_highest_minute = minute_iterator - timedelta(minutes=1) 191 | missing_ids, start_id, end_id = objects.get_missing_ids_by_minutes(working_lowest_minute, working_highest_minute, ignore_ids) 192 | log.debug( 193 | f"{file_type}: Backfilling from: {working_lowest_minute.strftime('%y-%m-%d_%H-%M')} ({utils.base36encode(start_id)}|{start_id}) to " 194 | f"{working_highest_minute.strftime('%y-%m-%d_%H-%M')} ({utils.base36encode(end_id)}|{end_id}) with {len(missing_ids)} ({end_id - start_id}) ids") 195 | 196 | for chunk in utils.chunk_list(missing_ids, 50): 197 | pushshift_objects, pushshift_token = query_pushshift(chunk, pushshift_token, object_type, pushshift_token_function) 198 | for pushshift_object in pushshift_objects: 199 | if objects.add_object(pushshift_object, IngestType.PUSHSHIFT): 200 | unmatched_field = True 201 | 202 | for chunk in utils.chunk_list(missing_ids, 100): 203 | reddit_objects = query_reddit(chunk, reddit, object_type) 204 | for reddit_object in reddit_objects: 205 | if objects.add_object(reddit_object['data'], IngestType.BACKFILL): 206 | unmatched_field = True 207 | 208 | for missing_id in missing_ids: 209 | if missing_id not in objects.by_id: 210 | objects.add_missing_object(missing_id) 211 | 212 | objects.delete_objects_below_minute(working_lowest_minute) 213 | while working_lowest_minute <= working_highest_minute: 214 | folder = os.path.join(output_folder, file_type, working_lowest_minute.strftime('%y-%m-%d')) 215 | if not os.path.exists(folder): 
216 | os.makedirs(folder) 217 | output_path = os.path.join(folder, f"{('RC' if object_type == ObjectType.COMMENT else 'RS')}_{working_lowest_minute.strftime('%y-%m-%d_%H-%M')}.zst") 218 | output_handle = zstandard.ZstdCompressor().stream_writer(open(output_path, 'wb')) 219 | 220 | for obj in objects.by_minute[working_lowest_minute].obj_list: 221 | output_handle.write(json.dumps(obj, sort_keys=True).encode('utf-8')) 222 | output_handle.write(NEWLINE_ENCODED) 223 | objects.delete_object_id(obj['id']) 224 | log.info( 225 | f"{file_type}: Wrote up to {working_lowest_minute.strftime('%y-%m-%d_%H-%M')} : " 226 | f"{objects.get_counts_string_by_minute(working_lowest_minute, [IngestType.PUSHSHIFT, IngestType.BACKFILL, IngestType.MISSING])}") 227 | output_handle.close() 228 | working_lowest_minute += timedelta(minutes=1) 229 | 230 | objects.rebuild_minute_dict() 231 | 232 | discord_logging.flush_discord() 233 | if unmatched_field: 234 | log.warning(f"{file_type}: Unmatched field, aborting") 235 | discord_logging.flush_discord() 236 | sys.exit(1) 237 | 238 | minute_iterator += timedelta(minutes=1) 239 | 240 | log.info(f"{file_type}: Finished day {day_to_process.strftime('%y-%m-%d')}: {objects.get_counts_string()}") 241 | 242 | 243 | def merge_and_backfill(start_date, end_date, input_folders, output_folder, object_type, ignore_ids, reddit_username, pushshift_token_function): 244 | reddit = praw.Reddit(reddit_username) 245 | while start_date <= end_date: 246 | build_day(start_date, input_folders, output_folder, object_type, reddit, ignore_ids, pushshift_token_function) 247 | start_date = end_of_day(start_date) 248 | 249 | 250 | if __name__ == "__main__": 251 | parser = argparse.ArgumentParser(description="Combine the ingest and rescan files, clean and do pushshift lookups as needed") 252 | parser.add_argument("--type", help="The object type, either comments or submissions", required=True) 253 | parser.add_argument("--start_date", help="The start of the date range to process, format YY-MM-DD_HH-MM", required=True) 254 | parser.add_argument("--end_date", help="The end of the date range to process, format YY-MM-DD. 
If not provided, the script processes to the end of the day") 255 | parser.add_argument('--input', help='Input folder', required=True) 256 | parser.add_argument('--output', help='Output folder', required=True) 257 | parser.add_argument('--pushshift', help='The pushshift token') 258 | parser.add_argument("--debug", help="Enable debug logging", action='store_const', const=True, default=False) 259 | parser.add_argument("--ignore_ids", help="Ignore ids between the id ranges listed", default=None) 260 | args = parser.parse_args() 261 | 262 | if args.debug: 263 | discord_logging.set_level(logging.DEBUG) 264 | 265 | input_folders = [ 266 | (os.path.join(args.input, "ingest"), IngestType.INGEST), 267 | (os.path.join(args.input, "rescan"), IngestType.RESCAN), 268 | (os.path.join(args.input, "download"), IngestType.DOWNLOAD), 269 | ] 270 | 271 | if args.start_date is None: 272 | log.error(f"No start date provided") 273 | sys.exit(2) 274 | start_date = datetime.strptime(args.start_date, '%y-%m-%d_%H-%M') 275 | end_date = end_of_day(start_date) 276 | if args.end_date is not None: 277 | end_date = datetime.strptime(args.end_date, '%y-%m-%d') 278 | 279 | for input_folder, ingest_type in input_folders: 280 | log.info(f"Input folder: {input_folder}") 281 | log.info(f"Output folder: {args.output}") 282 | 283 | object_type = None 284 | if args.type == "comments": 285 | object_type = ObjectType.COMMENT 286 | elif args.type == "submissions": 287 | object_type = ObjectType.SUBMISSION 288 | else: 289 | log.error(f"Invalid type: {args.type}") 290 | sys.exit(2) 291 | 292 | ignore_ids = [] 293 | if args.ignore_ids is not None: 294 | for id_range in args.ignore_ids.split(","): 295 | start_id, end_id = id_range.split("-") 296 | ignore_ids.append((utils.base36decode(start_id), utils.base36decode(end_id))) 297 | 298 | discord_logging.init_discord_logging( 299 | section_name="Watchful12", 300 | log_level=logging.WARNING 301 | ) 302 | 303 | if args.pushshift is not None: 304 | log.warning(f"Saving pushshift token: {args.pushshift}") 305 | save_pushshift_token(args.pushshift) 306 | 307 | merge_and_backfill( 308 | start_date, 309 | end_date, 310 | input_folders, 311 | args.output, 312 | object_type, 313 | ignore_ids, 314 | "Watchful12", 315 | get_pushshift_token 316 | ) 317 | -------------------------------------------------------------------------------- /personal/combine/merge_minutes.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import requests 3 | import time 4 | import discord_logging 5 | import argparse 6 | import os 7 | import re 8 | import zstandard 9 | from datetime import datetime, timedelta 10 | import json 11 | import praw 12 | from praw import endpoints 13 | import prawcore 14 | import logging.handlers 15 | 16 | sys.path.append('personal') 17 | 18 | log = discord_logging.init_logging(debug=False) 19 | 20 | import utils 21 | import classes 22 | from classes import IngestType 23 | from merge import ObjectType 24 | 25 | 26 | NEWLINE_ENCODED = "\n".encode('utf-8') 27 | reg = re.compile(r"\d\d-\d\d-\d\d_\d\d-\d\d") 28 | 29 | 30 | def end_of_day(input_minute): 31 | return input_minute.replace(hour=0, minute=0, second=0) + timedelta(days=1) 32 | 33 | 34 | def build_day(day_to_process, input_folders, output_folder, object_type): 35 | file_type = "comments" if object_type == ObjectType.COMMENT else "submissions" 36 | 37 | file_minutes = {} 38 | minute_iterator = day_to_process - timedelta(minutes=2) 39 | end_time = end_of_day(day_to_process) + 
timedelta(minutes=2)
40 | 	while minute_iterator <= end_time:
41 | 		file_minutes[minute_iterator] = []
42 | 		minute_iterator += timedelta(minutes=1)
43 | 
44 | 	for merge_folder, ingest_type in input_folders:
45 | 		merge_date_folder = os.path.join(merge_folder, file_type, day_to_process.strftime('%y-%m-%d'))
46 | 		if os.path.exists(merge_date_folder):
47 | 			for file in os.listdir(merge_date_folder):
48 | 				match = reg.search(file)
49 | 				if not match:
50 | 					log.info(f"File doesn't match regex: {file}")
51 | 					continue
52 | 				file_date = datetime.strptime(match.group(), '%y-%m-%d_%H-%M')
53 | 				if file_date in file_minutes:
54 | 					file_minutes[file_date].append((os.path.join(merge_date_folder, file), ingest_type))
55 | 
56 | 	objects = classes.ObjectDict(day_to_process, day_to_process + timedelta(days=1) - timedelta(seconds=1), object_type)
57 | 	unmatched_field = False
58 | 	minute_iterator = day_to_process - timedelta(minutes=2)
59 | 	working_lowest_minute = day_to_process
60 | 	last_minute_of_day = end_of_day(day_to_process) - timedelta(minutes=1)
61 | 	while minute_iterator <= end_time:
62 | 		for ingest_file, ingest_type in file_minutes[minute_iterator]:
63 | 			for obj in utils.read_obj_zst(ingest_file):
64 | 				if objects.add_object(obj, ingest_type):
65 | 					unmatched_field = True
66 | 		log.info(f"Loaded {minute_iterator.strftime('%y-%m-%d_%H-%M')} : {objects.get_counts_string_by_minute(minute_iterator, [IngestType.INGEST, IngestType.DOWNLOAD])}")
67 | 
68 | 		if minute_iterator >= end_time or objects.count_minutes() >= 11:
69 | 			if minute_iterator > last_minute_of_day:
70 | 				working_highest_minute = last_minute_of_day
71 | 			else:
72 | 				working_highest_minute = minute_iterator - timedelta(minutes=1)
73 | 
74 | 			objects.delete_objects_below_minute(working_lowest_minute)
75 | 			while working_lowest_minute <= working_highest_minute:
76 | 				folder = os.path.join(output_folder, file_type, working_lowest_minute.strftime('%y-%m-%d'))
77 | 				if not os.path.exists(folder):
78 | 					os.makedirs(folder)
79 | 				output_path = os.path.join(folder, f"{('RC' if object_type == ObjectType.COMMENT else 'RS')}_{working_lowest_minute.strftime('%y-%m-%d_%H-%M')}.zst")
80 | 				output_handle = zstandard.ZstdCompressor().stream_writer(open(output_path, 'wb'))
81 | 
82 | 				for obj in objects.by_minute[working_lowest_minute].obj_list:
83 | 					output_handle.write(json.dumps(obj, sort_keys=True).encode('utf-8'))
84 | 					output_handle.write(NEWLINE_ENCODED)
85 | 					objects.delete_object_id(obj['id'])
86 | 				log.info(f"Wrote up to {working_lowest_minute.strftime('%y-%m-%d_%H-%M')}")
87 | 				output_handle.close()
88 | 				working_lowest_minute += timedelta(minutes=1)
89 | 
90 | 			objects.rebuild_minute_dict()
91 | 
92 | 		discord_logging.flush_discord()
93 | 		if unmatched_field:
94 | 			log.info(f"Unmatched field, aborting")
95 | 			sys.exit(1)
96 | 
97 | 		minute_iterator += timedelta(minutes=1)
98 | 
99 | 	log.info(f"Finished day {day_to_process.strftime('%y-%m-%d')}: {objects.get_counts_string()}")
100 | 
101 | 
102 | if __name__ == "__main__":
103 | 	parser = argparse.ArgumentParser(description="Combine two ingest files")
104 | 	parser.add_argument("--type", help="The object type, either comments or submissions", required=True)
105 | 	parser.add_argument("--start_date", help="The start of the date range to process, format YY-MM-DD_HH-MM", required=True)
106 | 	parser.add_argument("--end_date", help="The end of the date range to process, format YY-MM-DD. 
If not provided, the script processes to the end of the day") 107 | parser.add_argument('--input', help='Input folder', required=True) 108 | parser.add_argument('--output', help='Output folder', required=True) 109 | parser.add_argument("--debug", help="Enable debug logging", action='store_const', const=True, default=False) 110 | args = parser.parse_args() 111 | 112 | if args.debug: 113 | discord_logging.set_level(logging.DEBUG) 114 | 115 | if args.start_date is None: 116 | log.error(f"No start date provided") 117 | sys.exit(2) 118 | start_date = datetime.strptime(args.start_date, '%y-%m-%d_%H-%M') 119 | end_date = end_of_day(start_date) 120 | if args.end_date is not None: 121 | end_date = datetime.strptime(args.end_date, '%y-%m-%d') 122 | 123 | input_folders = [ 124 | (os.path.join(args.input, "combined"), IngestType.INGEST), 125 | (os.path.join(args.input, "download"), IngestType.DOWNLOAD), 126 | ] 127 | 128 | for input_folder, ingest_type in input_folders: 129 | log.info(f"Input folder: {input_folder}") 130 | log.info(f"Output folder: {args.output}") 131 | 132 | object_type = None 133 | if args.type == "comments": 134 | object_type = ObjectType.COMMENT 135 | elif args.type == "submissions": 136 | object_type = ObjectType.SUBMISSION 137 | else: 138 | log.error(f"Invalid type: {args.type}") 139 | sys.exit(2) 140 | 141 | while start_date <= end_date: 142 | build_day(start_date, input_folders, args.output, object_type) 143 | start_date = end_of_day(start_date) 144 | -------------------------------------------------------------------------------- /personal/compression/extract_file.py: -------------------------------------------------------------------------------- 1 | import utils 2 | import discord_logging 3 | import os 4 | import sys 5 | from datetime import datetime 6 | 7 | log = discord_logging.init_logging() 8 | 9 | 10 | if __name__ == "__main__": 11 | input_file_path = r"\\MYCLOUDPR4100\Public\reddit_final\curiousdrive_submissions.zst" 12 | output_file_path = r"\\MYCLOUDPR4100\Public\reddit_final\curiousdrive_submissions.txt" 13 | file_size = os.stat(input_file_path).st_size 14 | 15 | file_lines = 0 16 | file_bytes_processed = 0 17 | created = None 18 | inserts = [] 19 | output_file = open(output_file_path, 'w') 20 | for obj, line, file_bytes_processed in utils.read_obj_zst_meta(input_file_path): 21 | created = datetime.utcfromtimestamp(int(obj['created_utc'])) 22 | file_lines += 1 23 | output_file.write(line) 24 | output_file.write("\n") 25 | if file_lines % 100000 == 0: 26 | log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines:,} : {(file_bytes_processed / file_size) * 100:.0f}%") 27 | 28 | log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines:,} : 100%") 29 | output_file.close() 30 | 31 | -------------------------------------------------------------------------------- /personal/compression/recompress_file.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import zstandard 3 | import discord_logging 4 | import time 5 | import sys 6 | 7 | sys.path.append('personal') 8 | 9 | log = discord_logging.init_logging() 10 | 11 | import utils 12 | 13 | if __name__ == '__main__': 14 | parser = argparse.ArgumentParser(description="Take all the zst files in the input folder, extract them and compress them again at the ratio specified") 15 | parser.add_argument("input", help="The input file") 16 | parser.add_argument("output", help="The output file") 17 | parser.add_argument("--level", help="The compression ratio to 
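# Hedged usage sketch for merge_minutes.py above; the paths and dates are placeholders.
# The script expects "combined" and "download" subfolders under --input, as wired up in
# its __main__ block, and writes one zst file per minute under <output>/<type>/<yy-mm-dd>/:
#
#   python3 personal/combine/merge_minutes.py --type comments \
#       --start_date 23-06-01_00-00 --end_date 23-06-30 \
#       --input /data/ingest --output /data/merged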
output at", default="3") 18 | args = parser.parse_args() 19 | 20 | log.info(f"Input file {args.input}") 21 | log.info(f"Output file {args.output}") 22 | 23 | total_objects = 0 24 | total_bytes = 0 25 | for obj, line, _ in utils.read_obj_zst_meta(args.input): 26 | total_bytes += len(line.encode('utf-8')) 27 | total_bytes += 1 28 | 29 | total_objects += 1 30 | if total_objects % 1000000 == 0: 31 | log.info(f"{total_objects:,} : {total_bytes:,}") 32 | 33 | log.info(f"{total_objects:,} : {total_bytes:,}") 34 | 35 | for threads in range(-1, 21): 36 | decompressor = zstandard.ZstdDecompressor(max_window_size=2**31) 37 | compressor = zstandard.ZstdCompressor(level=22, write_content_size=True, write_checksum=True, threads=threads) 38 | start_time = time.time() 39 | with open(args.input, 'rb') as input_handle, open(args.output, "wb") as output_handle: 40 | compression_reader = decompressor.stream_reader(input_handle) 41 | read_count, write_count = compressor.copy_stream(compression_reader, output_handle, size=total_bytes) 42 | seconds = time.time() - start_time 43 | 44 | log.info(f"{read_count:,} to {write_count:,} in {seconds:,.2f} with {threads} threads") 45 | 46 | # compressed_bytes_read += file_size 47 | # uncompressed_bytes_read += read_count 48 | # bytes_written += write_count 49 | # log.info(f"{files_read:,}/{len(files):,} : {(compressed_bytes_read / (2**30)):.2f} gb of {(total_size / (2**30)):.2f} gb compressed to {(bytes_written / (2**30)):.2f} gb : {bytes_written /compressed_bytes_read:.3f}") 50 | -------------------------------------------------------------------------------- /personal/compression/recompress_folder.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import zstandard 3 | import os 4 | import logging.handlers 5 | 6 | log = logging.getLogger("bot") 7 | log.setLevel(logging.INFO) 8 | log_formatter = logging.Formatter('%(asctime)s - %(levelname)s: %(message)s') 9 | log_str_handler = logging.StreamHandler() 10 | log_str_handler.setFormatter(log_formatter) 11 | log.addHandler(log_str_handler) 12 | if not os.path.exists("logs"): 13 | os.makedirs("logs") 14 | log_file_handler = logging.handlers.RotatingFileHandler(os.path.join("logs", "bot.log"), maxBytes=1024*1024*16, backupCount=5) 15 | log_file_handler.setFormatter(log_formatter) 16 | log.addHandler(log_file_handler) 17 | 18 | if __name__ == '__main__': 19 | parser = argparse.ArgumentParser(description="Take all the zst files in the input folder, extract them and compress them again at the ratio specified") 20 | parser.add_argument("input", help="The input folder to read files from") 21 | parser.add_argument("output", help="The output folder to write files to") 22 | parser.add_argument("--level", help="The compression ratio to output at", default="3") 23 | args = parser.parse_args() 24 | 25 | log.info(f"Reading all files from {args.input}") 26 | 27 | files = [] 28 | total_size = 0 29 | for file_name in os.listdir(args.input): 30 | file_path = os.path.join(args.input, file_name) 31 | if file_name.endswith(".zst") and os.path.isfile(file_path): 32 | file_size = os.stat(file_path).st_size 33 | total_size += file_size 34 | files.append((file_name, file_size)) 35 | if len(files) % 1000 == 0: 36 | log.info(f"Loaded {len(files)} files") 37 | log.info(f"Loaded {len(files)} files of total size {total_size:,}") 38 | 39 | level = int(args.level) 40 | log.info(f"Writing files out to {args.output} at ratio {level}") 41 | if not os.path.exists(args.output): 42 | 
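# Hedged usage sketch for recompress_file.py above (file paths are placeholders).
# The script streams the input once to measure its uncompressed size, then re-compresses
# it at level 22 for every thread count from -1 to 20, logging the time taken for each:
#
#   python3 personal/compression/recompress_file.py RC_2023-06.zst RC_2023-06_level22.zst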
os.makedirs(args.output) 43 | 44 | compressed_bytes_read = 0 45 | uncompressed_bytes_read = 0 46 | bytes_written = 0 47 | files_read = 0 48 | 49 | decompressor = zstandard.ZstdDecompressor(max_window_size=2**31) 50 | compressor = zstandard.ZstdCompressor(level=level, threads=-1) 51 | for file_name, file_size in files: 52 | input_path = os.path.join(args.input, file_name) 53 | output_path = os.path.join(args.output, file_name) 54 | with open(input_path, 'rb') as input_handle, open(output_path, "wb") as output_handle: 55 | compression_reader = decompressor.stream_reader(input_handle) 56 | read_count, write_count = compressor.copy_stream(compression_reader, output_handle) 57 | 58 | compressed_bytes_read += file_size 59 | uncompressed_bytes_read += read_count 60 | bytes_written += write_count 61 | files_read += 1 62 | log.info(f"{files_read:,}/{len(files):,} : {(compressed_bytes_read / (2**30)):.2f} gb of {(total_size / (2**30)):.2f} gb compressed to {(bytes_written / (2**30)):.2f} gb : {bytes_written /compressed_bytes_read:.3f}") 63 | -------------------------------------------------------------------------------- /personal/compression/recompress_folder_multiprocess.py: -------------------------------------------------------------------------------- 1 | # this script iterates through zst compressed ndjson files, like the pushshift reddit dumps, loads each line 2 | # and if it matches the criteria in the command line arguments, it's written out into a separate file for 3 | # that month. After all the ndjson files are processed, it iterates through the resulting files and combines 4 | # them into a final file. 5 | 6 | # this script assumes the files are named in chronological order and prefixed with RS_ or RC_, like the pushshift dumps 7 | 8 | # features: 9 | # - multiple processes in parallel to maximize drive read and decompression 10 | # - saves state as it completes each file and picks up where it stopped 11 | # - detailed progress indicators 12 | 13 | # examples: 14 | # - get all comments that have a subreddit field (subreddit is the default) of "wallstreetbets". This will create a single output file "wallstreetbets_comments.zst" in the folder the script is run in 15 | # python3 combine_folder_multiprocess.py reddit/comments --value wallstreetbets 16 | # - get all comments and submissions (assuming both types of dump files are under the reddit folder) that have an author field of Watchful1 or spez and output the results to a folder called pushshift. 
17 | # This will result in four files, pushshift/Watchful1_comments, pushshift/Watchful1_submissions, pushshift/spez_comments, pushshift/spez_submissions 18 | # python3 combine_folder_multiprocess.py reddit --field author --value Watchful1,spez --output pushshift 19 | 20 | import zstandard 21 | import os 22 | import json 23 | import sys 24 | import time 25 | import argparse 26 | import re 27 | from collections import defaultdict 28 | import logging.handlers 29 | import multiprocessing 30 | from enum import Enum 31 | 32 | 33 | # sets up logging to the console as well as a file 34 | log = logging.getLogger("bot") 35 | log.setLevel(logging.INFO) 36 | log_formatter = logging.Formatter('%(asctime)s - %(levelname)s: %(message)s') 37 | 38 | log_str_handler = logging.StreamHandler() 39 | log_str_handler.setFormatter(log_formatter) 40 | log.addHandler(log_str_handler) 41 | if not os.path.exists("logs"): 42 | os.makedirs("logs") 43 | log_file_handler = logging.handlers.RotatingFileHandler( 44 | os.path.join("logs", "bot.log"), maxBytes=1024*1024*16, backupCount=5) 45 | log_file_handler.setFormatter(log_formatter) 46 | log.addHandler(log_file_handler) 47 | 48 | 49 | # convenience object used to pass status information between processes 50 | class FileConfig: 51 | def __init__(self, input_path, output_path, complete=False, uncompressed_size=None, new_compressed_size=None, total_lines=None): 52 | self.input_path = input_path 53 | self.output_path = output_path 54 | self.complete = complete 55 | self.error_message = None 56 | 57 | self.old_compressed_size = os.stat(input_path).st_size 58 | self.uncompressed_size = uncompressed_size 59 | self.new_compressed_size = new_compressed_size 60 | 61 | self.total_lines = total_lines 62 | 63 | def __str__(self): 64 | return f"{self.input_path} : {self.output_path} : {self.complete} : {self.old_compressed_size} - {self.uncompressed_size} - {self.new_compressed_size}" 65 | 66 | 67 | def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0): 68 | chunk = reader.read(chunk_size) 69 | bytes_read += len(chunk) 70 | if previous_chunk is not None: 71 | chunk = previous_chunk + chunk 72 | try: 73 | return chunk.decode(), bytes_read 74 | except UnicodeDecodeError: 75 | if bytes_read > max_window_size: 76 | raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes") 77 | return read_and_decode(reader, chunk_size, max_window_size, chunk, bytes_read) 78 | 79 | 80 | def count_lines_bytes(file_name): 81 | count_lines = 0 82 | uncompressed_bytes = 0 83 | with open(file_name, 'rb') as file_handle: 84 | buffer = '' 85 | reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle) 86 | 87 | while True: 88 | chunk, chunk_bytes = read_and_decode(reader, 2**27, (2**29) * 2) 89 | uncompressed_bytes += chunk_bytes 90 | if not chunk: 91 | break 92 | lines = (buffer + chunk).split("\n") 93 | count_lines += len(lines) - 1 94 | 95 | buffer = lines[-1] 96 | reader.close() 97 | return count_lines, uncompressed_bytes 98 | 99 | 100 | class Queue: 101 | def __init__(self, max_size): 102 | self.list = [] 103 | self.max_size = max_size 104 | 105 | def put(self, item): 106 | if len(self.list) >= self.max_size: 107 | self.list.pop(0) 108 | self.list.append(item) 109 | 110 | def peek(self): 111 | return self.list[0] if len(self.list) > 0 else None 112 | 113 | 114 | # save file information and progress to a json file 115 | # we don't want to save the whole FileConfig object, since some info resets if we restart 116 | def 
save_file_list(input_files, working_folder, status_json, arg_string, script_type): 117 | if not os.path.exists(working_folder): 118 | os.makedirs(working_folder) 119 | simple_file_list = [] 120 | for file in input_files: 121 | simple_file_list.append([file.input_path, file.output_path, file.complete, file.uncompressed_size, file.new_compressed_size, file.total_lines]) 122 | with open(status_json, 'w') as status_json_file: 123 | output_dict = { 124 | "args": arg_string, 125 | "type": script_type, 126 | "files": simple_file_list, 127 | } 128 | status_json_file.write(json.dumps(output_dict, indent=4)) 129 | 130 | 131 | # load file information from the json file and recalculate file sizes 132 | def load_file_list(status_json): 133 | if os.path.exists(status_json): 134 | with open(status_json, 'r') as status_json_file: 135 | output_dict = json.load(status_json_file) 136 | input_files = [] 137 | for simple_file in output_dict["files"]: 138 | input_files.append( 139 | FileConfig(simple_file[0], simple_file[1], simple_file[2], simple_file[3], simple_file[4], simple_file[5]) 140 | ) 141 | return input_files, output_dict["args"], output_dict["type"] 142 | else: 143 | return None, None, None 144 | 145 | 146 | # base of each separate process. Loads a file, iterates through lines and writes out 147 | # the ones where the `field` of the object matches `value`. Also passes status 148 | # information back to the parent via a queue 149 | def process_file(file, queue, threads, level): 150 | queue.put(file) 151 | file.total_lines, file.uncompressed_size = count_lines_bytes(file.input_path) 152 | queue.put(file) 153 | 154 | try: 155 | decompressor = zstandard.ZstdDecompressor(max_window_size=2**31) 156 | compressor = zstandard.ZstdCompressor(level=level, write_content_size=True, write_checksum=True, threads=threads) 157 | with open(file.input_path, 'rb') as input_handle, open(file.output_path, "wb") as output_handle: 158 | compression_reader = decompressor.stream_reader(input_handle) 159 | read_count, file.new_compressed_size = compressor.copy_stream(compression_reader, output_handle, size=file.uncompressed_size) 160 | file.complete = True 161 | except Exception as err: 162 | file.error_message = str(err) 163 | #log.info(f"{read_count:,} to {write_count:,} in {seconds:,.2f} with {threads} threads") 164 | queue.put(file) 165 | 166 | 167 | if __name__ == '__main__': 168 | parser = argparse.ArgumentParser(description="Use multiple processes to recompress zst files in a folder") 169 | parser.add_argument("input", help="The input folder to read files from") 170 | parser.add_argument("output", help="Put the output files in this folder") 171 | parser.add_argument("--level", help="The compression ratio to output at. 
From 0 to 22", default=22, type=int) 172 | parser.add_argument("--working", help="The folder to store temporary files in", default="pushshift_working") 173 | parser.add_argument("--processes", help="Number of processes to use", default=4, type=int) 174 | parser.add_argument("--threads", help="Number of threads per process", default=0, type=int) 175 | parser.add_argument("--debug", help="Enable debug logging", action='store_const', const=True, default=False) 176 | script_type = "compress" 177 | 178 | args = parser.parse_args() 179 | arg_string = f"{args.input}:{args.output}:{args.level}" 180 | 181 | if args.debug: 182 | log.setLevel(logging.DEBUG) 183 | 184 | log.info(f"Loading files from: {args.input}") 185 | log.info(f"Writing output to: {args.output}") 186 | 187 | multiprocessing.set_start_method('spawn') 188 | queue = multiprocessing.Manager().Queue() 189 | status_json = os.path.join(args.working, "status.json") 190 | input_files, saved_arg_string, saved_type = load_file_list(status_json) 191 | if saved_arg_string and saved_arg_string != arg_string: 192 | log.warning(f"Args don't match args from json file. Delete working folder") 193 | sys.exit(0) 194 | 195 | if saved_type and saved_type != script_type: 196 | log.warning(f"Script type doesn't match type from json file. Delete working folder") 197 | sys.exit(0) 198 | 199 | # if the file list wasn't loaded from the json, this is the first run, find what files we need to process 200 | if input_files is None: 201 | input_files = [] 202 | for file_name in os.listdir(args.input): 203 | input_path = os.path.join(args.input, file_name) 204 | if os.path.isfile(input_path) and file_name.endswith(".zst"): 205 | output_path = os.path.join(args.output, file_name) 206 | input_files.append(FileConfig(input_path, output_path=output_path)) 207 | 208 | save_file_list(input_files, args.working, status_json, arg_string, script_type) 209 | else: 210 | log.info(f"Existing input file was read, if this is not correct you should delete the {args.working} folder and run this script again") 211 | 212 | files_processed, total_old_bytes, processed_old_bytes, processed_uncompressed_bytes, processed_new_bytes, processed_lines = 0, 0, 0, 0, 0, 0 213 | files_to_process = [] 214 | # calculate the total file size for progress reports, build a list of incomplete files to process 215 | # do this largest to smallest by file size so that we aren't processing a few really big files with only a few threads at the end 216 | for file in sorted(input_files, key=lambda item: item.old_compressed_size, reverse=True): 217 | total_old_bytes += file.old_compressed_size 218 | if file.complete: 219 | files_processed += 1 220 | processed_old_bytes += file.old_compressed_size 221 | processed_uncompressed_bytes += file.uncompressed_size 222 | processed_new_bytes += file.new_compressed_size 223 | processed_lines += file.total_lines 224 | else: 225 | files_to_process.append(file) 226 | 227 | log.info(f"Processed {files_processed} of {len(input_files)} files with {(processed_old_bytes / (2**30)):.2f} of {(total_old_bytes / (2**30)):.2f} gigabytes") 228 | 229 | start_time = time.time() 230 | if len(files_to_process): 231 | progress_queue = Queue(40) 232 | progress_queue.put([start_time, processed_old_bytes]) 233 | speed_queue = Queue(40) 234 | # start the workers 235 | with multiprocessing.Pool(processes=min(args.processes, len(files_to_process))) as pool: 236 | workers = pool.starmap_async(process_file, [(file, queue, args.threads, args.level) for file in files_to_process], chunksize=1, 
error_callback=log.info) 237 | while not workers.ready() or not queue.empty(): 238 | # loop until the workers are all done, pulling in status messages as they are sent 239 | file_update = queue.get() 240 | if file_update.error_message is not None: 241 | log.warning(f"File failed {file_update.input_path}: {file_update.error_message}") 242 | 243 | # this is the workers telling us they are starting a new file, print the debug message but nothing else 244 | if not file_update.complete: 245 | if file_update.uncompressed_size is not None: 246 | log.debug(f"Calculated uncompressed size: {file_update.input_path} : {file_update.uncompressed_size:,}") 247 | else: 248 | log.debug(f"Starting file: {file_update.input_path} : {file_update.old_compressed_size:,}") 249 | continue 250 | 251 | # I'm going to assume that the list of files is short enough that it's no 252 | # big deal to just iterate each time since that saves a bunch of work 253 | files_processed, processed_old_bytes, processed_uncompressed_bytes, processed_new_bytes, processed_lines, files_errored, i = 0, 0, 0, 0, 0, 0, 0 254 | for file in input_files: 255 | if file.input_path == file_update.input_path: 256 | input_files[i] = file_update 257 | file = file_update 258 | if file.complete: 259 | processed_old_bytes += file.old_compressed_size 260 | processed_uncompressed_bytes += file.uncompressed_size if file.uncompressed_size is not None else 0 261 | processed_new_bytes += file.new_compressed_size if file.new_compressed_size is not None else 0 262 | processed_lines += file.total_lines if file.total_lines is not None else 0 263 | files_processed += 1 if file.complete or file.error_message is not None else 0 264 | files_errored += 1 if file.error_message is not None else 0 265 | i += 1 266 | if file_update.complete or file_update.error_message is not None: 267 | save_file_list(input_files, args.working, status_json, arg_string, script_type) 268 | log.debug(f"Finished file: {file_update.input_path}") 269 | current_time = time.time() 270 | progress_queue.put([current_time, processed_old_bytes]) 271 | 272 | first_time, first_bytes = progress_queue.peek() 273 | bytes_per_second = int((processed_old_bytes - first_bytes)/(current_time - first_time)) 274 | speed_queue.put(bytes_per_second) 275 | seconds_left = int((total_old_bytes - processed_old_bytes) / int(sum(speed_queue.list) / len(speed_queue.list))) 276 | minutes_left = int(seconds_left / 60) 277 | hours_left = int(minutes_left / 60) 278 | days_left = int(hours_left / 24) 279 | 280 | log.info( 281 | f"{(processed_old_bytes / (2**30)):.3f} gb at {(bytes_per_second / (2**20)):,.2f} mb/s, {(processed_old_bytes / total_old_bytes) * 100:.0f}% : " 282 | f"{(processed_uncompressed_bytes / (2**30)):.3f} gb uncompressed down to {(processed_new_bytes / (2**30)):.3f} gb compressed : " 283 | f"{(processed_old_bytes / processed_uncompressed_bytes):.3f} old ratio : {(processed_new_bytes / processed_uncompressed_bytes):.3f} new ratio : {(processed_new_bytes / processed_old_bytes):.3f} difference : " 284 | f"{files_processed}({files_errored})/{len(input_files)} files : " 285 | f"{(str(days_left) + 'd ' if days_left > 0 else '')}{hours_left - (days_left * 24)}:{minutes_left - (hours_left * 60):02}:{seconds_left - (minutes_left * 60):02} remaining : " 286 | f"{first_time}:{first_bytes}:{current_time}:{processed_old_bytes}:{processed_uncompressed_bytes}:{processed_new_bytes}:{total_old_bytes}:{int(sum(speed_queue.list))}:{len(speed_queue.list)}") 287 | 288 | log.info(f"{(processed_old_bytes / (2**30)):.2f} gb, 
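# Hedged usage sketch for recompress_folder_multiprocess.py (folder names are placeholders).
# Progress is checkpointed to <working>/status.json, so an interrupted run restarted with
# the same arguments skips any files already marked complete:
#
#   python3 personal/compression/recompress_folder_multiprocess.py \
#       /data/reddit/comments /data/reddit/comments_recompressed \
#       --level 22 --processes 4 --threads 0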
{(processed_old_bytes / total_old_bytes) * 100:.0f}% : {files_processed}/{len(input_files)}") 289 | -------------------------------------------------------------------------------- /personal/diagnostic/comments_per_day.py: -------------------------------------------------------------------------------- 1 | import utils 2 | import discord_logging 3 | from datetime import datetime 4 | 5 | log = discord_logging.init_logging() 6 | 7 | 8 | if __name__ == "__main__": 9 | day = None 10 | day_comments = 0 11 | for comment in utils.read_obj_zst(r"\\MYCLOUDPR4100\Public\reddit\subreddits23\antiwork_comments.zst"): 12 | created_day = datetime.utcfromtimestamp(int(comment['created_utc'])).strftime("%y-%m-%d") 13 | if day is None: 14 | day = created_day 15 | if day != created_day: 16 | log.info(f"{day} {day_comments}") 17 | day_comments = 0 18 | day = created_day 19 | day_comments += 1 20 | 21 | log.info(f"{day} {day_comments}") 22 | -------------------------------------------------------------------------------- /personal/diagnostic/comments_per_day_with_score.py: -------------------------------------------------------------------------------- 1 | import utils 2 | import discord_logging 3 | from datetime import datetime 4 | 5 | log = discord_logging.init_logging() 6 | 7 | 8 | if __name__ == "__main__": 9 | day = None 10 | day_comments, day_comments_with_score = 0, 0 11 | for comment in utils.read_obj_zst(r"\\MYCLOUDPR4100\Public\reddit\subreddits23\antiwork_comments.zst"): 12 | created_day = datetime.utcfromtimestamp(int(comment['created_utc'])).strftime("%y-%m-%d") 13 | if day is None: 14 | day = created_day 15 | if day != created_day: 16 | log.info(f"{day} {day_comments} {day_comments_with_score} {int((day_comments_with_score / day_comments) * 100):.2}%") 17 | day_comments, day_comments_with_score = 0, 0 18 | day = created_day 19 | day_comments += 1 20 | if comment['score'] != 1: 21 | day_comments_with_score += 1 22 | 23 | log.info(f"{day} {day_comments} {day_comments_with_score} {int((day_comments_with_score / day_comments) * 100):.2}%") 24 | -------------------------------------------------------------------------------- /personal/diagnostic/compare_lines.py: -------------------------------------------------------------------------------- 1 | import utils 2 | import discord_logging 3 | import os 4 | import sys 5 | from datetime import datetime 6 | 7 | log = discord_logging.init_logging() 8 | 9 | 10 | if __name__ == "__main__": 11 | file_one = open(r"\\MYCLOUDPR4100\Public\reddit_final\RelationshipsOver35_comments_dump.txt", 'r') 12 | file_two = open(r"\\MYCLOUDPR4100\Public\reddit_final\RelationshipsOver35_comments_mongo.txt", 'r') 13 | 14 | file_lines = 0 15 | while True: 16 | file_lines += 1 17 | line_one = file_one.readline().rstrip() 18 | line_two = file_two.readline().rstrip() 19 | if line_one != line_two: 20 | log.info(f"lines not matching: {file_lines}") 21 | log.info(line_one) 22 | log.info(line_two) 23 | #break 24 | 25 | if file_lines % 100000 == 0: 26 | log.info(f"{file_lines:,}") 27 | 28 | if not line_one: 29 | break 30 | 31 | log.info(f"{file_lines:,}") 32 | file_one.close() 33 | file_two.close() 34 | -------------------------------------------------------------------------------- /personal/diagnostic/count_fields.py: -------------------------------------------------------------------------------- 1 | import zstandard 2 | import os 3 | import json 4 | import sys 5 | from datetime import datetime 6 | import logging.handlers 7 | from collections import defaultdict 8 | 9 | 10 | log = 
logging.getLogger("bot") 11 | log.setLevel(logging.DEBUG) 12 | log.addHandler(logging.StreamHandler()) 13 | 14 | 15 | def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0): 16 | chunk = reader.read(chunk_size) 17 | bytes_read += chunk_size 18 | if previous_chunk is not None: 19 | chunk = previous_chunk + chunk 20 | try: 21 | return chunk.decode() 22 | except UnicodeDecodeError: 23 | if bytes_read > max_window_size: 24 | raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes") 25 | log.info(f"Decoding error with {bytes_read:,} bytes, reading another chunk") 26 | return read_and_decode(reader, chunk_size, max_window_size, chunk, bytes_read) 27 | 28 | 29 | def read_lines_zst(file_name): 30 | with open(file_name, 'rb') as file_handle: 31 | buffer = '' 32 | reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle) 33 | while True: 34 | chunk = read_and_decode(reader, 2**27, (2**29) * 2) 35 | if not chunk: 36 | break 37 | lines = (buffer + chunk).split("\n") 38 | for line in lines[:-1]: 39 | yield json.loads(line) 40 | buffer = lines[-1] 41 | reader.close() 42 | 43 | 44 | if __name__ == "__main__": 45 | #input_folder = r"\\MYCLOUDPR4100\Public\ingest\ingest\comments\23-06-23" 46 | input_folder = r"\\MYCLOUDPR4100\Public\reddit\comments" 47 | input_files = [] 48 | total_size = 0 49 | for subdir, dirs, files in os.walk(input_folder): 50 | for filename in files: 51 | input_path = os.path.join(subdir, filename) 52 | if input_path.endswith(".zst"): 53 | file_size = os.stat(input_path).st_size 54 | total_size += file_size 55 | input_files.append([input_path, file_size]) 56 | 57 | log.info(f"Processing {len(input_files)} files of {(total_size / (2**30)):.2f} gigabytes") 58 | 59 | total_lines = 0 60 | fields = defaultdict(lambda: defaultdict(int)) 61 | for input_file in input_files: 62 | file_lines = 0 63 | created = None 64 | for obj in read_lines_zst(input_file[0]): 65 | for key, value in obj.items(): 66 | value = str(value)[:20] 67 | fields[key][value] += 1 68 | 69 | created = datetime.utcfromtimestamp(int(obj['created_utc'])) 70 | file_lines += 1 71 | if file_lines % 100000 == 0: 72 | log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines + total_lines:,}") 73 | if file_lines >= 1000: 74 | break 75 | total_lines += file_lines 76 | log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines + total_lines:,}") 77 | 78 | sorted_fields = [] 79 | for key, values in fields.items(): 80 | total_occurrences = 0 81 | unique_values = 0 82 | examples = [] 83 | for value_name, count in values.items(): 84 | unique_values += 1 85 | total_occurrences += count 86 | if len(examples) < 3: 87 | examples.append(value_name) 88 | sorted_fields.append((total_occurrences, f"{key}: {(total_occurrences / total_lines) * 100:.2f} : {unique_values:,} : {','.join(examples)}")) 89 | sorted_fields.sort(key=lambda x:x[0], reverse=True) 90 | for count, string in sorted_fields: 91 | log.info(string) 92 | -------------------------------------------------------------------------------- /personal/diagnostic/count_subreddits_multiprocess.py: -------------------------------------------------------------------------------- 1 | import zstandard 2 | import os 3 | import json 4 | import sys 5 | import time 6 | import argparse 7 | import re 8 | from collections import defaultdict 9 | from datetime import datetime 10 | import logging.handlers 11 | import multiprocessing 12 | 13 | 14 | # sets up logging to the console as well as a file 15 | 
log = logging.getLogger("bot") 16 | log.setLevel(logging.INFO) 17 | log_formatter = logging.Formatter('%(asctime)s - %(levelname)s: %(message)s') 18 | 19 | log_stderr_handler = logging.StreamHandler() 20 | log_stderr_handler.setFormatter(log_formatter) 21 | log.addHandler(log_stderr_handler) 22 | if not os.path.exists("logs"): 23 | os.makedirs("logs") 24 | log_file_handler = logging.handlers.RotatingFileHandler( 25 | os.path.join("logs", "bot.log"), maxBytes=1024*1024*16, backupCount=5) 26 | log_file_handler.setFormatter(log_formatter) 27 | log.addHandler(log_file_handler) 28 | 29 | 30 | # convenience object used to pass status information between processes 31 | class FileConfig: 32 | def __init__(self, input_path, output_path=None, complete=False, lines_processed=0, error_lines=0, count_file_path=None): 33 | self.input_path = input_path 34 | self.output_path = output_path 35 | self.count_file_path = count_file_path 36 | self.file_size = os.stat(input_path).st_size 37 | self.complete = complete 38 | self.bytes_processed = self.file_size if complete else 0 39 | self.lines_processed = lines_processed if complete else 0 40 | self.error_message = None 41 | self.error_lines = error_lines 42 | 43 | def __str__(self): 44 | return f"{self.input_path} : {self.output_path} : {self.file_size} : {self.complete} : {self.bytes_processed} : {self.lines_processed}" 45 | 46 | 47 | # used for calculating running average of read speed 48 | class Queue: 49 | def __init__(self, max_size): 50 | self.list = [] 51 | self.max_size = max_size 52 | 53 | def put(self, item): 54 | if len(self.list) >= self.max_size: 55 | self.list.pop(0) 56 | self.list.append(item) 57 | 58 | def peek(self): 59 | return self.list[0] if len(self.list) > 0 else None 60 | 61 | 62 | # save file information and progress to a json file 63 | # we don't want to save the whole FileConfig object, since some info resets if we restart 64 | def save_file_list(input_files, working_folder, status_json, script_type, stage): 65 | if not os.path.exists(working_folder): 66 | os.makedirs(working_folder) 67 | simple_file_list = [] 68 | for file in input_files: 69 | simple_file_list.append([file.input_path, file.output_path, file.complete, file.lines_processed, file.error_lines, file.monthly_count_file]) 70 | with open(status_json, 'w') as status_json_file: 71 | output_dict = { 72 | "files": simple_file_list, 73 | "type": script_type, 74 | "stage": stage, 75 | } 76 | status_json_file.write(json.dumps(output_dict, indent=4)) 77 | 78 | 79 | # load file information from the json file and recalculate file sizes 80 | def load_file_list(status_json): 81 | if os.path.exists(status_json): 82 | with open(status_json, 'r') as status_json_file: 83 | output_dict = json.load(status_json_file) 84 | input_files = [] 85 | for simple_file in output_dict["files"]: 86 | input_files.append( 87 | FileConfig(simple_file[0], simple_file[1], simple_file[2], simple_file[3], simple_file[4], simple_file[5] if len(simple_file) > 5 else None) 88 | ) 89 | return input_files, output_dict["type"], output_dict["stage"] 90 | else: 91 | return None, None, "count" 92 | 93 | 94 | # recursively decompress and decode a chunk of bytes. 
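# A hedged illustration of the status.json written by save_file_list above (paths are
# placeholders); each entry mirrors the saved FileConfig fields
# [input_path, output_path, complete, lines_processed, error_lines, monthly count file]:
#
#   {
#       "files": [
#           ["/data/reddit/comments/RC_2023-06.zst", "pushshift_working/RC_2023-06", false, 0, 0, null]
#       ],
#       "type": "count",
#       "stage": "count"
#   }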
If there's a decode error then read another chunk and try with that, up to a limit of max_window_size bytes 95 | def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0): 96 | chunk = reader.read(chunk_size) 97 | bytes_read += chunk_size 98 | if previous_chunk is not None: 99 | chunk = previous_chunk + chunk 100 | try: 101 | return chunk.decode() 102 | except UnicodeDecodeError: 103 | if bytes_read > max_window_size: 104 | raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes") 105 | return read_and_decode(reader, chunk_size, max_window_size, chunk, bytes_read) 106 | 107 | 108 | # open a zst compressed ndjson file and yield lines one at a time 109 | # also passes back file progress 110 | def read_lines_zst(file_name): 111 | with open(file_name, 'rb') as file_handle: 112 | buffer = '' 113 | reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle) 114 | while True: 115 | chunk = read_and_decode(reader, 2**27, (2**29) * 2) 116 | if not chunk: 117 | break 118 | lines = (buffer + chunk).split("\n") 119 | 120 | for line in lines[:-1]: 121 | yield line, file_handle.tell() 122 | 123 | buffer = lines[-1] 124 | reader.close() 125 | 126 | 127 | # base of each separate process. Loads a file, iterates through lines and writes out 128 | # the ones where the `field` of the object matches `value`. Also passes status 129 | # information back to the parent via a queue 130 | def process_file(file, queue, field): 131 | output_file = None 132 | try: 133 | for line, file_bytes_processed in read_lines_zst(file.input_path): 134 | try: 135 | obj = json.loads(line) 136 | observed = obj[field].lower() 137 | if output_file is None: 138 | output_file = open(file.output_path, 'w', encoding="utf-8") 139 | output_file.write(observed) 140 | output_file.write("\n") 141 | except (KeyError, json.JSONDecodeError) as err: 142 | file.error_lines += 1 143 | file.lines_processed += 1 144 | if file.lines_processed % 1000000 == 0: 145 | file.bytes_processed = file_bytes_processed 146 | queue.put(file) 147 | 148 | if output_file is not None: 149 | output_file.close() 150 | 151 | file.complete = True 152 | file.bytes_processed = file.file_size 153 | except Exception as err: 154 | file.error_message = str(err) 155 | queue.put(file) 156 | 157 | 158 | if __name__ == '__main__': 159 | parser = argparse.ArgumentParser(description="Use multiple processes to decompress and iterate over pushshift dump files") 160 | parser.add_argument("input", help="The input folder to recursively read files from") 161 | parser.add_argument("--output", help="Name of the output file", default="field_counts") 162 | parser.add_argument("--working", help="The folder to store temporary files in", default="pushshift_working") 163 | parser.add_argument("--monthly_count_folder", help="The folder to store monthly count files in", default="pushshift_counts") 164 | parser.add_argument("--field", help="Which field to count", default="subreddit") 165 | parser.add_argument("--min_count", help="Dont write any counts below this number", default=1000, type=int) 166 | parser.add_argument("--processes", help="Number of processes to use", default=10, type=int) 167 | parser.add_argument("--file_filter", help="Regex filenames have to match to be processed", default="^rc_|rs_") 168 | parser.add_argument( 169 | "--error_rate", help= 170 | "Percentage as an integer from 0 to 100 of the lines where the field can be missing. 
For the subreddit field especially, " 171 | "there are a number of posts that simply don't have a subreddit attached", default=1, type=int) 172 | parser.add_argument("--debug", help="Enable debug logging", action='store_const', const=True, default=False) 173 | script_type = "count" 174 | 175 | args = parser.parse_args() 176 | 177 | if args.debug: 178 | log.setLevel(logging.DEBUG) 179 | 180 | log.info(f"Loading files from: {args.input}") 181 | if args.output: 182 | log.info(f"Writing output to: {args.output}") 183 | else: 184 | log.info(f"Writing output to working folder") 185 | 186 | multiprocessing.set_start_method('spawn') 187 | queue = multiprocessing.Manager().Queue() 188 | status_json = os.path.join(args.working, "status.json") 189 | input_files, saved_type, stage = load_file_list(status_json) 190 | 191 | if saved_type and saved_type != script_type: 192 | log.warning(f"Script type doesn't match type from json file. Delete working folder") 193 | sys.exit(0) 194 | 195 | if stage == "count": 196 | # if the file list wasn't loaded from the json, this is the first run, find what files we need to process 197 | if input_files is None: 198 | input_files = [] 199 | for subdir, dirs, files in os.walk(args.input): 200 | files.sort() 201 | for file_name in files: 202 | if file_name.endswith(".zst") and re.search(args.file_filter, file_name, re.IGNORECASE) is not None: 203 | input_path = os.path.join(subdir, file_name) 204 | output_path = os.path.join(args.working, file_name[:-4]) 205 | input_files.append(FileConfig(input_path, output_path=output_path)) 206 | 207 | save_file_list(input_files, args.working, status_json, script_type, "count") 208 | else: 209 | log.info(f"Existing input file was read, if this is not correct you should delete the {args.working} folder and run this script again") 210 | 211 | files_processed = 0 212 | total_bytes = 0 213 | total_bytes_processed = 0 214 | total_lines_processed = 0 215 | total_lines_errored = 0 216 | files_to_process = [] 217 | # calculate the total file size for progress reports, build a list of incomplete files to process 218 | # do this largest to smallest by file size so that we aren't processing a few really big files with only a few threads at the end 219 | for file in sorted(input_files, key=lambda item: item.file_size, reverse=True): 220 | total_bytes += file.file_size 221 | if file.complete: 222 | files_processed += 1 223 | total_lines_processed += file.lines_processed 224 | total_bytes_processed += file.file_size 225 | total_lines_errored += file.error_lines 226 | else: 227 | files_to_process.append(file) 228 | 229 | log.info(f"Processed {files_processed} of {len(input_files)} files with {(total_bytes_processed / (2**30)):.2f} of {(total_bytes / (2**30)):.2f} gigabytes") 230 | 231 | start_time = time.time() 232 | if len(files_to_process): 233 | progress_queue = Queue(40) 234 | progress_queue.put([start_time, total_lines_processed, total_bytes_processed]) 235 | speed_queue = Queue(40) 236 | for file in files_to_process: 237 | log.info(f"Processing file: {file.input_path}") 238 | # start the workers 239 | with multiprocessing.Pool(processes=min(args.processes, len(files_to_process))) as pool: 240 | workers = pool.starmap_async(process_file, [(file, queue, args.field) for file in files_to_process], chunksize=1, error_callback=log.info) 241 | while not workers.ready(): 242 | # loop until the workers are all done, pulling in status messages as they are sent 243 | file_update = queue.get() 244 | if file_update.error_message is not None: 245 | 
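# Hedged usage sketch for count_subreddits_multiprocess.py (the input folder is a
# placeholder). With the defaults it counts the "subreddit" field across every RC_*/RS_*
# zst file under the input folder, writes per-month counts to pushshift_counts/, and sums
# anything seen at least --min_count times into field_counts.txt:
#
#   python3 personal/diagnostic/count_subreddits_multiprocess.py /data/reddit \
#       --field subreddit --min_count 1000 --processes 10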
log.warning(f"File failed {file_update.input_path}: {file_update.error_message}") 246 | # I'm going to assume that the list of files is short enough that it's no 247 | # big deal to just iterate each time since that saves a bunch of work 248 | total_lines_processed = 0 249 | total_bytes_processed = 0 250 | total_lines_errored = 0 251 | files_processed = 0 252 | files_errored = 0 253 | i = 0 254 | for file in input_files: 255 | if file.input_path == file_update.input_path: 256 | input_files[i] = file_update 257 | file = file_update 258 | total_lines_processed += file.lines_processed 259 | total_bytes_processed += file.bytes_processed 260 | total_lines_errored += file.error_lines 261 | files_processed += 1 if file.complete or file.error_message is not None else 0 262 | files_errored += 1 if file.error_message is not None else 0 263 | i += 1 264 | if file_update.complete or file_update.error_message is not None: 265 | save_file_list(input_files, args.working, status_json, script_type) 266 | current_time = time.time() 267 | progress_queue.put([current_time, total_lines_processed, total_bytes_processed]) 268 | 269 | first_time, first_lines, first_bytes = progress_queue.peek() 270 | bytes_per_second = int((total_bytes_processed - first_bytes)/(current_time - first_time)) 271 | speed_queue.put(bytes_per_second) 272 | seconds_left = int((total_bytes - total_bytes_processed) / int(sum(speed_queue.list) / len(speed_queue.list))) 273 | minutes_left = int(seconds_left / 60) 274 | hours_left = int(minutes_left / 60) 275 | days_left = int(hours_left / 24) 276 | 277 | log.info( 278 | f"{total_lines_processed:,} lines at {(total_lines_processed - first_lines)/(current_time - first_time):,.0f}/s, {total_lines_errored:,} errored : " 279 | f"{(total_bytes_processed / (2**30)):.2f} gb at {(bytes_per_second / (2**20)):,.0f} mb/s, {(total_bytes_processed / total_bytes) * 100:.0f}% : " 280 | f"{files_processed}({files_errored})/{len(input_files)} files : " 281 | f"{(str(days_left) + 'd ' if days_left > 0 else '')}{hours_left - (days_left * 24)}:{minutes_left - (hours_left * 60):02}:{seconds_left - (minutes_left * 60):02} remaining") 282 | 283 | log.info(f"{total_lines_processed:,}, {total_lines_errored} errored : {(total_bytes_processed / (2**30)):.2f} gb, {(total_bytes_processed / total_bytes) * 100:.0f}% : {files_processed}/{len(input_files)}") 284 | stage = "sum" 285 | save_file_list(input_files, args.working, status_json, script_type, stage) 286 | 287 | if stage == "sum": 288 | #working_file_paths = [] 289 | count_incomplete = 0 290 | # build a list of output files to combine 291 | input_files = sorted(input_files, key=lambda item: os.path.split(item.output_path)[1]) 292 | for file in input_files: 293 | if not file.complete: 294 | if file.error_message is not None: 295 | log.info(f"File {file.input_path} errored {file.error_message}") 296 | else: 297 | log.info(f"File {file.input_path} is not marked as complete") 298 | count_incomplete += 1 299 | else: 300 | if file.error_lines > file.lines_processed * (args.error_rate * 0.01): 301 | log.info( 302 | f"File {file.input_path} has {file.error_lines:,} errored lines out of {file.lines_processed:,}, " 303 | f"{(file.error_lines / file.lines_processed) * (args.error_rate * 0.01):.2f}% which is above the limit of {args.error_rate}%") 304 | count_incomplete += 1 305 | 306 | if count_incomplete > 0: 307 | log.info(f"{count_incomplete} files were not completed, errored or don't exist, something went wrong. 
Aborting") 308 | sys.exit() 309 | 310 | log.info(f"Processing complete, combining {len(input_files)} result files") 311 | 312 | if not os.path.exists(args.monthly_count_folder): 313 | os.makedirs(args.monthly_count_folder) 314 | input_lines = 0 315 | files_counted = 0 316 | monthly_count_folder_paths = [] 317 | for file in input_files: 318 | files_counted += 1 319 | if not os.path.exists(file.output_path): 320 | log.info(f"Output file {file.output_path} does not exist, skipping") 321 | continue 322 | monthly_counts = defaultdict(int) 323 | log.info(f"Reading {files_counted}/{len(input_files)} : {input_lines:,} : {os.path.split(file.output_path)[1]}") 324 | with open(file.output_path, 'r') as input_file: 325 | for line in input_file: 326 | input_lines += 1 327 | monthly_counts[line.strip()] += 1 328 | 329 | file.monthly_count_file = os.path.join(args.monthly_count_folder, os.path.basename(file.output_path)) 330 | with open(file.monthly_count_file, 'w') as output_handle: 331 | for field, count in sorted(monthly_counts.items(), key=lambda item: item[1], reverse=True): 332 | output_handle.write(f"{field} {count}\n") 333 | 334 | log.info(f"Finished combining files into monthlies, {input_lines:,} lines read. Combining into result output") 335 | stage = "agg" 336 | save_file_list(input_files, args.working, status_json, script_type, stage) 337 | 338 | if stage == "agg": 339 | field_counts = defaultdict(int) 340 | for file in input_files: 341 | with open(file.monthly_count_file, 'r') as input_handle: 342 | for line in input_handle: 343 | field, count = line.strip().split("\t") 344 | field_counts[field] = count 345 | 346 | sorted_counts = sorted(field_counts.items(), key=lambda item: item[1], reverse=True) 347 | 348 | output_counts = 0 349 | with open(f"{args.output}.txt", 'w') as output_handle: 350 | for field, count in sorted_counts: 351 | if count >= args.min_count: 352 | output_counts += 1 353 | output_handle.write(f"{field} {count}\n") 354 | 355 | log.info(f"Finished combining files, {output_counts:,} field counts written") 356 | -------------------------------------------------------------------------------- /personal/diagnostic/get_zst_details.py: -------------------------------------------------------------------------------- 1 | import zstandard 2 | import os 3 | import json 4 | import sys 5 | import time 6 | import argparse 7 | import re 8 | from collections import defaultdict 9 | import logging.handlers 10 | import multiprocessing 11 | import utils 12 | 13 | 14 | # sets up logging to the console as well as a file 15 | log = logging.getLogger("bot") 16 | log.setLevel(logging.INFO) 17 | log_formatter = logging.Formatter('%(asctime)s - %(levelname)s: %(message)s') 18 | 19 | log_str_handler = logging.StreamHandler() 20 | log_str_handler.setFormatter(log_formatter) 21 | log.addHandler(log_str_handler) 22 | if not os.path.exists("logs"): 23 | os.makedirs("logs") 24 | log_file_handler = logging.handlers.RotatingFileHandler( 25 | os.path.join("logs", "bot.log"), maxBytes=1024*1024*16, backupCount=5) 26 | log_file_handler.setFormatter(log_formatter) 27 | log.addHandler(log_file_handler) 28 | 29 | 30 | def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0): 31 | chunk = reader.read(chunk_size) 32 | bytes_read += len(chunk) 33 | if previous_chunk is not None: 34 | chunk = previous_chunk + chunk 35 | try: 36 | return chunk.decode(), bytes_read 37 | except UnicodeDecodeError: 38 | if bytes_read > max_window_size: 39 | raise UnicodeError(f"Unable to decode frame 
after reading {bytes_read:,} bytes") 40 | return read_and_decode(reader, chunk_size, max_window_size, chunk, bytes_read) 41 | 42 | 43 | def count_lines_bytes(file_name): 44 | count_lines = 0 45 | uncompressed_bytes = 0 46 | with open(file_name, 'rb') as file_handle: 47 | buffer = '' 48 | reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle) 49 | 50 | while True: 51 | chunk, chunk_bytes = read_and_decode(reader, 2**27, (2**29) * 2) 52 | uncompressed_bytes += chunk_bytes 53 | if not chunk: 54 | break 55 | lines = (buffer + chunk).split("\n") 56 | count_lines += len(lines) - 1 57 | 58 | buffer = lines[-1] 59 | reader.close() 60 | return count_lines, uncompressed_bytes 61 | 62 | 63 | if __name__ == '__main__': 64 | input_path = r"\\MYCLOUDPR4100\Public\reddit\comments\RC_2008-03.zst" 65 | compressed_size = os.stat(input_path).st_size 66 | count_lines, uncompressed_bytes = count_lines_bytes(input_path) 67 | log.info(f"Compressed size: {compressed_size:,} : {(compressed_size / (2**30)):.2f} gb") 68 | log.info(f"Uncompressed size: {uncompressed_bytes:,} : {(uncompressed_bytes / (2**30)):.2f} gb") 69 | log.info(f"Ratio: {(uncompressed_bytes / compressed_size):.2f}") 70 | log.info(f"Lines: {count_lines:,}") 71 | -------------------------------------------------------------------------------- /personal/diagnostic/sum_subreddit_counts.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging.handlers 3 | from collections import defaultdict 4 | 5 | 6 | log = logging.getLogger("bot") 7 | log.setLevel(logging.DEBUG) 8 | log.addHandler(logging.StreamHandler()) 9 | 10 | if __name__ == '__main__': 11 | input_folder = r"\\MYCLOUDPR4100\Public\pushshift_counts_summed" 12 | output_file = r"\\MYCLOUDPR4100\Public\subreddit_counts_total.txt" 13 | subreddits = defaultdict(int) 14 | 15 | for subdir, dirs, files in os.walk(input_folder): 16 | for filename in files: 17 | log.info(f"Processing file: {filename}") 18 | input_path = os.path.join(subdir, filename) 19 | with open(input_path, 'r') as input_handle: 20 | line_count = 0 21 | for line in input_handle: 22 | subreddit, count_string = line.strip().split("\t") 23 | count = int(count_string) 24 | subreddits[subreddit] += count 25 | line_count += 1 26 | 27 | log.info(f"Total subreddits: {len(subreddits):,}") 28 | 29 | count_written = 0 30 | with open(output_file, 'w') as output_handle: 31 | for subreddit, count in sorted(subreddits.items(), key=lambda item: item[1], reverse=True): 32 | output_handle.write(f"{subreddit} {count}\n") 33 | count_written += 1 34 | if count_written % 1000000 == 0: 35 | log.info(f"Written: {count_written:,}/{len(subreddits):,}") 36 | 37 | log.info(f"Written: {count_written:,}/{len(subreddits):,}") 38 | -------------------------------------------------------------------------------- /personal/diagnostic/test_file.py: -------------------------------------------------------------------------------- 1 | import utils 2 | import discord_logging 3 | import os 4 | import sys 5 | from datetime import datetime 6 | 7 | log = discord_logging.init_logging() 8 | 9 | 10 | if __name__ == "__main__": 11 | input_path = r"\\MYCLOUDPR4100\Public\reddit\submissions\RS_2023-04.zst" 12 | 13 | input_file_paths = [] 14 | if os.path.isdir(input_path): 15 | for subdir, dirs, files in os.walk(input_path): 16 | files.sort() 17 | for file_name in files: 18 | if file_name.endswith(".zst"): 19 | input_file_paths.append(os.path.join(subdir, file_name)) 20 | else: 21 | 
input_file_paths.append(input_path) 22 | 23 | files_processed = 0 24 | for file_path in input_file_paths: 25 | file_name = os.path.basename(file_path) 26 | file_size = os.stat(file_path).st_size 27 | file_lines = 0 28 | file_bytes_processed = 0 29 | created = None 30 | previous_timestamp = None 31 | inserts = [] 32 | for obj, line, file_bytes_processed in utils.read_obj_zst_meta(file_path): 33 | new_timestamp = int(obj['created_utc']) 34 | created = datetime.utcfromtimestamp(new_timestamp) 35 | if previous_timestamp is not None and previous_timestamp - (2) > new_timestamp: 36 | log.warning(f"Out of order timestamps {datetime.utcfromtimestamp(previous_timestamp).strftime('%Y-%m-%d %H:%M:%S')} - 4 hours > {created.strftime('%Y-%m-%d %H:%M:%S')}") 37 | previous_timestamp = new_timestamp 38 | file_lines += 1 39 | if file_lines % 100000 == 0: 40 | log.info(f"{files_processed}/{len(input_file_paths)}: {file_name} : {created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines:,} : {(file_bytes_processed / file_size) * 100:.0f}%") 41 | 42 | files_processed += 1 43 | log.info(f"{files_processed}/{len(input_file_paths)}: {file_name} : {created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines:,} : 100%") 44 | -------------------------------------------------------------------------------- /personal/diagnostic/test_files_multiprocess.py: -------------------------------------------------------------------------------- 1 | import zstandard 2 | import os 3 | import json 4 | import sys 5 | import time 6 | import argparse 7 | import re 8 | from collections import defaultdict 9 | from datetime import datetime 10 | import logging.handlers 11 | import multiprocessing 12 | 13 | 14 | # sets up logging to the console as well as a file 15 | log = logging.getLogger("bot") 16 | log.setLevel(logging.INFO) 17 | log_formatter = logging.Formatter('%(asctime)s - %(levelname)s: %(message)s') 18 | 19 | log_stderr_handler = logging.StreamHandler() 20 | log_stderr_handler.setFormatter(log_formatter) 21 | log.addHandler(log_stderr_handler) 22 | if not os.path.exists("logs"): 23 | os.makedirs("logs") 24 | log_file_handler = logging.handlers.RotatingFileHandler( 25 | os.path.join("logs", "bot.log"), maxBytes=1024*1024*16, backupCount=5) 26 | log_file_handler.setFormatter(log_formatter) 27 | log.addHandler(log_file_handler) 28 | 29 | 30 | # convenience object used to pass status information between processes 31 | class FileConfig: 32 | def __init__(self, input_path, output_path=None, complete=False, lines_processed=0, error_lines=0): 33 | self.input_path = input_path 34 | self.output_path = output_path 35 | self.file_size = os.stat(input_path).st_size 36 | self.complete = complete 37 | self.bytes_processed = self.file_size if complete else 0 38 | self.lines_processed = lines_processed if complete else 0 39 | self.error_message = None 40 | self.error_lines = error_lines 41 | 42 | def __str__(self): 43 | return f"{self.input_path} : {self.output_path} : {self.file_size} : {self.complete} : {self.bytes_processed} : {self.lines_processed}" 44 | 45 | 46 | # used for calculating running average of read speed 47 | class Queue: 48 | def __init__(self, max_size): 49 | self.list = [] 50 | self.max_size = max_size 51 | 52 | def put(self, item): 53 | if len(self.list) >= self.max_size: 54 | self.list.pop(0) 55 | self.list.append(item) 56 | 57 | def peek(self): 58 | return self.list[0] if len(self.list) > 0 else None 59 | 60 | 61 | # save file information and progress to a json file 62 | # we don't want to save the whole FileConfig object, since some 
info resets if we restart 63 | def save_file_list(input_files, status_json, script_type): 64 | simple_file_list = [] 65 | for file in input_files.values(): 66 | simple_file_list.append([file.input_path, file.output_path, file.complete, file.lines_processed, file.error_lines]) 67 | with open(status_json, 'w') as status_json_file: 68 | output_dict = { 69 | "files": simple_file_list, 70 | "type": script_type, 71 | } 72 | status_json_file.write(json.dumps(output_dict, indent=4)) 73 | 74 | 75 | # load file information from the json file and recalculate file sizes 76 | def load_file_list(status_json): 77 | if os.path.exists(status_json): 78 | with open(status_json, 'r') as status_json_file: 79 | output_dict = json.load(status_json_file) 80 | input_files = {} 81 | for simple_file in output_dict["files"]: 82 | input_files[simple_file[0]] = FileConfig(simple_file[0], simple_file[1], simple_file[2], simple_file[3], simple_file[4]) 83 | return input_files, output_dict["type"] 84 | else: 85 | return None, None 86 | 87 | 88 | # recursively decompress and decode a chunk of bytes. If there's a decode error then read another chunk and try with that, up to a limit of max_window_size bytes 89 | def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0): 90 | chunk = reader.read(chunk_size) 91 | bytes_read += chunk_size 92 | if previous_chunk is not None: 93 | chunk = previous_chunk + chunk 94 | try: 95 | return chunk.decode() 96 | except UnicodeDecodeError: 97 | if bytes_read > max_window_size: 98 | raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes") 99 | return read_and_decode(reader, chunk_size, max_window_size, chunk, bytes_read) 100 | 101 | 102 | # open a zst compressed ndjson file and yield lines one at a time 103 | # also passes back file progress 104 | def read_lines_zst(file_name): 105 | with open(file_name, 'rb') as file_handle: 106 | buffer = '' 107 | reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle) 108 | while True: 109 | chunk = read_and_decode(reader, 2**27, (2**29) * 2) 110 | if not chunk: 111 | break 112 | lines = (buffer + chunk).split("\n") 113 | 114 | for line in lines[:-1]: 115 | yield line, file_handle.tell() 116 | 117 | buffer = lines[-1] 118 | reader.close() 119 | 120 | 121 | def process_file(file, queue): 122 | try: 123 | for line, file_bytes_processed in read_lines_zst(file.input_path): 124 | try: 125 | obj = json.loads(line) 126 | observed = obj["created_utc"] 127 | # just load the json and try to access a field to make sure it works 128 | except (KeyError, json.JSONDecodeError) as err: 129 | file.error_lines += 1 130 | file.lines_processed += 1 131 | if file.lines_processed % 1000000 == 0: 132 | file.bytes_processed = file_bytes_processed 133 | queue.put(file) 134 | 135 | file.complete = True 136 | file.bytes_processed = file.file_size 137 | except Exception as err: 138 | file.error_message = str(err) 139 | queue.put(file) 140 | 141 | 142 | def process_update(input_files, queue, last_log_time, force_write): 143 | file_update = queue.get() 144 | if file_update.error_message is not None: 145 | log.warning(f"File failed {file_update.input_path}: {file_update.error_message}") 146 | current_time = time.time() 147 | 148 | input_files[file_update.input_path] = file_update 149 | if force_write or last_log_time is None or (current_time - last_log_time) > 5 or queue.empty(): 150 | total_lines_processed = 0 151 | total_bytes_processed = 0 152 | total_lines_errored = 0 153 | files_processed = 0 
154 | files_errored = 0 155 | i = 0 156 | for file in input_files.values(): 157 | total_lines_processed += file.lines_processed 158 | total_bytes_processed += file.bytes_processed 159 | total_lines_errored += file.error_lines 160 | files_processed += 1 if file.complete or file.error_message is not None else 0 161 | files_errored += 1 if file.error_message is not None else 0 162 | i += 1 163 | if file_update.complete or file_update.error_message is not None: 164 | save_file_list(input_files, status_json, script_type) 165 | progress_queue.put([current_time, total_lines_processed, total_bytes_processed]) 166 | 167 | first_time, first_lines, first_bytes = progress_queue.peek() 168 | bytes_per_second = int((total_bytes_processed - first_bytes)/(current_time - first_time)) 169 | speed_queue.put(bytes_per_second) 170 | seconds_left = int((total_bytes - total_bytes_processed) / int(sum(speed_queue.list) / len(speed_queue.list))) 171 | minutes_left = int(seconds_left / 60) 172 | hours_left = int(minutes_left / 60) 173 | days_left = int(hours_left / 24) 174 | 175 | log.info( 176 | f"{total_lines_processed:,} lines at {(total_lines_processed - first_lines)/(current_time - first_time):,.0f}/s, {total_lines_errored:,} errored : " 177 | f"{(total_bytes_processed / (2**30)):.2f} gb at {(bytes_per_second / (2**20)):,.0f} mb/s, {(total_bytes_processed / total_bytes) * 100:.0f}% : " 178 | f"{files_processed}({files_errored})/{len(input_files)} files : " 179 | f"{(str(days_left) + 'd ' if days_left > 0 else '')}{hours_left - (days_left * 24)}:{minutes_left - (hours_left * 60):02}:{seconds_left - (minutes_left * 60):02} remaining : " 180 | f"{queue.qsize()} files in queue : {current_time} : {last_log_time} : {current_time - last_log_time if last_log_time is not None else 0} : " 181 | f"{(current_time - last_log_time) > 5 if last_log_time is not None else 0} : {int(current_time - last_log_time) > 5 if last_log_time is not None else 0} : " 182 | f"{last_log_time is None or (current_time - last_log_time) > 5 or queue.empty()} : {queue.empty()}") 183 | last_log_time = time.time() 184 | return last_log_time 185 | 186 | 187 | if __name__ == '__main__': 188 | parser = argparse.ArgumentParser(description="Use multiple processes to decompress and iterate over pushshift dump files") 189 | parser.add_argument("input", help="The input folder to recursively read files from") 190 | parser.add_argument("--processes", help="Number of processes to use", default=10, type=int) 191 | parser.add_argument("--debug", help="Enable debug logging", action='store_const', const=True, default=False) 192 | script_type = "test" 193 | 194 | args = parser.parse_args() 195 | 196 | if args.debug: 197 | log.setLevel(logging.DEBUG) 198 | 199 | log.info(f"Loading files from: {args.input}") 200 | 201 | multiprocessing.set_start_method('spawn') 202 | queue = multiprocessing.Manager().Queue() 203 | status_json = "status.json" 204 | input_files, saved_type = load_file_list(status_json) 205 | 206 | if saved_type and saved_type != script_type: 207 | log.warning(f"Script type doesn't match type from json file. 
Delete working folder") 208 | sys.exit(0) 209 | 210 | # if the file list wasn't loaded from the json, this is the first run, find what files we need to process 211 | if input_files is None: 212 | input_files = {} 213 | for subdir, dirs, files in os.walk(args.input): 214 | files.sort() 215 | for file_name in files: 216 | if file_name.endswith(".zst"): 217 | input_path = os.path.join(subdir, file_name) 218 | input_files[input_path] = FileConfig(input_path) 219 | 220 | save_file_list(input_files, status_json, script_type) 221 | else: 222 | log.info(f"Existing input file was read, if this is not correct you should delete the {status_json} folder and run this script again") 223 | 224 | files_processed = 0 225 | total_bytes = 0 226 | total_bytes_processed = 0 227 | total_lines_processed = 0 228 | total_lines_errored = 0 229 | files_to_process = [] 230 | # calculate the total file size for progress reports, build a list of incomplete files to process 231 | # do this largest to smallest by file size so that we aren't processing a few really big files with only a few threads at the end 232 | for file in sorted(input_files.values(), key=lambda item: item.file_size, reverse=True): 233 | total_bytes += file.file_size 234 | if file.complete: 235 | files_processed += 1 236 | total_lines_processed += file.lines_processed 237 | total_bytes_processed += file.file_size 238 | total_lines_errored += file.error_lines 239 | else: 240 | files_to_process.append(file) 241 | 242 | log.info(f"Processed {files_processed} of {len(input_files)} files with {(total_bytes_processed / (2**30)):.2f} of {(total_bytes / (2**30)):.2f} gigabytes") 243 | 244 | start_time = time.time() 245 | last_log_time = None 246 | if len(files_to_process): 247 | progress_queue = Queue(40) 248 | progress_queue.put([start_time, total_lines_processed, total_bytes_processed]) 249 | speed_queue = Queue(40) 250 | for file in files_to_process: 251 | log.debug(f"Processing file: {file.input_path}") 252 | # start the workers 253 | with multiprocessing.Pool(processes=min(args.processes, len(files_to_process))) as pool: 254 | workers = pool.starmap_async(process_file, [(file, queue) for file in files_to_process], chunksize=1, error_callback=log.info) 255 | while not workers.ready(): 256 | # loop until the workers are all done, pulling in status messages as they are sent 257 | last_log_time = process_update(input_files, queue, last_log_time, False) 258 | 259 | while not queue.empty(): 260 | 261 | 262 | log.info(f"{total_lines_processed:,}, {total_lines_errored} errored : {(total_bytes_processed / (2**30)):.2f} gb, {(total_bytes_processed / total_bytes) * 100:.0f}% : {files_processed}/{len(input_files)}") 263 | 264 | count_complete = 0 265 | count_incomplete = 0 266 | # build a list of output files to combine 267 | for file in input_files.values(): 268 | if not file.complete: 269 | if file.error_message is not None: 270 | log.info(f"File {file.input_path} errored {file.error_message}") 271 | else: 272 | log.info(f"File {file.input_path} is not marked as complete") 273 | count_incomplete += 1 274 | else: 275 | if file.error_lines > 0: 276 | log.info(f"File {file.input_path} has {file.error_lines:,} errored lines out of {file.lines_processed:,}") 277 | count_incomplete += 1 278 | else: 279 | count_complete += 1 280 | 281 | if count_incomplete > 0: 282 | log.info(f"{count_incomplete} files were not completed, errored or don't exist, something went wrong. 
Aborting") 283 | else: 284 | log.info(f"Processing complete, {count_complete} successful files") 285 | -------------------------------------------------------------------------------- /personal/mongo/export_mongo.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import utils 4 | import discord_logging 5 | import pymongo 6 | import time 7 | import sys 8 | from datetime import datetime 9 | 10 | log = discord_logging.init_logging() 11 | 12 | 13 | if __name__ == "__main__": 14 | mongo_address = sys.argv[1] # 192.168.1.131 15 | client = pymongo.MongoClient(f"mongodb://{mongo_address}:27017", serverSelectionTimeoutMS=5000) 16 | log.info(f"Database connected at {mongo_address} on {client.admin.command('serverStatus')['host']}") 17 | 18 | subreddits = [ 19 | "PersonalFinanceCanada" 20 | ] 21 | start_date = datetime(2020, 1, 1) 22 | end_date = datetime(2021, 1, 1) 23 | 24 | for subreddit in subreddits: 25 | count = 0 26 | start_time = time.time() 27 | cursor = client.reddit_database.comments.find( 28 | filter={"subreddit": subreddit, "created_utc": {"$gte": int(start_date.timestamp()), "$lt": int(end_date.timestamp())}}, 29 | projection={'_id': False}, 30 | sort=[('created_utc', pymongo.ASCENDING)] 31 | ) 32 | log.info(f"Got cursor in {int(time.time() - start_time)} seconds") 33 | 34 | output_writer = utils.OutputZst(r"\\MYCLOUDPR4100\Public\reddit_final\{0}_comments.zst".format(subreddit)) 35 | start_time = time.time() 36 | for comment in cursor: 37 | count += 1 38 | output_writer.write(json.dumps(comment, separators=(',', ':'))) 39 | output_writer.write("\n") 40 | if count % 10000 == 0: 41 | log.info(f"{count:,} through {datetime.utcfromtimestamp(int(comment['created_utc'])).strftime('%Y-%m-%d %H:%M:%S')} in {int(time.time() - start_time)} seconds r/{subreddit}") 42 | 43 | output_writer.close() 44 | log.info(f"{count:,} in {int(time.time() - start_time)} seconds r/{subreddit}") 45 | 46 | 47 | # db.comments.createIndex({subreddit:1}) // remove 48 | # db.comments.createIndex({subreddit:1, created_utc:1}) 49 | # db.comments.createIndex({author:1, created_utc:1}) 50 | # db.comments.createIndex({id:1}) 51 | # db.submissions.createIndex({subreddit:1, created_utc:1}) 52 | # db.submissions.createIndex({author:1, created_utc:1}) 53 | # db.submissions.createIndex({id:1}) 54 | # db.submissions.createIndex({created_utc:1}) 55 | # db.comments.createIndex({created_utc:1}) 56 | -------------------------------------------------------------------------------- /personal/mongo/group_subs.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import datetime 3 | import utils 4 | import discord_logging 5 | import pymongo 6 | import time 7 | import sys 8 | 9 | log = discord_logging.init_logging() 10 | 11 | 12 | if __name__ == "__main__": 13 | mongo_address = sys.argv[1] # 192.168.1.131 14 | client = pymongo.MongoClient(f"mongodb://{mongo_address}:27017", serverSelectionTimeoutMS=5000) 15 | log.info(f"Database connected at {mongo_address} on {client.admin.command('serverStatus')['host']}") 16 | 17 | count = 0 18 | start_time = time.time() 19 | start_date = int(datetime(2021, 6, 1).timestamp()) 20 | cursor = client.reddit_database.submissions.aggregate( 21 | [ 22 | {"$match": {"created_utc": {"$gt": start_date}}}, 23 | {"$project": {"subreddit": 1, "over_18": {"$cond": ["$over_18", 1, 0]}}}, 24 | {"$group": {"_id": "$subreddit", "countTotal": {"$count": {}}, "countNsfw": {"$sum": "$over_18"}}}, 25 | 
{"$match": {"countTotal": {"$gt": 100}}}, 26 | ], 27 | allowDiskUse=True 28 | ) 29 | log.info(f"Got cursor in {int(time.time() - start_time)} seconds") 30 | 31 | start_time = time.time() 32 | subreddits = [] 33 | for subreddit in cursor: 34 | subreddit['percent'] = int((subreddit['countNsfw']/subreddit['countTotal'])*100) 35 | if subreddit['percent'] >= 10: 36 | subreddits.append(subreddit) 37 | count += 1 38 | if count % 100000 == 0: 39 | log.info(f"{count:,} in {int(time.time() - start_time)} seconds") 40 | 41 | log.info(f"{count:,} in {int(time.time() - start_time)} seconds") 42 | 43 | file_out = open(r"\\MYCLOUDPR4100\Public\reddit_final\subreddits.txt", 'w') 44 | for subreddit in sorted(subreddits, key=lambda item: (item['percent'], item['countTotal']), reverse=True): 45 | file_out.write(f"{subreddit['_id']: <22}{subreddit['countTotal']: <8}{subreddit['countNsfw']: <8}{subreddit['percent']}%\n") 46 | file_out.close() 47 | 48 | 49 | # db.comments.createIndex({subreddit:1}) // remove 50 | # db.comments.createIndex({subreddit:1, created_utc:1}) 51 | # db.comments.createIndex({author:1, created_utc:1}) 52 | # db.comments.createIndex({id:1}) 53 | # db.submissions.createIndex({subreddit:1, created_utc:1}) 54 | # db.submissions.createIndex({author:1, created_utc:1}) 55 | # db.submissions.createIndex({id:1}) 56 | # db.submissions.createIndex({created_utc:1}) 57 | # db.comments.createIndex({created_utc:1}) 58 | -------------------------------------------------------------------------------- /personal/mongo/insert_mongo.py: -------------------------------------------------------------------------------- 1 | import utils 2 | import discord_logging 3 | import os 4 | import pymongo 5 | import sys 6 | from datetime import datetime 7 | 8 | log = discord_logging.init_logging() 9 | 10 | 11 | if __name__ == "__main__": 12 | mongo_address = sys.argv[1] # 192.168.1.131 13 | client = pymongo.MongoClient(f"mongodb://{mongo_address}:27017", serverSelectionTimeoutMS=5000) 14 | 15 | log.info(f"Database connected at {mongo_address} on {client.admin.command('serverStatus')['host']}") 16 | 17 | object_type = sys.argv[2] 18 | input_folder = sys.argv[3] 19 | input_files = [] 20 | total_size = 0 21 | for subdir, dirs, files in os.walk(input_folder + os.sep + object_type): 22 | files.sort() 23 | for filename in files: 24 | input_path = os.path.join(subdir, filename) 25 | if input_path.endswith(".zst"): 26 | file_size = os.stat(input_path).st_size 27 | total_size += file_size 28 | input_files.append([input_path, file_size]) 29 | 30 | log.info(f"Processing {len(input_files)} files of {(total_size / (2 ** 30)):.2f} gigabytes") 31 | 32 | collection = client.reddit_database[object_type] 33 | 34 | log.info(f"Using collection {object_type} which has {collection.estimated_document_count()} objects already") 35 | 36 | total_lines = 0 37 | total_bytes_processed = 0 38 | for input_file in input_files: 39 | file_lines = 0 40 | file_bytes_processed = 0 41 | created = None 42 | inserts = [] 43 | for obj, line, file_bytes_processed in utils.read_obj_zst_meta(input_file[0]): 44 | inserts.append(obj) 45 | if len(inserts) >= 10000: 46 | collection.insert_many(inserts) 47 | inserts = [] 48 | 49 | created = datetime.utcfromtimestamp(int(obj['created_utc'])) 50 | file_lines += 1 51 | if file_lines == 1: 52 | log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines + total_lines:,} : 0% : {(total_bytes_processed / total_size) * 100:.0f}%") 53 | if file_lines % 100000 == 0: 54 | log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : 
{file_lines + total_lines:,} : {(file_bytes_processed / input_file[1]) * 100:.0f}% : {(total_bytes_processed / total_size) * 100:.0f}%") 55 | 56 | if len(inserts) >= 0: 57 | collection.insert_many(inserts) 58 | total_lines += file_lines 59 | total_bytes_processed += input_file[1] 60 | log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {total_lines:,} : 100% : {(total_bytes_processed / total_size) * 100:.0f}%") 61 | 62 | log.info(f"Total: {total_lines}") 63 | -------------------------------------------------------------------------------- /personal/move/copy_listed_files.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import os 3 | import logging.handlers 4 | import re 5 | 6 | log = logging.getLogger("bot") 7 | log.setLevel(logging.DEBUG) 8 | log.addHandler(logging.StreamHandler()) 9 | 10 | if __name__ == '__main__': 11 | input_folder = r"\\MYCLOUDPR4100\Public\pushshift_output" 12 | output_folder = r"\\MYCLOUDPR4100\Public\request" 13 | subs = ['PoliticalDiscussion', 'worldnews', 'science'] 14 | overwrite = False 15 | 16 | lower_subs = set() 17 | for sub in subs: 18 | lower_subs.add(sub.lower()) 19 | 20 | matched_subs = set() 21 | total_size = 0 22 | for file_name in os.listdir(input_folder): 23 | file_path = os.path.join(input_folder, file_name) 24 | if file_name.endswith(".zst") and os.path.isfile(file_path): 25 | match = re.match(r"(\w+)(?:_(?:comments|submissions).zst)", file_name) 26 | if match: 27 | sub_cased = match.group(1) 28 | if sub_cased.lower() in lower_subs: 29 | matched_subs.add(sub_cased) 30 | file_size = os.stat(file_path).st_size 31 | total_size += file_size 32 | log.info(f"Copying {file_name} : {(file_size / (2**20)):,.0f} mb : {(total_size / (2**20)):,.0f} mb") 33 | output_path = os.path.join(output_folder, file_name) 34 | if overwrite or not os.path.exists(output_path): 35 | shutil.copy(file_path, output_path) 36 | 37 | log.info(f"Copied {len(matched_subs)}/{len(subs)} subs of total size {(total_size / (2**20)):,.0f} mb") 38 | if len(matched_subs) != len(lower_subs): 39 | lower_matched_subs = [sub.lower() for sub in matched_subs] 40 | for sub in lower_subs: 41 | if sub not in lower_matched_subs: 42 | log.info(f"Missing r/{sub}") 43 | 44 | sorted_case_subs = sorted(matched_subs) 45 | bldr = ['torrenttools create -a "https://academictorrents.com/announce.php" -c "Comments and submissions from r/'] 46 | bldr.append(', r/'.join(sorted_case_subs)) 47 | bldr.append(' through the end of 2022" --include ".*') 48 | bldr.append('.*zst" --include ".*'.join(sorted_case_subs)) 49 | bldr.append('.*zst" -o username.torrent reddit') 50 | log.info(''.join(bldr)) 51 | -------------------------------------------------------------------------------- /personal/move/move_files.py: -------------------------------------------------------------------------------- 1 | import os 2 | import discord_logging 3 | import re 4 | from datetime import datetime 5 | 6 | log = discord_logging.init_logging() 7 | 8 | 9 | if __name__ == "__main__": 10 | parent_folder = r"\\MYCLOUDPR4100\Public\ingest" 11 | folders = [r"ingest\comments",r"ingest\submissions",r"rescan\comments",r"rescan\submissions"] 12 | reg = re.compile(r"\d\d-\d\d-\d\d_\d\d-\d\d") 13 | for folder in folders: 14 | files = [] 15 | created_date_folders = set() 16 | folder_path = os.path.join(parent_folder, folder) 17 | for file in os.listdir(folder_path): 18 | file_path = os.path.join(folder_path, file) 19 | if file.endswith(".zst"): 20 | files.append(file) 21 | log.info(f"{folder}: 
{len(files):,}") 22 | 23 | count_moved = 0 24 | for file in files: 25 | match = reg.search(file) 26 | if not match: 27 | log.info(f"File doesn't match regex: {file}") 28 | continue 29 | file_date = datetime.strptime(match.group(), '%y-%m-%d_%H-%M') 30 | date_folder_name = file_date.strftime('%y-%m-%d') 31 | date_folder_path = os.path.join(folder_path, date_folder_name) 32 | if date_folder_name not in created_date_folders: 33 | log.info(f"Creating folder: {date_folder_path}") 34 | if not os.path.exists(date_folder_path): 35 | os.makedirs(date_folder_path) 36 | created_date_folders.add(date_folder_name) 37 | old_file_path = os.path.join(folder_path, file) 38 | new_file_path = os.path.join(date_folder_path, file) 39 | os.rename(old_file_path, new_file_path) 40 | count_moved += 1 41 | if count_moved % 100 == 0: 42 | log.info(f"{count_moved:,}/{len(files):,}: {folder}") 43 | log.info(f"{count_moved:,}/{len(files):,}: {folder}") 44 | -------------------------------------------------------------------------------- /personal/move/rename_files.py: -------------------------------------------------------------------------------- 1 | import os 2 | import discord_logging 3 | import re 4 | from datetime import datetime 5 | 6 | log = discord_logging.init_logging() 7 | 8 | 9 | if __name__ == "__main__": 10 | parent_folder = r"\\MYCLOUDPR4100\Public\ingest\combined\comments" 11 | files = [] 12 | for folder_name in os.listdir(parent_folder): 13 | folder = os.path.join(parent_folder, folder_name) 14 | for file in os.listdir(folder): 15 | file_path = os.path.join(parent_folder, folder, file) 16 | if file.endswith(".zst"): 17 | files.append((folder, file)) 18 | log.info(f"{parent_folder}: {len(files):,}") 19 | 20 | count_moved = 0 21 | for folder, old_file in files: 22 | old_path = os.path.join(folder, old_file) 23 | new_file = old_file.replace("RS_", "RC_") 24 | new_path = os.path.join(folder, new_file) 25 | 26 | os.rename(old_path, new_path) 27 | count_moved += 1 28 | if count_moved % 100 == 0: 29 | log.info(f"{count_moved:,}/{len(files):,}: {folder}") 30 | log.info(f"{count_moved:,}/{len(files):,}") 31 | -------------------------------------------------------------------------------- /personal/opt_in_quarantined.py: -------------------------------------------------------------------------------- 1 | import asyncpraw 2 | import requests 3 | import asyncio 4 | 5 | 6 | async def opt_in(reddit, subreddit_name): 7 | subreddit = await reddit.subreddit(subreddit_name) 8 | await subreddit.quaran.opt_in() 9 | 10 | 11 | async def main(subreddits): 12 | reddit = asyncpraw.Reddit("Watchful12") 13 | for subreddit_name in subreddits: 14 | print(f"r/{subreddit_name}") 15 | try: 16 | subreddit = await reddit.subreddit(subreddit_name) 17 | await subreddit.quaran.opt_in() 18 | except Exception as err: 19 | print(f"Error opting into r/{subreddit_name} : {err}") 20 | await reddit.close() 21 | 22 | 23 | if __name__ == "__main__": 24 | subreddits = requests.get("https://pastebin.com/raw/WKi36t1w").text.split("\r\n") 25 | asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy()) 26 | asyncio.run(main(subreddits)) 27 | -------------------------------------------------------------------------------- /personal/process_month.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('personal') 3 | sys.path.append('combine') 4 | sys.path.append('personal/combine') 5 | 6 | import os 7 | import argparse 8 | import json 9 | import time 10 | import 
logging.handlers 11 | import requests 12 | import praw 13 | import traceback 14 | from datetime import datetime, timedelta 15 | import multiprocessing_logging 16 | 17 | import discord_logging 18 | import multiprocessing 19 | 20 | log = discord_logging.init_logging() 21 | discord_logging.init_discord_logging( 22 | section_name="Watchful12", 23 | log_level=logging.WARNING, 24 | ) 25 | multiprocessing_logging.install_mp_handler(log) 26 | 27 | import utils 28 | from transform import split_blocks_by_minutes 29 | from combine.merge_and_backfill import build_day, IngestType, ObjectType 30 | from combine import build_month 31 | 32 | 33 | def get_pushshift_token(old_token): 34 | global pushshift_lock 35 | pushshift_lock.acquire() 36 | saved_token = load_pushshift_token() 37 | if saved_token is None or saved_token == "" or old_token == saved_token: 38 | if old_token is None: 39 | log.warning("No saved or passed in token") 40 | save_pushshift_token("") 41 | raise ValueError("No saved or passed in token") 42 | 43 | log.info(f"Requesting new token") 44 | result_token = re_auth_pushshift(old_token) 45 | save_pushshift_token(result_token) 46 | else: 47 | result_token = saved_token 48 | 49 | pushshift_lock.release() 50 | return result_token 51 | 52 | 53 | def save_pushshift_token(token): 54 | with open("pushshift.txt", 'w') as file: 55 | file.write(token) 56 | 57 | 58 | def load_pushshift_token(): 59 | if not os.path.exists("pushshift.txt"): 60 | return None 61 | with open("pushshift.txt", 'r') as file: 62 | token = file.read().strip() 63 | return token 64 | 65 | 66 | def re_auth_pushshift(old_token): 67 | url = f"https://auth.pushshift.io/refresh?access_token={old_token}" 68 | log.warning(f"Reauth request: {url}") 69 | response = requests.post(url) 70 | result = response.json() 71 | log.warning(f"Reauth response: {str(result)}") 72 | discord_logging.flush_discord() 73 | if 'access_token' in result: 74 | new_token = result['access_token'] 75 | log.warning(f"New pushshift token: {new_token}") 76 | save_pushshift_token(new_token) 77 | discord_logging.flush_discord() 78 | return new_token 79 | elif 'detail' in result: 80 | if result['detail'] == 'Access token is still active and can not be refreshed.': 81 | log.warning(f"Access token still active, trying request again") 82 | time.sleep(5) 83 | return old_token 84 | 85 | log.warning(f"Reauth failed: {result['detail']}") 86 | discord_logging.flush_discord() 87 | return old_token 88 | else: 89 | log.warning(f"Something went wrong re-authing") 90 | discord_logging.flush_discord() 91 | return old_token 92 | 93 | 94 | def init(p_lock): 95 | global pushshift_lock 96 | pushshift_lock = p_lock 97 | 98 | 99 | def save_status(status_json, stages, month): 100 | log.debug(f"Saving status: {stages}") 101 | output_dict = { 102 | "stages": stages, 103 | "month": month, 104 | } 105 | json_string = json.dumps(output_dict, indent=4, default=str) 106 | with open(status_json, 'w') as status_json_file: 107 | status_json_file.write(json_string) 108 | 109 | 110 | def load_status(status_json): 111 | if os.path.exists(status_json): 112 | with open(status_json, 'r') as status_json_file: 113 | output_dict = json.load(status_json_file) 114 | for stage_type, stage in output_dict["stages"].items(): 115 | if stage["merge"] is not None: 116 | stage["merge"] = datetime.strptime(stage["merge"], "%Y-%m-%d %H:%M:%S") 117 | return output_dict["stages"], output_dict["month"] 118 | else: 119 | stages = { 120 | "comment": { 121 | "split": False, 122 | "merge": None, # 24-02-01 123 | "build": False, 
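# per-type stage tracker persisted to the status json: "split" and "build" are booleans, while "merge" holds the last merged day as a datetime (e.g. serialized as "2024-02-02 00:00:00" by save_status, or None if merging hasn't started)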
124 | }, 125 | "submission": { 126 | "split": False, 127 | "merge": None, # 24-02-01 128 | "build": False, 129 | } 130 | } 131 | return stages, None 132 | 133 | 134 | def end_of_day(input_minute): 135 | return input_minute.replace(hour=0, minute=0, second=0) + timedelta(days=1) 136 | 137 | 138 | def process(queue, base_folder, month, file_type, type_stages, reddit_username, compression_level, ignore_ids): 139 | try: 140 | # for stage, status in type_stages.items(): 141 | # log.info(f"{file_type} {stage}: {status}") 142 | file_prefix = "RC" if file_type == "comment" else "RS" 143 | if not type_stages["split"]: 144 | original_split_file = os.path.join(base_folder, "reddit", "blocks", f"{file_prefix}_20{month}.zst") 145 | split_file = os.path.join(base_folder, "reddit", "blocks", f"{file_prefix}B_20{month}.zst") 146 | if os.path.exists(original_split_file): 147 | os.rename(original_split_file, split_file) 148 | 149 | if not os.path.exists(split_file): 150 | log.info(f"{file_type}: File {split_file} doesn't exist, checking for blocks") 151 | split_file = os.path.join(base_folder, "reddit", "blocks", f"{file_prefix}_20{month}.zst_blocks") 152 | if not os.path.exists(split_file): 153 | log.error(f"{file_type}: File {split_file} doesn't exist, aborting") 154 | return False 155 | 156 | split_folder = os.path.join(base_folder, "ingest", "download") 157 | 158 | log.info(f"{file_type}: Starting {file_type} split") 159 | log.info(f"{file_type}: Reading from: {split_file}") 160 | log.info(f"{file_type}: Writing to: {split_folder}") 161 | split_blocks_by_minutes.split_by_minutes(split_file, split_folder) 162 | 163 | log.warning(f"{file_type}: {file_type} split complete") 164 | discord_logging.flush_discord() 165 | queue.put((file_type, "split", True)) 166 | 167 | start_date = datetime.strptime(month, "%y-%m") 168 | if start_date.month == 12: 169 | end_date = start_date.replace(year=start_date.year + 1, month=1) 170 | else: 171 | end_date = start_date.replace(month=start_date.month + 1) 172 | if type_stages["merge"] is None or type_stages["merge"] < end_date: 173 | if type_stages["merge"] is not None: 174 | start_date = type_stages["merge"] 175 | 176 | log.info(f"{file_type}: Starting {file_type} merge from {start_date}") 177 | 178 | reddit = praw.Reddit(reddit_username) 179 | 180 | input_folders = [ 181 | (os.path.join(base_folder, "ingest", "ingest"), IngestType.INGEST), 182 | (os.path.join(base_folder, "ingest", "rescan"), IngestType.RESCAN), 183 | (os.path.join(base_folder, "ingest", "download"), IngestType.DOWNLOAD), 184 | ] 185 | for input_folder in input_folders: 186 | log.info(f"{file_type}: Reading from: {input_folder[0]} : {input_folder[1]}") 187 | combined_folder = os.path.join(base_folder, "ingest", "combined") 188 | log.info(f"{file_type}: Writing to: {combined_folder}") 189 | while start_date < end_date: 190 | build_day( 191 | start_date, 192 | input_folders, 193 | combined_folder, 194 | ObjectType.COMMENT if file_type == "comment" else ObjectType.SUBMISSION, 195 | reddit, 196 | ignore_ids, 197 | get_pushshift_token 198 | ) 199 | start_date = end_of_day(start_date) 200 | queue.put((file_type, "merge", start_date)) 201 | log.warning(f"{file_type}: {file_type} merge complete") 202 | discord_logging.flush_discord() 203 | 204 | if not type_stages["build"]: 205 | log.info(f"{file_type}: Starting {file_type} build") 206 | start_date = datetime.strptime(month, "%y-%m") 207 | 208 | input_folder = os.path.join(base_folder, "ingest", "combined") 209 | output_folder = os.path.join(base_folder, 
"reddit") 210 | log.info(f"{file_type}: Reading from: {input_folder}") 211 | log.info(f"{file_type}: Writing to: {output_folder}") 212 | build_month.build_month( 213 | start_date, 214 | input_folder, 215 | output_folder, 216 | file_type+"s", 217 | compression_level 218 | ) 219 | queue.put((file_type, "build", True)) 220 | log.warning(f"{file_type}: {file_type} build complete") 221 | discord_logging.flush_discord() 222 | 223 | log.warning(f"{file_type}: {file_type} all steps complete") 224 | 225 | log.info(f'torrenttools create -a "https://academictorrents.com/announce.php" -c "Reddit comments and submissions from 20{month}" --include ".*(comments|submissions).*R._20{month}.zst$" -o reddit_20{month}.torrent reddit') 226 | 227 | discord_logging.flush_discord() 228 | 229 | # for stage, status in type_stages.items(): 230 | # log.info(f"{file_type} {stage}: {status}") 231 | except Exception as err: 232 | log.warning(f"Error in {type}: {err}") 233 | log.warning(traceback.format_exc()) 234 | queue.put((file_type, "error", str(err))) 235 | discord_logging.flush_discord() 236 | # for stage, status in type_stages.items(): 237 | # log.info(f"{file_type} {stage}: {status}") 238 | 239 | 240 | if __name__ == "__main__": 241 | parser = argparse.ArgumentParser(description="") 242 | parser.add_argument('month', help='Month to process') 243 | parser.add_argument('folder', help='Folder under which all the files are stored') 244 | parser.add_argument("--ignore_ids", help="Ignore ids between the id ranges listed", default=None) 245 | parser.add_argument("--level", help="The compression ratio to output at", default="22") 246 | args = parser.parse_args() 247 | 248 | ignore_ids = [] 249 | if args.ignore_ids is not None: 250 | for id_range in args.ignore_ids.split(","): 251 | start_id, end_id = id_range.split("-") 252 | ignore_ids.append((utils.base36decode(start_id), utils.base36decode(end_id))) 253 | 254 | log.warning(f"Processing {args.month}") 255 | discord_logging.flush_discord() 256 | 257 | status_file = "process.json" 258 | stages, month = load_status(status_file) 259 | 260 | if month is not None and args.month != month: 261 | log.error(f"Month does not match saved month, aborting: {month} : {args.month}") 262 | sys.exit(0) 263 | month = args.month 264 | log.info(f"Processing {month}") 265 | level = int(args.level) 266 | log.info(f"Compression level: {level}") 267 | 268 | multiprocessing.set_start_method('spawn', force=True) 269 | queue = multiprocessing.Manager().Queue() 270 | p_lock = multiprocessing.Lock() 271 | with multiprocessing.Pool(processes=2, initializer=init, initargs=(p_lock,)) as pool: 272 | arguments = [] 273 | for file_type, type_stages in stages.items(): 274 | arguments.append((queue, args.folder, month, file_type, type_stages, "Watchful12", level, ignore_ids)) 275 | workers = pool.starmap_async(process, arguments, chunksize=1, error_callback=log.info) 276 | while not workers.ready() or not queue.empty(): 277 | file_type, stage, status = queue.get() 278 | if stage == "error": 279 | log.error(f"Error in {file_type}: {status}") 280 | stages[file_type][stage] = status 281 | save_status(status_file, stages, month) 282 | discord_logging.flush_discord() 283 | #log.info(f"workers {workers.ready()} : queue {queue.empty()}") 284 | discord_logging.flush_discord() 285 | -------------------------------------------------------------------------------- /personal/transform/split_blocks_by_minutes.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 
sys.path.append('personal') 3 | 4 | import discord_logging 5 | import os 6 | import zstandard 7 | from datetime import datetime 8 | import json 9 | import argparse 10 | 11 | log = discord_logging.get_logger(init=True) 12 | 13 | import utils 14 | 15 | NEWLINE_ENCODED = "\n".encode('utf-8') 16 | 17 | 18 | def split_by_minutes(input_file, output_file): 19 | file_type = "comments" if "RC" in input_file else "submissions" 20 | 21 | log.info(f"{file_type}: Input file: {input_file}") 22 | log.info(f"{file_type}: Output folder: {output_file}") 23 | previous_minute, output_handle, created_utc = None, None, None 24 | count_objects, count_minute = 0, 0 25 | if input_file.endswith(".zst"): 26 | reader = utils.read_obj_zst(input_file) 27 | elif input_file.endswith(".zst_blocks"): 28 | reader = utils.read_obj_zst_blocks(input_file) 29 | else: 30 | log.error(f"{file_type}: Unsupported file type: {input_file}") 31 | return 32 | for obj in reader: 33 | created_utc = datetime.utcfromtimestamp(obj["created_utc"]) 34 | current_minute = created_utc.replace(second=0) 35 | 36 | if previous_minute is None or current_minute > previous_minute: 37 | log.info(f"{file_type}: {created_utc.strftime('%y-%m-%d_%H-%M')}: {count_objects:,} : {count_minute: ,}") 38 | previous_minute = current_minute 39 | count_minute = 0 40 | if output_handle is not None: 41 | output_handle.close() 42 | 43 | output_path = os.path.join(output_file, file_type, created_utc.strftime('%y-%m-%d')) 44 | if not os.path.exists(output_path): 45 | os.makedirs(output_path) 46 | output_path = os.path.join(output_path, f"{('RC' if file_type == 'comments' else 'RS')}_{created_utc.strftime('%y-%m-%d_%H-%M')}.zst") 47 | output_handle = zstandard.ZstdCompressor().stream_writer(open(output_path, 'wb')) 48 | 49 | count_objects += 1 50 | count_minute += 1 51 | output_handle.write(json.dumps(obj, sort_keys=True).encode('utf-8')) 52 | output_handle.write(NEWLINE_ENCODED) 53 | 54 | if created_utc is None: 55 | log.error(f"{file_type}: {input_file} appears to be empty") 56 | sys.exit(1) 57 | log.info(f"{file_type}: {created_utc.strftime('%y-%m-%d_%H-%M')}: {count_objects:,} : {count_minute: ,}") 58 | if output_handle is not None: 59 | output_handle.close() 60 | 61 | 62 | if __name__ == "__main__": 63 | parser = argparse.ArgumentParser(description="Take a zst_blocks file and split it by minute chunks") 64 | parser.add_argument('--input', help='Input file', required=True) 65 | parser.add_argument('--output', help='Output folder', required=True) 66 | args = parser.parse_args() 67 | 68 | split_by_minutes(args.input, args.output) 69 | -------------------------------------------------------------------------------- /personal/transform/split_by_minutes.py: -------------------------------------------------------------------------------- 1 | import discord_logging 2 | import os 3 | import zstandard 4 | from datetime import datetime 5 | import json 6 | 7 | log = discord_logging.init_logging() 8 | 9 | import utils 10 | 11 | NEWLINE_ENCODED = "\n".encode('utf-8') 12 | 13 | 14 | if __name__ == "__main__": 15 | input_file = r"\\MYCLOUDPR4100\Public\RS_2023-09.zst" 16 | output_folder = r"\\MYCLOUDPR4100\Public\ingest\download" 17 | file_type = "comments" if "RC" in input_file else "submissions" 18 | 19 | log.info(f"Input: {input_file} - Output: {output_folder}") 20 | previous_minute, output_handle, created_utc = None, None, None 21 | count_objects, count_minute = 0, 0 22 | for obj in utils.read_obj_zst(input_file): 23 | created_utc = datetime.utcfromtimestamp(obj["created_utc"]) 
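# truncate the timestamp to the start of its minute; whenever the minute rolls over, the current output file is closed and a fresh one is opened for the new minute
# e.g. a submission created at 2023-09-01 00:05 UTC is written to <output_folder>/submissions/23-09-01/RS_23-09-01_00-05.zst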
24 | current_minute = created_utc.replace(second=0) 25 | 26 | if previous_minute is None or current_minute > previous_minute: 27 | log.info(f"{created_utc.strftime('%y-%m-%d_%H-%M')}: {count_objects:,} : {count_minute: ,}") 28 | previous_minute = current_minute 29 | count_minute = 0 30 | if output_handle is not None: 31 | output_handle.close() 32 | 33 | output_path = os.path.join(output_folder, file_type, created_utc.strftime('%y-%m-%d')) 34 | if not os.path.exists(output_path): 35 | os.makedirs(output_path) 36 | output_path = os.path.join(output_path, f"{('RC' if file_type == 'comments' else 'RS')}_{created_utc.strftime('%y-%m-%d_%H-%M')}.zst") 37 | output_handle = zstandard.ZstdCompressor().stream_writer(open(output_path, 'wb')) 38 | 39 | count_objects += 1 40 | count_minute += 1 41 | output_handle.write(json.dumps(obj, sort_keys=True).encode('utf-8')) 42 | output_handle.write(NEWLINE_ENCODED) 43 | 44 | log.info(f"{created_utc.strftime('%y-%m-%d_%H-%M')}: {count_objects:,} : {count_minute: ,}") 45 | if output_handle is not None: 46 | output_handle.close() 47 | -------------------------------------------------------------------------------- /personal/transform/split_by_subreddit.py: -------------------------------------------------------------------------------- 1 | import utils 2 | import discord_logging 3 | import os 4 | from datetime import datetime 5 | 6 | log = discord_logging.init_logging() 7 | 8 | 9 | if __name__ == "__main__": 10 | subreddits = {} 11 | field = 'subreddit' 12 | object_type = "submissions" 13 | folder = f"\\\\MYCLOUDPR4100\\Public\\reddit_final\\multisub_{object_type}" 14 | if not os.path.exists(folder): 15 | os.makedirs(folder) 16 | input_file = f"\\\\MYCLOUDPR4100\\Public\\reddit_final\\multisub_{object_type}.zst" 17 | input_file_size = os.stat(input_file).st_size 18 | total_lines = 0 19 | for comment, line, file_bytes_processed in utils.read_obj_zst_meta(input_file): 20 | if comment[field] not in subreddits: 21 | subreddits[comment[field]] = {'writer': utils.OutputZst(os.path.join(folder, comment[field] + f"_{object_type}.zst")), 'lines': 0} 22 | subreddit = subreddits[comment[field]] 23 | subreddit['writer'].write(line) 24 | subreddit['writer'].write("\n") 25 | subreddit['lines'] += 1 26 | total_lines += 1 27 | if total_lines % 100000 == 0: 28 | log.info(f"{total_lines:,} lines, {(file_bytes_processed / input_file_size) * 100:.0f}%") 29 | 30 | log.info(f"{total_lines:,} lines, 100%") 31 | 32 | for name, subreddit in subreddits.items(): 33 | log.info(f"r/{name}: {subreddit['lines']:,} lines") 34 | subreddit['writer'].close() 35 | -------------------------------------------------------------------------------- /personal/utils.py: -------------------------------------------------------------------------------- 1 | import zstandard 2 | import json 3 | import os 4 | from zst_blocks import ZstBlocksFile 5 | 6 | 7 | def read_obj_zst(file_name): 8 | with open(file_name, 'rb') as file_handle: 9 | buffer = '' 10 | reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle) 11 | while True: 12 | chunk = read_and_decode(reader, 2**27, (2**29) * 2) 13 | if not chunk: 14 | break 15 | lines = (buffer + chunk).split("\n") 16 | for line in lines[:-1]: 17 | if line == "": 18 | continue 19 | yield json.loads(line.strip()) 20 | 21 | buffer = lines[-1] 22 | reader.close() 23 | 24 | 25 | def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0): 26 | chunk = reader.read(chunk_size) 27 | bytes_read += chunk_size 28 | if 
previous_chunk is not None: 29 | chunk = previous_chunk + chunk 30 | try: 31 | return chunk.decode() 32 | except UnicodeDecodeError: 33 | if bytes_read > max_window_size: 34 | raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes") 35 | return read_and_decode(reader, chunk_size, max_window_size, chunk, bytes_read) 36 | 37 | 38 | def read_obj_zst_meta(file_name): 39 | with open(file_name, 'rb') as file_handle: 40 | buffer = '' 41 | reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle) 42 | while True: 43 | chunk = read_and_decode(reader, 2**27, (2**29) * 2) 44 | if not chunk: 45 | break 46 | lines = (buffer + chunk).split("\n") 47 | 48 | for line in lines[:-1]: 49 | line = line.strip() 50 | try: 51 | json_object = json.loads(line) 52 | except (KeyError, json.JSONDecodeError) as err: 53 | continue 54 | yield json_object, line, file_handle.tell() 55 | 56 | buffer = lines[-1] 57 | reader.close() 58 | 59 | 60 | class OutputZst: 61 | def __init__(self, file_name): 62 | output_file = open(file_name, 'wb') 63 | self.writer = zstandard.ZstdCompressor().stream_writer(output_file) 64 | 65 | def write(self, line): 66 | encoded_line = line.encode('utf-8') 67 | self.writer.write(encoded_line) 68 | 69 | def close(self): 70 | self.writer.close() 71 | 72 | def __enter__(self): 73 | return self 74 | 75 | def __exit__(self, exc_type, exc_value, exc_traceback): 76 | self.close() 77 | return True 78 | 79 | 80 | # copied from https://github.com/ArthurHeitmann/zst_blocks_format 81 | def read_obj_zst_blocks(file_name): 82 | with open(file_name, "rb") as file: 83 | for row in ZstBlocksFile.streamRows(file): 84 | line = row.decode() 85 | yield json.loads(line.strip()) 86 | 87 | 88 | def base36encode(integer: int) -> str: 89 | chars = '0123456789abcdefghijklmnopqrstuvwxyz' 90 | sign = '-' if integer < 0 else '' 91 | integer = abs(integer) 92 | result = '' 93 | while integer > 0: 94 | integer, remainder = divmod(integer, 36) 95 | result = chars[remainder] + result 96 | return sign + result 97 | 98 | 99 | def base36decode(base36: str) -> int: 100 | return int(base36, 36) 101 | 102 | 103 | def merge_lowest_highest_id(str_id, lowest_id, highest_id): 104 | int_id = base36decode(str_id) 105 | if lowest_id is None or int_id < lowest_id: 106 | lowest_id = int_id 107 | if highest_id is None or int_id > highest_id: 108 | highest_id = int_id 109 | return lowest_id, highest_id 110 | 111 | 112 | def chunk_list(items, chunk_size): 113 | for i in range(0, len(items), chunk_size): 114 | yield items[i:i + chunk_size] 115 | -------------------------------------------------------------------------------- /personal/zst_blocks.py: -------------------------------------------------------------------------------- 1 | # copied from https://github.com/ArthurHeitmann/zst_blocks_format 2 | 3 | from __future__ import annotations 4 | from dataclasses import dataclass 5 | import os 6 | import time 7 | import struct 8 | from typing import BinaryIO, Callable, Iterable, Literal 9 | from zstandard import ZstdDecompressor, ZstdCompressor 10 | 11 | _endian: Literal["little", "big"] = "little" 12 | 13 | _uint32Struct = struct.Struct(" bytes: 27 | file.seek(rowPosition.blockOffset) 28 | return ZstBlock.readRow(file, rowPosition.rowIndex) 29 | 30 | @staticmethod 31 | def readMultipleBlocks(file: BinaryIO, rowPositions: list[RowPosition]) -> \ 32 | list[bytes]: 33 | blockGroupsDict: dict[int, RowPositionGroup] = {} 34 | for i, rowPosition in enumerate(rowPositions): 35 | if rowPosition.blockOffset 
not in blockGroupsDict: 36 | blockGroupsDict[rowPosition.blockOffset] = RowPositionGroup( 37 | rowPosition.blockOffset, []) 38 | blockGroupsDict[rowPosition.blockOffset].rowIndices.append( 39 | RowIndex(rowPosition.rowIndex, i)) 40 | blockGroups = list(blockGroupsDict.values()) 41 | 42 | rows: list = [None] * len(rowPositions) 43 | for blockGroup in blockGroups: 44 | file.seek(blockGroup.blockOffset) 45 | blockRows = ZstBlock.readSpecificRows(file, map(lambda 46 | pair: pair.withinBlockIndex, 47 | blockGroup.rowIndices)) 48 | for originalPosition, row in zip(blockGroup.rowIndices, blockRows): 49 | rows[originalPosition.originalRowIndex] = row 50 | 51 | return rows 52 | 53 | @staticmethod 54 | def streamRows(file: BinaryIO, blockIndexProgressCallback: Callable[[ 55 | int], None] | None = None) -> Iterable[bytes]: 56 | fileSize = os.path.getsize(file.name) 57 | blockIndex = 0 58 | while file.tell() < fileSize: 59 | yield from ZstBlock.streamRows(file) 60 | blockIndex += 1 61 | if blockIndexProgressCallback is not None: 62 | blockIndexProgressCallback(blockIndex) 63 | 64 | @staticmethod 65 | def appendBlock(file: BinaryIO, rows: list[bytes], 66 | compressionLevel=_defaultCompressionLevel) -> None: 67 | file.seek(file.tell()) 68 | ZstBlock(rows).write(file, compressionLevel=compressionLevel) 69 | 70 | @staticmethod 71 | def writeStream(file: BinaryIO, rowStream: Iterable[bytes], blockSize: int, 72 | rowPositions: list[RowPosition] | None = None, 73 | compressionLevel=_defaultCompressionLevel) -> None: 74 | pendingRows = [] 75 | for row in rowStream: 76 | pendingRows.append(row) 77 | if len(pendingRows) >= blockSize: 78 | ZstBlock(pendingRows).write(file, rowPositions, 79 | compressionLevel=compressionLevel) 80 | pendingRows = [] 81 | if len(pendingRows) > 0: 82 | ZstBlock(pendingRows).write(file, rowPositions, 83 | compressionLevel=compressionLevel) 84 | 85 | @staticmethod 86 | def writeBlocksStream(file: BinaryIO, blocksStream: Iterable[list[bytes]], 87 | rowPositions: list[RowPosition] | None = None, 88 | compressionLevel=_defaultCompressionLevel) -> None: 89 | for rows in blocksStream: 90 | ZstBlock(rows).write(file, rowPositions, 91 | compressionLevel=compressionLevel) 92 | 93 | @staticmethod 94 | def countBlocks(file: BinaryIO) -> int: 95 | fileSize = os.path.getsize(file.name) 96 | blockCount = 0 97 | initialPos = file.tell() 98 | pos = initialPos 99 | while pos < fileSize: 100 | blockCount += 1 101 | blockSize = _uint32Struct.unpack(file.read(4))[0] 102 | pos += 4 + blockSize 103 | file.seek(pos) 104 | file.seek(initialPos) 105 | return blockCount 106 | 107 | @staticmethod 108 | def generateRowPositions(file: BinaryIO) -> Iterable[RowPosition]: 109 | fileSize = os.path.getsize(file.name) 110 | while file.tell() < fileSize: 111 | yield from ZstBlock.generateRowPositions(file) 112 | 113 | 114 | class ZstBlock: 115 | rows: list[bytes] 116 | 117 | def __init__(self, rows: list[bytes]): 118 | self.rows = rows 119 | 120 | @classmethod 121 | def streamRows(cls, file: BinaryIO) -> Iterable[bytes]: 122 | compressedSize = _uint32Struct.unpack(file.read(4))[0] 123 | compressedData = file.read(compressedSize) 124 | decompressedData = ZstdDecompressor().decompress(compressedData) 125 | 126 | memoryView = memoryview(decompressedData) 127 | count = _uint32Struct.unpack(memoryView[0:4])[0] 128 | rows: list[ZstRowInfo] = [None] * count 129 | for i in range(count): 130 | rows[i] = ZstRowInfo.read(memoryView, 4 + i * ZstRowInfo.structSize) 131 | 132 | dataStart = 4 + count * ZstRowInfo.structSize 133 | 
for row in rows: 134 | yield decompressedData[ 135 | dataStart + row.offset: dataStart + row.offset + row.size] 136 | 137 | @classmethod 138 | def readSpecificRows(cls, file: BinaryIO, rowIndices: Iterable[int]) -> \ 139 | list[bytes]: 140 | compressedSize = _uint32Struct.unpack(file.read(4))[0] 141 | compressedData = file.read(compressedSize) 142 | decompressedData = ZstdDecompressor().decompress(compressedData) 143 | 144 | memoryView = memoryview(decompressedData) 145 | count = _uint32Struct.unpack(memoryView[0:4])[0] 146 | rows: list[ZstRowInfo] = [None] * count 147 | for i in range(count): 148 | rows[i] = ZstRowInfo.read(memoryView, 4 + i * ZstRowInfo.structSize) 149 | 150 | dataStart = 4 + count * ZstRowInfo.structSize 151 | return [ 152 | decompressedData[ 153 | dataStart + rows[rowIndex].offset: dataStart + rows[ 154 | rowIndex].offset + rows[rowIndex].size] 155 | for rowIndex in rowIndices 156 | ] 157 | 158 | @classmethod 159 | def readRow(cls, file: BinaryIO, rowIndex: int) -> bytes: 160 | compressedSize = _uint32Struct.unpack(file.read(4))[0] 161 | compressedData = file.read(compressedSize) 162 | decompressedData = ZstdDecompressor().decompress(compressedData) 163 | 164 | memoryView = memoryview(decompressedData) 165 | count = _uint32Struct.unpack(memoryView[0:4])[0] 166 | if rowIndex >= count: 167 | raise Exception("Row index out of range") 168 | row = ZstRowInfo.read(memoryView, 4 + rowIndex * ZstRowInfo.structSize) 169 | 170 | dataStart = 4 + count * ZstRowInfo.structSize 171 | return decompressedData[ 172 | dataStart + row.offset: dataStart + row.offset + row.size] 173 | 174 | def write(self, file: BinaryIO, 175 | rowPositions: list[RowPosition] | None = None, 176 | compressionLevel=_defaultCompressionLevel) -> None: 177 | uncompressedSize = \ 178 | 4 + \ 179 | len(self.rows) * ZstRowInfo.structSize + \ 180 | sum(len(row) for row in self.rows) 181 | uncompressedBytes = bytearray(uncompressedSize) 182 | uncompressedBytes[0:4] = len(self.rows).to_bytes(4, _endian) 183 | 184 | dataOffset = 4 + len(self.rows) * ZstRowInfo.structSize 185 | blockOffset = file.tell() 186 | currentDataLocalOffset = 0 187 | for i in range(len(self.rows)): 188 | row = self.rows[i] 189 | rowInfo = ZstRowInfo(currentDataLocalOffset, len(row)) 190 | rowInfo.write(uncompressedBytes, 4 + i * ZstRowInfo.structSize) 191 | uncompressedBytes[ 192 | dataOffset + currentDataLocalOffset: dataOffset + currentDataLocalOffset + len( 193 | row)] = row 194 | currentDataLocalOffset += len(row) 195 | if rowPositions is not None: 196 | rowPositions.append(RowPosition(blockOffset, i)) 197 | uncompressedData = bytes(uncompressedBytes) 198 | compressedData = ZstdCompressor(compressionLevel).compress( 199 | uncompressedData) 200 | compressedSize = len(compressedData) 201 | blockBytes = bytearray(4 + compressedSize) 202 | blockBytes[0:4] = compressedSize.to_bytes(4, _endian) 203 | blockBytes[4:4 + compressedSize] = compressedData 204 | file.write(blockBytes) 205 | 206 | @staticmethod 207 | def generateRowPositions(file: BinaryIO) -> Iterable[RowPosition]: 208 | blockOffset = file.tell() 209 | compressedSize = _uint32Struct.unpack(file.read(4))[0] 210 | compressedData = file.read(compressedSize) 211 | decompressedData = ZstdDecompressor().decompress(compressedData) 212 | 213 | memoryView = memoryview(decompressedData) 214 | count = _uint32Struct.unpack(memoryView[0:4])[0] 215 | for i in range(count): 216 | yield RowPosition(blockOffset, i) 217 | 218 | 219 | class ZstRowInfo: 220 | structSize = 8 221 | offset: int 222 | size: 
int 223 | 224 | def __init__(self, offset: int, size: int): 225 | self.offset = offset 226 | self.size = size 227 | 228 | @staticmethod 229 | def read(bytes: bytes, position: int) -> ZstRowInfo: 230 | offset, size = _uint32X2Struct.unpack( 231 | bytes[position: position + ZstRowInfo.structSize]) 232 | return ZstRowInfo(offset, size) 233 | 234 | def write(self, bytes: bytearray, position: int) -> None: 235 | bytes[position + 0: position + 4] = self.offset.to_bytes(4, _endian) 236 | bytes[position + 4: position + 8] = self.size.to_bytes(4, _endian) 237 | 238 | 239 | @dataclass 240 | class RowPosition: 241 | blockOffset: int 242 | rowIndex: int 243 | 244 | 245 | @dataclass 246 | class RowIndex: 247 | withinBlockIndex: int 248 | originalRowIndex: int 249 | 250 | 251 | @dataclass 252 | class RowPositionGroup: 253 | blockOffset: int 254 | rowIndices: list[RowIndex] 255 | -------------------------------------------------------------------------------- /scripts/combine_folder_multiprocess.py: -------------------------------------------------------------------------------- 1 | # this script iterates through zst compressed ndjson files, like the pushshift reddit dumps, loads each line 2 | # and if it matches the criteria in the command line arguments, it's written out into a separate file for 3 | # that month. After all the ndjson files are processed, it iterates through the resulting files and combines 4 | # them into a final file. 5 | 6 | # this script assumes the files are named in chronological order and prefixed with RS_ or RC_, like the pushshift dumps 7 | 8 | # features: 9 | # - multiple processes in parallel to maximize drive read and decompression 10 | # - saves state as it completes each file and picks up where it stopped 11 | # - detailed progress indicators 12 | 13 | # examples: 14 | # - get all comments that have a subreddit field (subreddit is the default) of "wallstreetbets". This will create a single output file "wallstreetbets_comments.zst" in the folder the script is run in 15 | # python3 combine_folder_multiprocess.py reddit/comments --value wallstreetbets 16 | # - get all comments and submissions (assuming both types of dump files are under the reddit folder) that have an author field of Watchful1 or spez and output the results to a folder called pushshift. 
17 | # This will result in four files, pushshift/Watchful1_comments, pushshift/Watchful1_submissions, pushshift/spez_comments, pushshift/spez_submissions 18 | # python3 combine_folder_multiprocess.py reddit --field author --value Watchful1,spez --output pushshift 19 | 20 | import zstandard 21 | import os 22 | import json 23 | import sys 24 | import time 25 | import argparse 26 | import re 27 | from collections import defaultdict 28 | import logging.handlers 29 | import multiprocessing 30 | from enum import Enum 31 | 32 | 33 | # sets up logging to the console as well as a file 34 | log = logging.getLogger("bot") 35 | log.setLevel(logging.INFO) 36 | log_formatter = logging.Formatter('%(asctime)s - %(levelname)s: %(message)s') 37 | 38 | log_str_handler = logging.StreamHandler() 39 | log_str_handler.setFormatter(log_formatter) 40 | log.addHandler(log_str_handler) 41 | if not os.path.exists("logs"): 42 | os.makedirs("logs") 43 | log_file_handler = logging.handlers.RotatingFileHandler( 44 | os.path.join("logs", "bot.log"), maxBytes=1024*1024*16, backupCount=5) 45 | log_file_handler.setFormatter(log_formatter) 46 | log.addHandler(log_file_handler) 47 | 48 | 49 | class FileType(Enum): 50 | COMMENT = 1 51 | SUBMISSION = 2 52 | 53 | @staticmethod 54 | def to_str(file_type): 55 | if file_type == FileType.COMMENT: 56 | return "comments" 57 | elif file_type == FileType.SUBMISSION: 58 | return "submissions" 59 | return "other" 60 | 61 | 62 | # convenience object used to pass status information between processes 63 | class FileConfig: 64 | def __init__(self, input_path, output_path=None, complete=False, lines_processed=0, error_lines=0, lines_matched=0): 65 | self.input_path = input_path 66 | self.output_path = output_path 67 | self.file_size = os.stat(input_path).st_size 68 | self.complete = complete 69 | self.bytes_processed = self.file_size if complete else 0 70 | self.lines_processed = lines_processed if complete else 0 71 | self.error_message = None 72 | self.error_lines = error_lines 73 | self.lines_matched = lines_matched 74 | file_name = os.path.split(input_path)[1] 75 | if file_name.startswith("RS"): 76 | self.file_type = FileType.SUBMISSION 77 | elif file_name.startswith("RC"): 78 | self.file_type = FileType.COMMENT 79 | else: 80 | raise ValueError(f"Unknown working file type: {file_name}") 81 | 82 | def __str__(self): 83 | return f"{self.input_path} : {self.output_path} : {self.file_size} : {self.complete} : {self.bytes_processed} : {self.lines_processed}" 84 | 85 | 86 | # another convenience object to read and write from both zst files and ndjson files 87 | class FileHandle: 88 | newline_encoded = "\n".encode('utf-8') 89 | ext_len = len(".zst") 90 | 91 | def __init__(self, path, is_split=False): 92 | self.path = path 93 | self.is_split = is_split 94 | self.handles = {} 95 | 96 | def get_paths(self, character_filter=None): 97 | if self.is_split: 98 | paths = [] 99 | for file in os.listdir(self.path): 100 | if not file.endswith(".zst"): 101 | continue 102 | if character_filter is not None and character_filter != file[-FileHandle.ext_len - 1:-FileHandle.ext_len]: 103 | continue 104 | paths.append(os.path.join(self.path, file)) 105 | return paths 106 | else: 107 | return [self.path] 108 | 109 | def get_count_files(self): 110 | return len(self.get_paths()) 111 | 112 | # recursively decompress and decode a chunk of bytes. 
If there's a decode error then read another chunk and try with that, up to a limit of max_window_size bytes 113 | @staticmethod 114 | def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0): 115 | chunk = reader.read(chunk_size) 116 | bytes_read += chunk_size 117 | if previous_chunk is not None: 118 | chunk = previous_chunk + chunk 119 | try: 120 | return chunk.decode() 121 | except UnicodeDecodeError: 122 | if bytes_read > max_window_size: 123 | raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes") 124 | return FileHandle.read_and_decode(reader, chunk_size, max_window_size, chunk, bytes_read) 125 | 126 | # open a zst compressed ndjson file, or a regular uncompressed ndjson file and yield lines one at a time 127 | # also passes back file progress 128 | def yield_lines(self, character_filter=None): 129 | if self.is_split: 130 | if character_filter is not None: 131 | path = os.path.join(self.path, f"{character_filter}.zst") 132 | else: 133 | raise ValueError(f"{self.path} is split but no filter passed") 134 | else: 135 | path = self.path 136 | if os.path.exists(path): 137 | with open(path, 'rb') as file_handle: 138 | buffer = '' 139 | reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle) 140 | while True: 141 | chunk = FileHandle.read_and_decode(reader, 2**27, (2**29) * 2) 142 | if not chunk: 143 | break 144 | lines = (buffer + chunk).split("\n") 145 | 146 | for line in lines[:-1]: 147 | yield line, file_handle.tell() 148 | 149 | buffer = lines[-1] 150 | reader.close() 151 | 152 | # get either the main write handle or the character filter one, opening a new handle as needed 153 | def get_write_handle(self, character_filter=None): 154 | if character_filter is None: 155 | character_filter = 1 # use 1 as the default name since ints hash quickly 156 | handle = self.handles.get(character_filter) 157 | if handle is None: 158 | if character_filter == 1: 159 | path = self.path 160 | else: 161 | if not os.path.exists(self.path): 162 | os.makedirs(self.path) 163 | path = os.path.join(self.path, f"{character_filter}.zst") 164 | handle = zstandard.ZstdCompressor().stream_writer(open(path, 'wb')) 165 | self.handles[character_filter] = handle 166 | return handle 167 | 168 | # write a line, opening the appropriate handle 169 | def write_line(self, line, value=None): 170 | if self.is_split: 171 | if value is None: 172 | raise ValueError(f"{self.path} is split but no value passed") 173 | character_filter = value[:1] 174 | handle = self.get_write_handle(character_filter) 175 | else: 176 | handle = self.get_write_handle() 177 | 178 | handle.write(line.encode('utf-8')) 179 | handle.write(FileHandle.newline_encoded) 180 | 181 | def close(self): 182 | for handle in self.handles.values(): 183 | handle.close() 184 | 185 | 186 | # used for calculating running average of read speed 187 | class Queue: 188 | def __init__(self, max_size): 189 | self.list = [] 190 | self.max_size = max_size 191 | 192 | def put(self, item): 193 | if len(self.list) >= self.max_size: 194 | self.list.pop(0) 195 | self.list.append(item) 196 | 197 | def peek(self): 198 | return self.list[0] if len(self.list) > 0 else None 199 | 200 | 201 | # save file information and progress to a json file 202 | # we don't want to save the whole FileConfig object, since some info resets if we restart 203 | def save_file_list(input_files, working_folder, status_json, arg_string, script_type, completed_prefixes=None): 204 | if not os.path.exists(working_folder): 205 | 
os.makedirs(working_folder) 206 | simple_file_list = [] 207 | for file in input_files: 208 | simple_file_list.append([file.input_path, file.output_path, file.complete, file.lines_processed, file.error_lines, file.lines_matched]) 209 | if completed_prefixes is None: 210 | completed_prefixes = [] 211 | else: 212 | completed_prefixes = sorted([prefix for prefix in completed_prefixes]) 213 | with open(status_json, 'w') as status_json_file: 214 | output_dict = { 215 | "args": arg_string, 216 | "type": script_type, 217 | "completed_prefixes": completed_prefixes, 218 | "files": simple_file_list, 219 | } 220 | status_json_file.write(json.dumps(output_dict, indent=4)) 221 | 222 | 223 | # load file information from the json file and recalculate file sizes 224 | def load_file_list(status_json): 225 | if os.path.exists(status_json): 226 | with open(status_json, 'r') as status_json_file: 227 | output_dict = json.load(status_json_file) 228 | input_files = [] 229 | for simple_file in output_dict["files"]: 230 | input_files.append( 231 | FileConfig(simple_file[0], simple_file[1], simple_file[2], simple_file[3], simple_file[4], simple_file[5]) 232 | ) 233 | completed_prefixes = set() 234 | for prefix in output_dict["completed_prefixes"]: 235 | completed_prefixes.add(prefix) 236 | return input_files, output_dict["args"], output_dict["type"], completed_prefixes 237 | else: 238 | return None, None, None, set() 239 | 240 | 241 | # base of each separate process. Loads a file, iterates through lines and writes out 242 | # the ones where the `field` of the object matches `value`. Also passes status 243 | # information back to the parent via a queue 244 | def process_file(file, queue, field, values, partial, regex, split_intermediate): 245 | queue.put(file) 246 | input_handle = FileHandle(file.input_path) 247 | output_handle = FileHandle(file.output_path, is_split=split_intermediate) 248 | 249 | value = None 250 | if len(values) == 1: 251 | value = min(values) 252 | 253 | try: 254 | for line, file_bytes_processed in input_handle.yield_lines(): 255 | try: 256 | obj = json.loads(line) 257 | matched = False 258 | observed = obj[field].lower() 259 | if regex: 260 | for reg in values: 261 | if reg.search(observed): 262 | matched = True 263 | break 264 | elif partial: 265 | for val in values: 266 | if val in observed: 267 | matched = True 268 | break 269 | else: 270 | if value is not None: 271 | if observed == value: 272 | matched = True 273 | elif observed in values: 274 | matched = True 275 | 276 | if matched: 277 | output_handle.write_line(line, observed) 278 | file.lines_matched += 1 279 | except (KeyError, json.JSONDecodeError, AttributeError) as err: 280 | file.error_lines += 1 281 | file.lines_processed += 1 282 | if file.lines_processed % 1000000 == 0: 283 | file.bytes_processed = file_bytes_processed 284 | queue.put(file) 285 | 286 | output_handle.close() 287 | file.complete = True 288 | file.bytes_processed = file.file_size 289 | except Exception as err: 290 | file.error_message = str(err) 291 | queue.put(file) 292 | 293 | 294 | if __name__ == '__main__': 295 | parser = argparse.ArgumentParser(description="Use multiple processes to decompress and iterate over pushshift dump files") 296 | parser.add_argument("input", help="The input folder to recursively read files from") 297 | parser.add_argument("--output", help="Put the output files in this folder", default="") 298 | parser.add_argument("--working", help="The folder to store temporary files in", default="pushshift_working") 299 | 
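	# A rough, hypothetical illustration of how these arguments fit together (the folder and
	# subreddit names below are invented for the example, not taken from the repo):
	#   python scripts/combine_folder_multiprocess.py reddit/comments --value wallstreetbets,askreddit --processes 8
	# This would filter every RC_*/RS_* dump found under reddit/comments down to the two listed
	# subreddits, writing intermediate files to the working folder before combining them.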
parser.add_argument("--field", help="When deciding what lines to keep, use this field for comparisons", default="subreddit") 300 | parser.add_argument("--value", help="When deciding what lines to keep, compare the field to this value. Supports a comma separated list. This is case sensitive", default="pushshift") 301 | parser.add_argument("--value_list", help="A file of newline separated values to use. Overrides the value param if it is set", default=None) 302 | parser.add_argument("--processes", help="Number of processes to use", default=10, type=int) 303 | parser.add_argument("--file_filter", help="Regex filenames have to match to be processed", default="^RC_|^RS_") 304 | parser.add_argument( 305 | "--split_intermediate", 306 | help="Split the intermediate files by the first letter of the matched field, use if the filter will result in a large number of separate files", 307 | action="store_true") 308 | parser.add_argument( 309 | "--single_output", 310 | help="Output a single combined file instead of splitting by the search term", 311 | action="store_true") 312 | parser.add_argument( 313 | "--error_rate", help= 314 | "Percentage as an integer from 0 to 100 of the lines where the field can be missing. For the subreddit field especially, " 315 | "there are a number of posts that simply don't have a subreddit attached", default=1, type=int) 316 | parser.add_argument("--debug", help="Enable debug logging", action='store_const', const=True, default=False) 317 | parser.add_argument( 318 | "--partial", help="The values only have to be contained in the field, not match exactly. If this is set, " 319 | "the output files are not split by value. WARNING: This can severely slow down the script, especially if searching the " 320 | "body.", action='store_const', const=True, default=False) 321 | parser.add_argument( 322 | "--regex", help="The values are treated as regular expressions. If this is set, " 323 | "the output files are not split by value. WARNING: This can severely slow down the script, especially if searching the " 324 | "body. 
If set, ignores the --partial flag", action='store_const', const=True, default=False) 325 | script_type = "split" 326 | 327 | args = parser.parse_args() 328 | arg_string = f"{args.field}:{(args.value if args.value else args.value_list)}" 329 | 330 | if args.debug: 331 | log.setLevel(logging.DEBUG) 332 | 333 | log.info(f"Loading files from: {args.input}") 334 | if args.output: 335 | log.info(f"Writing output to: {args.output}") 336 | else: 337 | log.info(f"Writing output to working folder") 338 | 339 | if (args.partial or args.regex or args.single_output) and args.split_intermediate: 340 | log.info("The partial, regex and single_output flags are not compatible with the split_intermediate flag") 341 | sys.exit(1) 342 | 343 | values = set() 344 | if args.value_list: 345 | log.info(f"Reading {args.value_list} for values to compare") 346 | with open(args.value_list, 'r') as value_list_handle: 347 | for line in value_list_handle: 348 | values.add(line) 349 | 350 | else: 351 | values = set(args.value.split(",")) 352 | 353 | if args.regex: 354 | regexes = [] 355 | for reg in values: 356 | regexes.append(re.compile(reg)) 357 | values = regexes 358 | if len(values) > 1: 359 | log.info(f"Checking field {args.field} against {len(values)} regexes") 360 | else: 361 | log.info(f"Checking field {args.field} against regex {values[0]}") 362 | else: 363 | lower_values = set() 364 | for value_inner in values: 365 | lower_values.add(value_inner.strip().lower()) 366 | values = lower_values 367 | if len(values) > 5: 368 | val_string = f"any of {len(values)} values" 369 | elif len(values) == 1: 370 | val_string = f"the value {(','.join(values))}" 371 | else: 372 | val_string = f"any of the values {(','.join(values))}" 373 | if args.partial: 374 | log.info(f"Checking if any of {val_string} are contained in field {args.field}") 375 | else: 376 | log.info(f"Checking if any of {val_string} exactly match field {args.field}") 377 | 378 | if args.partial or args.regex or args.single_output: 379 | log.info(f"Outputing to a single combined file") 380 | 381 | multiprocessing.set_start_method('spawn') 382 | queue = multiprocessing.Manager().Queue() 383 | status_json = os.path.join(args.working, "status.json") 384 | input_files, saved_arg_string, saved_type, completed_prefixes = load_file_list(status_json) 385 | if saved_arg_string and saved_arg_string != arg_string: 386 | log.warning(f"Args don't match args from json file. Delete working folder") 387 | sys.exit(0) 388 | 389 | if saved_type and saved_type != script_type: 390 | log.warning(f"Script type doesn't match type from json file. 
Delete working folder") 391 | sys.exit(0) 392 | 393 | # if the file list wasn't loaded from the json, this is the first run, find what files we need to process 394 | if input_files is None: 395 | input_files = [] 396 | for subdir, dirs, files in os.walk(args.input): 397 | files.sort() 398 | for file_name in files: 399 | if file_name.endswith(".zst") and re.search(args.file_filter, file_name) is not None: 400 | input_path = os.path.join(subdir, file_name) 401 | if args.split_intermediate: 402 | output_extension = "" 403 | else: 404 | output_extension = ".zst" 405 | output_path = os.path.join(args.working, f"{file_name[:-4]}{output_extension}") 406 | input_files.append(FileConfig(input_path, output_path=output_path)) 407 | 408 | save_file_list(input_files, args.working, status_json, arg_string, script_type) 409 | else: 410 | log.info(f"Existing input file was read, if this is not correct you should delete the {args.working} folder and run this script again") 411 | 412 | files_processed, total_bytes, total_bytes_processed, total_lines_processed, total_lines_matched, total_lines_errored = 0, 0, 0, 0, 0, 0 413 | files_to_process = [] 414 | # calculate the total file size for progress reports, build a list of incomplete files to process 415 | # do this largest to smallest by file size so that we aren't processing a few really big files with only a few threads at the end 416 | for file in sorted(input_files, key=lambda item: item.file_size, reverse=True): 417 | total_bytes += file.file_size 418 | if file.complete: 419 | files_processed += 1 420 | total_lines_processed += file.lines_processed 421 | total_lines_matched += file.lines_matched 422 | total_bytes_processed += file.file_size 423 | total_lines_errored += file.error_lines 424 | else: 425 | files_to_process.append(file) 426 | 427 | log.info(f"Processed {files_processed} of {len(input_files)} files with {(total_bytes_processed / (2**30)):.2f} of {(total_bytes / (2**30)):.2f} gigabytes") 428 | 429 | start_time = time.time() 430 | if len(files_to_process): 431 | progress_queue = Queue(40) 432 | progress_queue.put([start_time, total_lines_processed, total_bytes_processed]) 433 | speed_queue = Queue(40) 434 | for file in files_to_process: 435 | log.info(f"Processing file: {file.input_path}") 436 | # start the workers 437 | with multiprocessing.Pool(processes=min(args.processes, len(files_to_process))) as pool: 438 | workers = pool.starmap_async(process_file, [(file, queue, args.field, values, args.partial, args.regex, args.split_intermediate) for file in files_to_process], chunksize=1, error_callback=log.info) 439 | while not workers.ready() or not queue.empty(): 440 | # loop until the workers are all done, pulling in status messages as they are sent 441 | file_update = queue.get() 442 | if file_update.error_message is not None: 443 | log.warning(f"File failed {file_update.input_path}: {file_update.error_message}") 444 | 445 | # this is the workers telling us they are starting a new file, print the debug message but nothing else 446 | if file_update.lines_processed == 0: 447 | log.debug(f"Starting file: {file_update.input_path} : {file_update.file_size:,}") 448 | continue 449 | 450 | # I'm going to assume that the list of files is short enough that it's no 451 | # big deal to just iterate each time since that saves a bunch of work 452 | total_lines_processed, total_lines_matched, total_bytes_processed, total_lines_errored, files_processed, files_errored, i = 0, 0, 0, 0, 0, 0, 0 453 | for file in input_files: 454 | if file.input_path == 
file_update.input_path: 455 | input_files[i] = file_update 456 | file = file_update 457 | total_lines_processed += file.lines_processed 458 | total_lines_matched += file.lines_matched 459 | total_bytes_processed += file.bytes_processed 460 | total_lines_errored += file.error_lines 461 | files_processed += 1 if file.complete or file.error_message is not None else 0 462 | files_errored += 1 if file.error_message is not None else 0 463 | i += 1 464 | if file_update.complete or file_update.error_message is not None: 465 | save_file_list(input_files, args.working, status_json, arg_string, script_type) 466 | log.debug(f"Finished file: {file_update.input_path} : {file_update.file_size:,}") 467 | current_time = time.time() 468 | progress_queue.put([current_time, total_lines_processed, total_bytes_processed]) 469 | 470 | first_time, first_lines, first_bytes = progress_queue.peek() 471 | bytes_per_second = int((total_bytes_processed - first_bytes)/(current_time - first_time)) 472 | speed_queue.put(bytes_per_second) 473 | seconds_left = int((total_bytes - total_bytes_processed) / int(sum(speed_queue.list) / len(speed_queue.list))) 474 | minutes_left = int(seconds_left / 60) 475 | hours_left = int(minutes_left / 60) 476 | days_left = int(hours_left / 24) 477 | 478 | log.info( 479 | f"{total_lines_processed:,} lines at {(total_lines_processed - first_lines)/(current_time - first_time):,.0f}/s, {total_lines_errored:,} errored, {total_lines_matched:,} matched : " 480 | f"{(total_bytes_processed / (2**30)):.2f} gb at {(bytes_per_second / (2**20)):,.0f} mb/s, {(total_bytes_processed / total_bytes) * 100:.0f}% : " 481 | f"{files_processed}({files_errored})/{len(input_files)} files : " 482 | f"{(str(days_left) + 'd ' if days_left > 0 else '')}{hours_left - (days_left * 24)}:{minutes_left - (hours_left * 60):02}:{seconds_left - (minutes_left * 60):02} remaining") 483 | 484 | log.info(f"{total_lines_processed:,}, {total_lines_errored} errored : {(total_bytes_processed / (2**30)):.2f} gb, {(total_bytes_processed / total_bytes) * 100:.0f}% : {files_processed}/{len(input_files)}") 485 | 486 | type_handles = defaultdict(list) 487 | prefixes = set() 488 | count_incomplete = 0 489 | count_intermediate_files = 0 490 | # build a list of output files to combine 491 | for file in sorted(input_files, key=lambda item: os.path.split(item.output_path)[1]): 492 | if not file.complete: 493 | if file.error_message is not None: 494 | log.info(f"File {file.input_path} errored {file.error_message}") 495 | else: 496 | log.info(f"File {file.input_path} is not marked as complete") 497 | count_incomplete += 1 498 | else: 499 | if file.error_lines > file.lines_processed * (args.error_rate * 0.01): 500 | log.info( 501 | f"File {file.input_path} has {file.error_lines:,} errored lines out of {file.lines_processed:,}, " 502 | f"{(file.error_lines / file.lines_processed) * (args.error_rate * 0.01):.2f}% which is above the limit of {args.error_rate}%") 503 | count_incomplete += 1 504 | elif file.output_path is not None and os.path.exists(file.output_path): 505 | input_handle = FileHandle(file.output_path, is_split=args.split_intermediate) 506 | for path in input_handle.get_paths(): 507 | prefixes.add(path[-FileHandle.ext_len - 1:-FileHandle.ext_len]) 508 | count_intermediate_files += 1 509 | type_handles[file.file_type].append(input_handle) 510 | 511 | if count_incomplete > 0: 512 | log.info(f"{count_incomplete} files were not completed, errored or don't exist, something went wrong. 
Aborting") 513 | sys.exit() 514 | 515 | log.info(f"Processing complete, combining {count_intermediate_files} result files") 516 | 517 | for completed_prefix in completed_prefixes: 518 | if completed_prefix in prefixes: 519 | prefixes.remove(completed_prefix) 520 | 521 | output_lines = 0 522 | output_handles = {} 523 | files_combined = 0 524 | if values: 525 | split = True 526 | else: 527 | split = False 528 | if args.split_intermediate: 529 | for prefix in sorted(prefixes): 530 | log.info(f"From {files_combined}/{count_intermediate_files} files to {len(output_handles):,} output handles : {output_lines:,}/{total_lines_matched:,} lines") 531 | for file_type, input_handles in type_handles.items(): 532 | for input_handle in input_handles: 533 | has_lines = False 534 | for line, file_bytes_processed in input_handle.yield_lines(character_filter=prefix): 535 | if not has_lines: 536 | has_lines = True 537 | files_combined += 1 538 | output_lines += 1 539 | obj = json.loads(line) 540 | observed_case = obj[args.field] 541 | observed = observed_case.lower() 542 | if observed not in output_handles: 543 | if args.output: 544 | if not os.path.exists(args.output): 545 | os.makedirs(args.output) 546 | output_file_path = os.path.join(args.output, f"{observed_case}_{FileType.to_str(file_type)}.zst") 547 | else: 548 | output_file_path = f"{observed_case}_{FileType.to_str(file_type)}.zst" 549 | log.debug(f"Writing to file {output_file_path}") 550 | output_handle = FileHandle(output_file_path) 551 | output_handles[observed] = output_handle 552 | else: 553 | output_handle = output_handles[observed] 554 | 555 | output_handle.write_line(line) 556 | if output_lines % 1000000 == 0: 557 | log.info(f"From {files_combined}/{count_intermediate_files} files to {len(output_handles):,} output handles : {output_lines:,}/{total_lines_matched:,} lines : {input_handle.path} / {prefix}") 558 | for handle in output_handles.values(): 559 | handle.close() 560 | output_handles = {} 561 | completed_prefixes.add(prefix) 562 | save_file_list(input_files, args.working, status_json, arg_string, script_type, completed_prefixes) 563 | 564 | else: 565 | log.info(f"From {files_combined}/{count_intermediate_files} files to {len(output_handles):,} output handles : {output_lines:,}/{total_lines_matched:,} lines") 566 | for file_type, input_handles in type_handles.items(): 567 | for input_handle in input_handles: 568 | files_combined += 1 569 | for line, file_bytes_processed in input_handle.yield_lines(): 570 | output_lines += 1 571 | obj = json.loads(line) 572 | if args.partial or args.regex or args.single_output: 573 | observed_case = "output" 574 | else: 575 | observed_case = obj[args.field] 576 | observed = observed_case.lower() 577 | if observed not in output_handles: 578 | if args.output: 579 | if not os.path.exists(args.output): 580 | os.makedirs(args.output) 581 | output_file_path = os.path.join(args.output, f"{observed_case}_{FileType.to_str(file_type)}.zst") 582 | else: 583 | output_file_path = f"{observed_case}_{FileType.to_str(file_type)}.zst" 584 | log.debug(f"Writing to file {output_file_path}") 585 | output_handle = FileHandle(output_file_path) 586 | output_handles[observed] = output_handle 587 | else: 588 | output_handle = output_handles[observed] 589 | 590 | output_handle.write_line(line) 591 | if output_lines % 1000000 == 0: 592 | log.info(f"From {files_combined}/{count_intermediate_files} files to {len(output_handles):,} output handles : {output_lines:,}/{total_lines_matched:,} lines : {input_handle.path}") 593 | for handle 
in output_handles.values(): 594 | handle.close() 595 | output_handles = {} 596 | 597 | log.info(f"From {files_combined}/{count_intermediate_files} files to {len(output_handles):,} output handles : {output_lines:,}/{total_lines_matched:,} lines") 598 | -------------------------------------------------------------------------------- /scripts/count_words_single_file.py: -------------------------------------------------------------------------------- 1 | # this is an example of loading and iterating over a single file, doing some processing along the way to export a resulting csv 2 | 3 | import zstandard 4 | import os 5 | import json 6 | from collections import defaultdict 7 | from datetime import datetime 8 | import logging.handlers 9 | 10 | 11 | log = logging.getLogger("bot") 12 | log.setLevel(logging.DEBUG) 13 | log.addHandler(logging.StreamHandler()) 14 | 15 | 16 | def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0): 17 | chunk = reader.read(chunk_size) 18 | bytes_read += chunk_size 19 | if previous_chunk is not None: 20 | chunk = previous_chunk + chunk 21 | try: 22 | return chunk.decode() 23 | except UnicodeDecodeError: 24 | if bytes_read > max_window_size: 25 | raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes") 26 | log.info(f"Decoding error with {bytes_read:,} bytes, reading another chunk") 27 | return read_and_decode(reader, chunk_size, max_window_size, chunk, bytes_read) 28 | 29 | 30 | def read_lines_zst(file_name): 31 | with open(file_name, 'rb') as file_handle: 32 | buffer = '' 33 | reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle) 34 | while True: 35 | chunk = read_and_decode(reader, 2**27, (2**29) * 2) 36 | 37 | if not chunk: 38 | break 39 | lines = (buffer + chunk).split("\n") 40 | 41 | for line in lines[:-1]: 42 | yield line, file_handle.tell() 43 | 44 | buffer = lines[-1] 45 | 46 | reader.close() 47 | 48 | 49 | if __name__ == "__main__": 50 | # the path to the input comment file 51 | input_path = r"\\MYCLOUDPR4100\Public\reddit\requests\wallstreetbets_comments.zst" 52 | # the path to the output csv file of word counts 53 | output_path = r"\\MYCLOUDPR4100\Public\reddit\wallstreetbets_counts.csv" 54 | # skip everything before this date. The subreddit was created in 2012, so there's a lot of dates before it gets to the good stuff if you want to skip them 55 | start_date = datetime.strptime("2020-01-01", '%Y-%m-%d') 56 | # list of word phrases to search for. 
Make sure these are all lowercase 57 | phrases = [ 58 | "diamond hands", 59 | "sell", 60 | ] 61 | 62 | # bunch of initialization stuff 63 | word_counts = defaultdict(int) 64 | file_lines = 0 65 | file_bytes_processed = 0 66 | created = None 67 | bad_lines = 0 68 | current_day = None 69 | output_file = open(output_path, 'w') 70 | output_file.write(f"Date,{(','.join(phrases))}\n") 71 | input_size = os.stat(input_path).st_size 72 | try: 73 | # this is the main loop where we iterate over every single line in the zst file 74 | for line, file_bytes_processed in read_lines_zst(input_path): 75 | try: 76 | # load the line into a json object 77 | obj = json.loads(line) 78 | # turn the created timestamp into a date object 79 | created = datetime.utcfromtimestamp(int(obj['created_utc'])) 80 | # skip if we're before the start date defined above 81 | if created >= start_date: 82 | # if this is a different day than the previous line we looked at, save the word counts to the csv 83 | if current_day != created.replace(hour=0, minute=0, second=0, microsecond=0): 84 | # don't save the dates if this is the very first day, we're just starting 85 | if current_day is not None: 86 | # write out the date at the beginning of the line 87 | output_file.write(f"{current_day.strftime('%Y-%m-%d')}") 88 | # for each phrase in the list, look up the count associated with it and write it out 89 | for phrase in phrases: 90 | output_file.write(",") 91 | output_file.write(str(word_counts[phrase])) 92 | output_file.write("\n") 93 | # reset the dictionary so we can start counting up for the new day 94 | word_counts = defaultdict(int) 95 | # update the variable to the new day, so we can then tell when we get to the next day 96 | current_day = created.replace(hour=0, minute=0, second=0, microsecond=0) 97 | 98 | # get the lowercase of the object text 99 | body_lower = obj['body'].lower() 100 | # for each of the phrases in the list 101 | for phrase in phrases: 102 | # check if it's in the text 103 | if phrase in body_lower: 104 | word_counts[phrase] += 1 105 | 106 | # just in case there's corruption somewhere in the file 107 | except (KeyError, json.JSONDecodeError) as err: 108 | bad_lines += 1 109 | file_lines += 1 110 | if file_lines % 100000 == 0: 111 | log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines:,} : {bad_lines:,} : {(file_bytes_processed / input_size) * 100:.0f}%") 112 | except Exception as err: 113 | log.info(err) 114 | 115 | # write out the last day 116 | output_file.write(f"{current_day.strftime('%Y-%m-%d')}") 117 | for phrase in phrases: 118 | output_file.write(",") 119 | output_file.write(str(word_counts[phrase])) 120 | output_file.write("\n") 121 | 122 | output_file.close() 123 | log.info(f"Complete : {file_lines:,} : {bad_lines:,}") 124 | -------------------------------------------------------------------------------- /scripts/filter_file.py: -------------------------------------------------------------------------------- 1 | import zstandard 2 | import os 3 | import json 4 | import sys 5 | import csv 6 | from datetime import datetime 7 | import logging.handlers 8 | import traceback 9 | # put the path to the input file, or a folder of files to process all of 10 | input_file = r"\\MYCLOUDPR4100\Public\reddit\subreddits23\wallstreetbets_submissions.zst" 11 | # put the name or path to the output file. The file extension from below will be added automatically.
If the input file is a folder, the output will be treated as a folder as well 12 | output_file = r"\\MYCLOUDPR4100\Public\output" 13 | # the format to output in, pick from the following options 14 | # zst: same as the input, a zstandard compressed ndjson file. Can be read by the other scripts in the repo 15 | # txt: an ndjson file, which is a text file with a separate json object on each line. Can be opened by any text editor 16 | # csv: a comma separated value file. Can be opened by a text editor or excel 17 | # WARNING READ THIS: if you use txt or csv output on a large input file without filtering out most of the rows, the resulting file will be extremely large. Usually about 7 times as large as the compressed input file 18 | output_format = "csv" 19 | # override the above format and output only this field into a text file, one per line. Useful if you want to make a list of authors or ids. See the examples below 20 | # any field that's in the dump is supported, but useful ones are 21 | # author: the username of the author 22 | # id: the id of the submission or comment 23 | # link_id: only for comments, the fullname of the submission the comment is associated with 24 | # parent_id: only for comments, the fullname of the parent of the comment. Either another comment or the submission if it's top level 25 | single_field = None 26 | # the fields in the file are different depending on whether it has comments or submissions. If we're writing a csv, we need to know which fields to write. 27 | # set this to true to write out to the log every time there's a bad line, set to false if you're expecting only some of the lines to match the key 28 | write_bad_lines = True 29 | 30 | # only output items between these two dates 31 | from_date = datetime.strptime("2005-01-01", "%Y-%m-%d") 32 | to_date = datetime.strptime("2030-12-31", "%Y-%m-%d") 33 | 34 | # the field to filter on, the values to filter with and whether it should be an exact match 35 | # some examples: 36 | # 37 | # return only objects where the author is u/watchful1 or u/spez 38 | # field = "author" 39 | # values = ["watchful1","spez"] 40 | # exact_match = True 41 | # 42 | # return only objects where the title contains either "stonk" or "moon" 43 | # field = "title" 44 | # values = ["stonk","moon"] 45 | # exact_match = False 46 | # 47 | # return only objects where the body contains either "stonk" or "moon". For submissions the body is in the "selftext" field, for comments it's in the "body" field 48 | # field = "selftext" 49 | # values = ["stonk","moon"] 50 | # exact_match = False 51 | # 52 | # 53 | # filter a submission file and then get a file with all the comments only in those submissions. This is a multi step process 54 | # add your submission filters and set the output file name to something unique 55 | # input_file = "redditdev_submissions.zst" 56 | # output_file = "filtered_submissions" 57 | # output_format = "csv" 58 | # field = "author" 59 | # values = ["watchful1"] 60 | # 61 | # run the script, this will result in a file called "filtered_submissions.csv" that contains only submissions by u/watchful1 62 | # now we'll run the script again with the same input and filters, but set the output to single field. 
Be sure to change the output file to a new name, but don't change any of the other inputs 63 | # output_file = "submission_ids" 64 | # single_field = "id" 65 | # 66 | # run the script again, this will result in a file called "submission_ids.txt" that has an id on each line 67 | # now we'll remove all the other filters and update the script to input from the comments file, and use the submission ids list we created before. And change the output name again so we don't override anything 68 | # input_file = "redditdev_comments.zst" 69 | # output_file = "filtered_comments" 70 | # single_field = None # resetting this back so it's not used 71 | # field = "link_id" # in the comment object, this is the field that contains the submission id 72 | # values_file = "submission_ids.txt" 73 | # exact_match = False # the link_id field has a prefix on it, so we can't do an exact match 74 | # 75 | # run the script one last time and now you have a file called "filtered_comments.csv" that only has comments from your submissions above 76 | # if you want only top level comments instead of all comments, you can set field to "parent_id" instead of "link_id" 77 | 78 | # change this to field = None if you don't want to filter by anything 79 | field = "body" 80 | values = [''] 81 | # if you have a long list of values, you can put them in a file and put the filename here. If set this overrides the value list above 82 | # if this list is very large, it could greatly slow down the process 83 | values_file = None 84 | exact_match = False 85 | 86 | 87 | # sets up logging to the console as well as a file 88 | log = logging.getLogger("bot") 89 | log.setLevel(logging.INFO) 90 | log_formatter = logging.Formatter('%(asctime)s - %(levelname)s: %(message)s') 91 | log_str_handler = logging.StreamHandler() 92 | log_str_handler.setFormatter(log_formatter) 93 | log.addHandler(log_str_handler) 94 | if not os.path.exists("logs"): 95 | os.makedirs("logs") 96 | log_file_handler = logging.handlers.RotatingFileHandler(os.path.join("logs", "bot.log"), maxBytes=1024*1024*16, backupCount=5) 97 | log_file_handler.setFormatter(log_formatter) 98 | log.addHandler(log_file_handler) 99 | 100 | 101 | def write_line_zst(handle, line): 102 | handle.write(line.encode('utf-8')) 103 | handle.write("\n".encode('utf-8')) 104 | 105 | 106 | def write_line_json(handle, obj): 107 | handle.write(json.dumps(obj)) 108 | handle.write("\n") 109 | 110 | 111 | def write_line_single(handle, obj, field): 112 | if field in obj: 113 | handle.write(obj[field]) 114 | else: 115 | log.info(f"{field} not in object {obj['id']}") 116 | handle.write("\n") 117 | 118 | 119 | def write_line_csv(writer, obj, is_submission): 120 | output_list = [] 121 | output_list.append(str(obj['score'])) 122 | output_list.append(datetime.fromtimestamp(int(obj['created_utc'])).strftime("%Y-%m-%d")) 123 | if is_submission: 124 | output_list.append(obj['title']) 125 | output_list.append(f"u/{obj['author']}") 126 | if 'permalink' in obj: 127 | output_list.append(f"https://www.reddit.com{obj['permalink']}") 128 | else: 129 | output_list.append(f"https://www.reddit.com/r/{obj['subreddit']}/comments/{obj['link_id'][3:]}/_/{obj['id']}") 130 | if is_submission: 131 | if obj['is_self']: 132 | if 'selftext' in obj: 133 | output_list.append(obj['selftext']) 134 | else: 135 | output_list.append("") 136 | else: 137 | output_list.append(obj['url']) 138 | else: 139 | output_list.append(obj['body']) 140 | writer.writerow(output_list) 141 | 142 | 143 | def read_and_decode(reader, chunk_size, max_window_size, 
previous_chunk=None, bytes_read=0): 144 | chunk = reader.read(chunk_size) 145 | bytes_read += chunk_size 146 | if previous_chunk is not None: 147 | chunk = previous_chunk + chunk 148 | try: 149 | return chunk.decode() 150 | except UnicodeDecodeError: 151 | if bytes_read > max_window_size: 152 | raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes") 153 | log.info(f"Decoding error with {bytes_read:,} bytes, reading another chunk") 154 | return read_and_decode(reader, chunk_size, max_window_size, chunk, bytes_read) 155 | 156 | 157 | def read_lines_zst(file_name): 158 | with open(file_name, 'rb') as file_handle: 159 | buffer = '' 160 | reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle) 161 | while True: 162 | chunk = read_and_decode(reader, 2**27, (2**29) * 2) 163 | 164 | if not chunk: 165 | break 166 | lines = (buffer + chunk).split("\n") 167 | 168 | for line in lines[:-1]: 169 | yield line.strip(), file_handle.tell() 170 | 171 | buffer = lines[-1] 172 | 173 | reader.close() 174 | 175 | 176 | def process_file(input_file, output_file, output_format, field, values, from_date, to_date, single_field, exact_match): 177 | output_path = f"{output_file}.{output_format}" 178 | is_submission = "submission" in input_file 179 | log.info(f"Input: {input_file} : Output: {output_path} : Is submission {is_submission}") 180 | writer = None 181 | if output_format == "zst": 182 | handle = zstandard.ZstdCompressor().stream_writer(open(output_path, 'wb')) 183 | elif output_format == "txt": 184 | handle = open(output_path, 'w', encoding='UTF-8') 185 | elif output_format == "csv": 186 | handle = open(output_path, 'w', encoding='UTF-8', newline='') 187 | writer = csv.writer(handle) 188 | else: 189 | log.error(f"Unsupported output format {output_format}") 190 | sys.exit() 191 | 192 | file_size = os.stat(input_file).st_size 193 | created = None 194 | matched_lines = 0 195 | bad_lines = 0 196 | total_lines = 0 197 | for line, file_bytes_processed in read_lines_zst(input_file): 198 | total_lines += 1 199 | if total_lines % 100000 == 0: 200 | log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {total_lines:,} : {matched_lines:,} : {bad_lines:,} : {file_bytes_processed:,}:{(file_bytes_processed / file_size) * 100:.0f}%") 201 | 202 | try: 203 | obj = json.loads(line) 204 | created = datetime.utcfromtimestamp(int(obj['created_utc'])) 205 | 206 | if created < from_date: 207 | continue 208 | if created > to_date: 209 | continue 210 | 211 | if field is not None: 212 | field_value = obj[field].lower() 213 | matched = False 214 | for value in values: 215 | if exact_match: 216 | if value == field_value: 217 | matched = True 218 | break 219 | else: 220 | if value in field_value: 221 | matched = True 222 | break 223 | if not matched: 224 | continue 225 | 226 | matched_lines += 1 227 | if output_format == "zst": 228 | write_line_zst(handle, line) 229 | elif output_format == "csv": 230 | write_line_csv(writer, obj, is_submission) 231 | elif output_format == "txt": 232 | if single_field is not None: 233 | write_line_single(handle, obj, single_field) 234 | else: 235 | write_line_json(handle, obj) 236 | else: 237 | log.info(f"Something went wrong, invalid output format {output_format}") 238 | except (KeyError, json.JSONDecodeError) as err: 239 | bad_lines += 1 240 | if write_bad_lines: 241 | if isinstance(err, KeyError): 242 | log.warning(f"Key {field} is not in the object: {err}") 243 | elif isinstance(err, json.JSONDecodeError): 244 | log.warning(f"Line decoding failed: 
{err}") 245 | log.warning(line) 246 | 247 | handle.close() 248 | log.info(f"Complete : {total_lines:,} : {matched_lines:,} : {bad_lines:,}") 249 | 250 | 251 | if __name__ == "__main__": 252 | if single_field is not None: 253 | log.info("Single field output mode, changing output file format to txt") 254 | output_format = "txt" 255 | 256 | if values_file is not None: 257 | values = [] 258 | with open(values_file, 'r') as values_handle: 259 | for value in values_handle: 260 | values.append(value.strip().lower()) 261 | log.info(f"Loaded {len(values)} from values file {values_file}") 262 | else: 263 | values = [value.lower() for value in values] # convert to lowercase 264 | 265 | log.info(f"Filtering field: {field}") 266 | if len(values) <= 20: 267 | log.info(f"On values: {','.join(values)}") 268 | else: 269 | log.info(f"On values:") 270 | for value in values: 271 | log.info(value) 272 | log.info(f"Exact match {('on' if exact_match else 'off')}. Single field {single_field}.") 273 | log.info(f"From date {from_date.strftime('%Y-%m-%d')} to date {to_date.strftime('%Y-%m-%d')}") 274 | log.info(f"Output format set to {output_format}") 275 | 276 | input_files = [] 277 | if os.path.isdir(input_file): 278 | if not os.path.exists(output_file): 279 | os.makedirs(output_file) 280 | for file in os.listdir(input_file): 281 | if not os.path.isdir(file) and file.endswith(".zst"): 282 | input_name = os.path.splitext(os.path.splitext(os.path.basename(file))[0])[0] 283 | input_files.append((os.path.join(input_file, file), os.path.join(output_file, input_name))) 284 | else: 285 | input_files.append((input_file, output_file)) 286 | log.info(f"Processing {len(input_files)} files") 287 | for file_in, file_out in input_files: 288 | try: 289 | process_file(file_in, file_out, output_format, field, values, from_date, to_date, single_field, exact_match) 290 | except Exception as err: 291 | log.warning(f"Error processing {file_in}: {err}") 292 | log.warning(traceback.format_exc()) 293 | -------------------------------------------------------------------------------- /scripts/find_overlapping_users.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from collections import defaultdict 3 | from datetime import datetime, timedelta 4 | import time 5 | import os 6 | import logging.handlers 7 | import zstandard 8 | import json 9 | 10 | # IMPORTANT SETUP INSTRUCTIONS 11 | # get subreddit files from here https://www.reddit.com/r/pushshift/comments/1itme1k/separate_dump_files_for_the_top_40k_subreddits/ 12 | # change the folder line to the folder where the files are stored 13 | # change the subreddits to the list of subreddits, one per line. The case must exactly match, ie, for r/AskReddit, put "AskReddit" 14 | # the files in the folder must match the format from the torrent, subreddit_type.zst, like AskReddit_comments.zst 15 | # the script will look for both comments and submissions files for each subreddit 16 | folder = r"\\MYCLOUDPR4100\Public\reddit\subreddits24" 17 | subreddits_string = """ 18 | askcarsales 19 | Denton 20 | relationship_advice 21 | Dallas 22 | askdfw 23 | AskMen 24 | rolex 25 | lego 26 | """ 27 | ignored_users = {'[deleted]', 'automoderator'} 28 | # this is a list of users to ignore when doing the comparison. 
Most popular bots post in many subreddits and aren't the person you're looking for 29 | # here's a good start, but add bots to your list as you encounter them https://github.com/Watchful1/PushshiftDumps/blob/master/scripts/ignored.txt 30 | ignored_users_file = "ignored.txt" 31 | min_comments_per_sub = 1 32 | output_file_name = "users.txt" 33 | require_first_subreddit = False # if true, print users that occur in the first subreddit and any one of the following ones. Otherwise just find the most overlap between all subs 34 | from_date = datetime.strptime("2005-01-01", "%Y-%m-%d") 35 | to_date = datetime.strptime("2040-12-31", "%Y-%m-%d") 36 | 37 | 38 | # sets up logging to the console as well as a file 39 | log = logging.getLogger("bot") 40 | log.setLevel(logging.INFO) 41 | log_formatter = logging.Formatter('%(asctime)s - %(levelname)s: %(message)s') 42 | log_str_handler = logging.StreamHandler() 43 | log_str_handler.setFormatter(log_formatter) 44 | log.addHandler(log_str_handler) 45 | if not os.path.exists("logs"): 46 | os.makedirs("logs") 47 | log_file_handler = logging.handlers.RotatingFileHandler(os.path.join("logs", "bot.log"), maxBytes=1024*1024*16, backupCount=5) 48 | log_file_handler.setFormatter(log_formatter) 49 | log.addHandler(log_file_handler) 50 | 51 | 52 | def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0): 53 | chunk = reader.read(chunk_size) 54 | bytes_read += chunk_size 55 | if previous_chunk is not None: 56 | chunk = previous_chunk + chunk 57 | try: 58 | return chunk.decode() 59 | except UnicodeDecodeError: 60 | if bytes_read > max_window_size: 61 | raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes") 62 | log.info(f"Decoding error with {bytes_read:,} bytes, reading another chunk") 63 | return read_and_decode(reader, chunk_size, max_window_size, chunk, bytes_read) 64 | 65 | 66 | def read_lines_zst(file_name): 67 | with open(file_name, 'rb') as file_handle: 68 | buffer = '' 69 | reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle) 70 | while True: 71 | chunk = read_and_decode(reader, 2**27, (2**29) * 2) 72 | 73 | if not chunk: 74 | break 75 | lines = (buffer + chunk).split("\n") 76 | 77 | for line in lines[:-1]: 78 | yield line.strip(), file_handle.tell() 79 | 80 | buffer = lines[-1] 81 | 82 | reader.close() 83 | 84 | 85 | def get_commenters_from_file(subreddit, subreddit_file, subreddit_commenters, total_lines, files_status, from_date, to_date): 86 | file_lines = 0 87 | created = None 88 | file_size = os.stat(subreddit_file).st_size 89 | for line, file_bytes_processed in read_lines_zst(subreddit_file): 90 | total_lines += 1 91 | file_lines += 1 92 | if total_lines % 100000 == 0: 93 | log.info(f"{files_status}: {total_lines:,}: r/{subreddit}: {created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines:,} : {(file_bytes_processed / file_size) * 100:.0f}%") 94 | 95 | try: 96 | obj = json.loads(line) 97 | created = datetime.utcfromtimestamp(int(obj['created_utc'])) 98 | if created < from_date or created > to_date: 99 | continue 100 | 101 | if obj['author'].lower() not in ignored_users: 102 | subreddit_commenters[obj['author']] += 1 103 | except (KeyError, json.JSONDecodeError) as err: 104 | pass 105 | log.info(f"{total_lines:,}: {subreddit_file}: {created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines:,} : 100%") 106 | return total_lines 107 | 108 | 109 | if __name__ == "__main__": 110 | log.info(f"Subreddit's folder: {folder}") 111 | if not os.path.exists(folder): 112 | 
log.error(f"Subreddit's folder either doesn't exist or the script doesn't have access to it: {folder}") 113 | sys.exit() 114 | subreddits = [] 115 | for line in subreddits_string.split("\n"): 116 | subreddit = line.strip() 117 | if subreddit == "": 118 | continue 119 | subreddits.append(subreddit) 120 | 121 | if len(subreddits) <= 10: 122 | log.info(f"Finding overlapping users in {', '.join(subreddits)}") 123 | else: 124 | log.info(f"Finding overlapping users in {len(subreddits)} subreddits") 125 | if require_first_subreddit: 126 | log.info(f"Finding users from the first subreddit that are in any of the other subreddits") 127 | log.info(f"Minimum comments per subreddit set to {min_comments_per_sub}") 128 | log.info(f"Outputting to {output_file_name}") 129 | log.info(f"From date {from_date.strftime('%Y-%m-%d')} to date {to_date.strftime('%Y-%m-%d')}") 130 | 131 | if os.path.exists(ignored_users_file): 132 | with open(ignored_users_file) as fh: 133 | for user in fh.readlines(): 134 | ignored_users.add(user.strip().lower()) 135 | log.info(f"Loaded {len(ignored_users)} ignored users from {ignored_users_file}") 136 | 137 | log.info(f"Checking that subreddit files are present") 138 | 139 | folder_files = {} 140 | for file in os.listdir(folder): 141 | folder_files[file.lower()] = file 142 | 143 | subreddit_stats = [] 144 | for subreddit in subreddits: 145 | subreddit_stat = {"files": 0, "bytes": 0, "name": subreddit} 146 | for file_type in ["submissions", "comments"]: 147 | file_ending = f"_{file_type}.zst" 148 | file_name = folder_files.get(f"{subreddit.lower()}{file_ending}") 149 | if file_name is None: 150 | continue 151 | subreddit_file = os.path.join(folder, file_name) 152 | 153 | subreddit_stat["name"] = file_name[0:-len(file_ending)] 154 | subreddit_stat[file_type] = subreddit_file 155 | subreddit_stat["files"] += 1 156 | subreddit_stat["bytes"] += os.stat(subreddit_file).st_size 157 | 158 | subreddit_stats.append(subreddit_stat) 159 | 160 | subreddit_stats.sort(key=lambda x: x["bytes"], reverse=True) 161 | abort = False 162 | for subreddit_stat in subreddit_stats: 163 | if subreddit_stat["files"] == 0: 164 | log.info(f"No files for {subreddit_stat['name']} exist") 165 | abort = True 166 | else: 167 | log.info(f"r/{subreddit_stat['name']} files total {(subreddit_stat['bytes'] / (2**30)):.2f} gb") 168 | 169 | if abort: 170 | log.error(f"The script can see {len(folder_files)} files in the folder, but not the ones requested: {folder}") 171 | sys.exit(0) 172 | 173 | commenterSubreddits = defaultdict(int) 174 | is_first = True 175 | total_lines = 0 176 | files_processed = 1 177 | for subreddit_stat in subreddit_stats: 178 | commenters = defaultdict(int) 179 | for file_type in ["submissions", "comments"]: 180 | total_lines = get_commenters_from_file( 181 | f"{subreddit_stat['name']}_{file_type}", 182 | subreddit_stat[file_type], 183 | commenters, 184 | total_lines, 185 | f"{files_processed}|{len(subreddit_stats)}", 186 | from_date, 187 | to_date 188 | ) 189 | for commenter in commenters: 190 | if require_first_subreddit and not is_first and commenter not in commenterSubreddits: 191 | continue 192 | if commenters[commenter] >= min_comments_per_sub: 193 | commenterSubreddits[commenter] += 1 194 | is_first = False 195 | files_processed += 1 196 | 197 | if require_first_subreddit: 198 | count_found = 0 199 | with open(output_file_name, 'w') as txt: 200 | txt.write(f"Commenters in r/{subreddits[0]} and at least one of {(', '.join(subreddits))}\n") 201 | for commenter, countSubreddits in 
commenterSubreddits.items(): 202 | if countSubreddits >= 2: 203 | count_found += 1 204 | txt.write(f"{commenter}\n") 205 | log.info(f"{count_found} commenters in r/{subreddits[0]} and at least one of {(', '.join(subreddits))}") 206 | 207 | else: 208 | sharedCommenters = defaultdict(list) 209 | for commenter, countSubreddits in commenterSubreddits.items(): 210 | if countSubreddits >= 2: 211 | sharedCommenters[countSubreddits].append(commenter) 212 | 213 | with open(output_file_name, 'w') as txt: 214 | log.info(f"Writing output to {output_file_name}") 215 | txt.write(f"Commenters in subreddits {(', '.join(subreddits))}\n") 216 | for i in range(len(subreddits)): 217 | commenters = len(sharedCommenters[len(subreddits) - i]) 218 | inner_str = f"but {i} " if i != 0 else "" 219 | log.info(f"{commenters} commenters in all {inner_str}subreddits") 220 | if commenters == 0: 221 | txt.write(f"No commenters in all {inner_str}subreddits\n") 222 | else: 223 | txt.write(f"{commenters} commenters in all {inner_str}subreddits\n") 224 | for user in sorted(sharedCommenters[len(subreddits) - i], key=str.lower): 225 | txt.write(f"{user}\n") 226 | txt.write("\n") 227 | if commenters > 3: 228 | break 229 | -------------------------------------------------------------------------------- /scripts/ignored.txt: -------------------------------------------------------------------------------- 1 | alphabet_order_bot 2 | AmputatorBot 3 | anti-gif-bot 4 | Anti-ThisBot-IB 5 | autotldr 6 | B0tRank 7 | converter-bot 8 | exclaim_bot 9 | GenderNeutralBot 10 | Good_Human_Bot_v2 11 | haikusbot 12 | LearnDifferenceBot 13 | LuckyNumber-Bot 14 | nice___bot 15 | of_patrol_bot 16 | Paid-Not-Payed-Bot 17 | Reddit-Book-Bot 18 | RemindMeBot 19 | SexPanther_Bot 20 | Shakespeare-Bot 21 | sneakpeekbot 22 | TheGratitudeBot 23 | TotesMessenger 24 | Upside_Down-Bot 25 | useles-converter-bot 26 | WaitingToBeTriggered 27 | WaterIsWetBot 28 | wikipedia_answer_bot 29 | WikiSummarizerBot 30 | TheDroidNextDoor 31 | agree-with-you 32 | BadDadBot 33 | Booty_Warrior_bot 34 | dadbot_2 35 | dadbot_3000 36 | FakespotAnalysisBot 37 | Grammar-Bot-Elite 38 | IamYodaBot 39 | LinkifyBot 40 | phonebatterylevelbot 41 | PORTMANTEAU-BOT 42 | queen_of_england_bot 43 | SpambotSwatter 44 | SpunkyDred 45 | tiny_smile_bot 46 | twitterInfo_bot 47 | YoMommaJokeBot 48 | FatFingerHelperBot 49 | WikiTextBot 50 | CommonMisspellingBot 51 | auto-xkcd37 52 | HelperBot_ 53 | imguralbumbot 54 | RepostSleuthBot 55 | wikipedia_text_bot 56 | TheSunflowerSeeds 57 | Bot_Metric 58 | CakeDay--Bot 59 | GoodBot_BadBot 60 | BigLebowskiBot 61 | jashxn 62 | nice-scores 63 | NoGoogleAMPBot 64 | sub_doesnt_exist_bot 65 | WikiMobileLinkBot 66 | EncouragementRobot 67 | AnimalFactsBot 68 | eazeaze 69 | epic_gamer_4268 70 | ghost_of_dongerbot 71 | LimbRetrieval-Bot 72 | LoneKharnivore 73 | video_descriptionbot 74 | WhyNotCollegeBoard 75 | youtubefactsbot 76 | _youtubot_ 77 | Alternative_Case_878 78 | botrickbateman 79 | ClickableLinkBot 80 | I-Am-Dad-Bot 81 | SEND_NUKES_PLZ 82 | UselessConversionBot 83 | Agrees_withyou 84 | AreYouDeaf 85 | autowikibot 86 | ectbot 87 | Generic_Reddit_Bot 88 | happy-cake-day-bot- 89 | Philip_Jeffries 90 | SokkaHaikuBot 91 | table_it_bot 92 | UkraineWithoutTheBot 93 | BotThatSaysBro 94 | ConceptMajestic9156 95 | kelvin_bot 96 | ReverseCaptioningBot 97 | ShelSilverstain 98 | SmileBot-2020 99 | Chuck_Norris_Jokebot 100 | ConvertsToMetric 101 | EmojifierBot 102 | haikubot-1911 103 | I-AM-PIRATE 104 | MaxImageBot 105 | nkid299 106 | resavr_bot 107 | serendipitybot 108 | 
SmallSubBot 109 | smile-bot-2019 110 | YodaOnReddit-Bot 111 | Anti_Fake_Yoda_Bot 112 | AntiObnoxiousBot 113 | bruh__bot 114 | LeEpicRedditor69 115 | lerobinbot 116 | not_so_magic_8_ball 117 | nwordcountbot 118 | oofed-bot 119 | RatedCommentBot 120 | same_post_bot 121 | same_subreddit_bot 122 | SuicideAwarenessBot 123 | thebenshapirobot 124 | these_days_bot 125 | totes_meta_bot 126 | aardBot 127 | gifv-bot 128 | I_Love_You-BOT 129 | imdad_bot 130 | metric_units 131 | YoUaReSoHiLaRiOuS 132 | HIPPAbot 133 | VettedBot 134 | ackchyually_bot 135 | amp-is-watching-you 136 | AntiLowEffortBot 137 | auddbot 138 | BananaFactBot 139 | BlogSpammr 140 | Chick-fil-A_spellbot 141 | CoolDownBot 142 | demonitize_bot 143 | EverySingleThread 144 | GANDHI-BOT 145 | HappyFriendlyBot 146 | icarebot 147 | kzreminderbot 148 | MCTerminologyBot 149 | Mentioned_Videos 150 | morejpeg_auto 151 | profanitycounter 152 | remindditbot 153 | SaveVideo 154 | savevideobot 155 | The-Worst-Bot 156 | theHelperdroid 157 | VredditDownloader 158 | YOUREABOT 159 | YTubeInfoBot 160 | URLfixerBot 161 | TweetsInCommentsBot 162 | SovietRussiaBot 163 | ShibeBot 164 | PressFBot 165 | LittleHelperRobot 166 | LinkFixerBot 167 | LinkFixerBotSnr 168 | Link_Demobilizer 169 | LazyLinkerBot 170 | Darnit_Bot 171 | checks_out_bot 172 | HippoBot9000 173 | could-of-bot 174 | mentionhelper 175 | RossGellerBot 176 | the_timezone_bot -------------------------------------------------------------------------------- /scripts/iterate_folder.py: -------------------------------------------------------------------------------- 1 | # this is an example of iterating over all zst files in a single folder, 2 | # decompressing them and reading the created_utc field to make sure the files 3 | # are intact. It has no output other than the number of lines 4 | 5 | import zstandard 6 | import os 7 | import json 8 | import sys 9 | from datetime import datetime 10 | import logging.handlers 11 | 12 | 13 | log = logging.getLogger("bot") 14 | log.setLevel(logging.DEBUG) 15 | log.addHandler(logging.StreamHandler()) 16 | 17 | 18 | def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0): 19 | chunk = reader.read(chunk_size) 20 | bytes_read += chunk_size 21 | if previous_chunk is not None: 22 | chunk = previous_chunk + chunk 23 | try: 24 | return chunk.decode() 25 | except UnicodeDecodeError: 26 | if bytes_read > max_window_size: 27 | raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes") 28 | log.info(f"Decoding error with {bytes_read:,} bytes, reading another chunk") 29 | return read_and_decode(reader, chunk_size, max_window_size, chunk, bytes_read) 30 | 31 | 32 | def read_lines_zst(file_name): 33 | with open(file_name, 'rb') as file_handle: 34 | buffer = '' 35 | reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle) 36 | while True: 37 | chunk = read_and_decode(reader, 2**27, (2**29) * 2) 38 | 39 | if not chunk: 40 | break 41 | lines = (buffer + chunk).split("\n") 42 | 43 | for line in lines[:-1]: 44 | yield line.strip(), file_handle.tell() 45 | 46 | buffer = lines[-1] 47 | 48 | reader.close() 49 | 50 | 51 | input_folder = sys.argv[1] 52 | input_files = [] 53 | total_size = 0 54 | for subdir, dirs, files in os.walk(input_folder): 55 | for filename in files: 56 | input_path = os.path.join(subdir, filename) 57 | if input_path.endswith(".zst"): 58 | file_size = os.stat(input_path).st_size 59 | total_size += file_size 60 | input_files.append([input_path, file_size]) 61 | 62 | 
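# the input folder comes from the first command line argument above; a hypothetical invocation
# (the path is only an example) would be: python scripts/iterate_folder.py /data/reddit/comments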
log.info(f"Processing {len(input_files)} files of {(total_size / (2**30)):.2f} gigabytes") 63 | 64 | total_lines = 0 65 | total_bytes_processed = 0 66 | for input_file in input_files: 67 | file_lines = 0 68 | file_bytes_processed = 0 69 | created = None 70 | for line, file_bytes_processed in read_lines_zst(input_file[0]): 71 | obj = json.loads(line) 72 | created = datetime.utcfromtimestamp(int(obj['created_utc'])) 73 | file_lines += 1 74 | if file_lines == 1: 75 | log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines + total_lines:,} : 0% : {(total_bytes_processed / total_size) * 100:.0f}%") 76 | if file_lines % 100000 == 0: 77 | log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines + total_lines:,} : {(file_bytes_processed / input_file[1]) * 100:.0f}% : {(total_bytes_processed / total_size) * 100:.0f}%") 78 | total_lines += file_lines 79 | total_bytes_processed += input_file[1] 80 | log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {total_lines:,} : 100% : {(total_bytes_processed / total_size) * 100:.0f}%") 81 | 82 | log.info(f"Total: {total_lines}") 83 | -------------------------------------------------------------------------------- /scripts/single_file.py: -------------------------------------------------------------------------------- 1 | # this is an example of loading and iterating over a single file 2 | 3 | import zstandard 4 | import os 5 | import json 6 | import sys 7 | from datetime import datetime 8 | import logging.handlers 9 | 10 | 11 | log = logging.getLogger("bot") 12 | log.setLevel(logging.DEBUG) 13 | log.addHandler(logging.StreamHandler()) 14 | 15 | 16 | def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0): 17 | chunk = reader.read(chunk_size) 18 | bytes_read += chunk_size 19 | if previous_chunk is not None: 20 | chunk = previous_chunk + chunk 21 | try: 22 | return chunk.decode() 23 | except UnicodeDecodeError: 24 | if bytes_read > max_window_size: 25 | raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes") 26 | log.info(f"Decoding error with {bytes_read:,} bytes, reading another chunk") 27 | return read_and_decode(reader, chunk_size, max_window_size, chunk, bytes_read) 28 | 29 | 30 | def read_lines_zst(file_name): 31 | with open(file_name, 'rb') as file_handle: 32 | buffer = '' 33 | reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle) 34 | while True: 35 | chunk = read_and_decode(reader, 2**27, (2**29) * 2) 36 | 37 | if not chunk: 38 | break 39 | lines = (buffer + chunk).split("\n") 40 | 41 | for line in lines[:-1]: 42 | yield line, file_handle.tell() 43 | 44 | buffer = lines[-1] 45 | 46 | reader.close() 47 | 48 | 49 | if __name__ == "__main__": 50 | file_path = sys.argv[1] 51 | file_size = os.stat(file_path).st_size 52 | file_lines = 0 53 | file_bytes_processed = 0 54 | created = None 55 | field = "subreddit" 56 | value = "wallstreetbets" 57 | bad_lines = 0 58 | # try: 59 | for line, file_bytes_processed in read_lines_zst(file_path): 60 | try: 61 | obj = json.loads(line) 62 | created = datetime.utcfromtimestamp(int(obj['created_utc'])) 63 | temp = obj[field] == value 64 | except (KeyError, json.JSONDecodeError) as err: 65 | bad_lines += 1 66 | file_lines += 1 67 | if file_lines % 100000 == 0: 68 | log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines:,} : {bad_lines:,} : {file_bytes_processed:,}:{(file_bytes_processed / file_size) * 100:.0f}%") 69 | 70 | # except Exception as err: 71 | # log.info(err) 72 | 73 | log.info(f"Complete : 
{file_lines:,} : {bad_lines:,}") 74 | 75 | -------------------------------------------------------------------------------- /scripts/to_csv.py: -------------------------------------------------------------------------------- 1 | # this converts a zst file to csv 2 | # 3 | # it's important to note that the resulting file will likely be quite large 4 | # and you probably won't be able to open it in excel or another csv reader 5 | # 6 | # arguments are inputfile, outputfile, fields 7 | # call this like 8 | # python to_csv.py wallstreetbets_submissions.zst wallstreetbets_submissions.csv author,selftext,title 9 | 10 | import zstandard 11 | import os 12 | import json 13 | import sys 14 | import csv 15 | from datetime import datetime 16 | import logging.handlers 17 | 18 | 19 | # put the path to the input file 20 | input_file_path = r"\\MYCLOUDPR4100\Public\tools\PushshiftDumps\Straight-Wrap-172_submissions.zst" 21 | # put the path to the output file, with the csv extension 22 | output_file_path = r"\\MYCLOUDPR4100\Public\Straight-Wrap-172_submissions.csv" 23 | # if you want a custom set of fields, put them in the following list. If you leave it empty the script will use a default set of fields 24 | fields = [] 25 | 26 | log = logging.getLogger("bot") 27 | log.setLevel(logging.DEBUG) 28 | log.addHandler(logging.StreamHandler()) 29 | 30 | 31 | def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0): 32 | chunk = reader.read(chunk_size) 33 | bytes_read += chunk_size 34 | if previous_chunk is not None: 35 | chunk = previous_chunk + chunk 36 | try: 37 | return chunk.decode() 38 | except UnicodeDecodeError: 39 | if bytes_read > max_window_size: 40 | raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes") 41 | return read_and_decode(reader, chunk_size, max_window_size, chunk, bytes_read) 42 | 43 | 44 | def read_lines_zst(file_name): 45 | with open(file_name, 'rb') as file_handle: 46 | buffer = '' 47 | reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle) 48 | while True: 49 | chunk = read_and_decode(reader, 2**27, (2**29) * 2) 50 | if not chunk: 51 | break 52 | lines = (buffer + chunk).split("\n") 53 | 54 | for line in lines[:-1]: 55 | yield line, file_handle.tell() 56 | 57 | buffer = lines[-1] 58 | reader.close() 59 | 60 | 61 | if __name__ == "__main__": 62 | if len(sys.argv) >= 3: 63 | input_file_path = sys.argv[1] 64 | output_file_path = sys.argv[2] 65 | fields = sys.argv[3].split(",") 66 | 67 | is_submission = "submission" in input_file_path 68 | if not len(fields): 69 | if is_submission: 70 | fields = ["author","title","score","created","link","text","url"] 71 | else: 72 | fields = ["author","score","created","link","body"] 73 | 74 | file_size = os.stat(input_file_path).st_size 75 | file_lines, bad_lines = 0, 0 76 | line, created = None, None 77 | output_file = open(output_file_path, "w", encoding='utf-8', newline="") 78 | writer = csv.writer(output_file) 79 | writer.writerow(fields) 80 | try: 81 | for line, file_bytes_processed in read_lines_zst(input_file_path): 82 | try: 83 | obj = json.loads(line) 84 | output_obj = [] 85 | for field in fields: 86 | if field == "created": 87 | value = datetime.fromtimestamp(int(obj['created_utc'])).strftime("%Y-%m-%d %H:%M") 88 | elif field == "link": 89 | if 'permalink' in obj: 90 | value = f"https://www.reddit.com{obj['permalink']}" 91 | else: 92 | value = f"https://www.reddit.com/r/{obj['subreddit']}/comments/{obj['link_id'][3:]}/_/{obj['id']}/" 93 | elif field == 
"author": 94 | value = f"u/{obj['author']}" 95 | elif field == "text": 96 | if 'selftext' in obj: 97 | value = obj['selftext'] 98 | else: 99 | value = "" 100 | else: 101 | value = obj[field] 102 | 103 | output_obj.append(str(value).encode("utf-8", errors='replace').decode()) 104 | writer.writerow(output_obj) 105 | 106 | created = datetime.utcfromtimestamp(int(obj['created_utc'])) 107 | except json.JSONDecodeError as err: 108 | bad_lines += 1 109 | file_lines += 1 110 | if file_lines % 100000 == 0: 111 | log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines:,} : {bad_lines:,} : {(file_bytes_processed / file_size) * 100:.0f}%") 112 | except KeyError as err: 113 | log.info(f"Object has no key: {err}") 114 | log.info(line) 115 | except Exception as err: 116 | log.info(err) 117 | log.info(line) 118 | 119 | output_file.close() 120 | log.info(f"Complete : {file_lines:,} : {bad_lines:,}") 121 | 122 | --------------------------------------------------------------------------------