├── .gitignore
├── .gitmodules
├── 4grep
├── LICENSE
├── Makefile
├── README.md
├── bitmap
    ├── .gitignore
    ├── Makefile
    ├── exec
    │   └── .gitignore
    ├── lib
    │   ├── minunit.h
    │   └── portable_endian.h
    ├── main
    │   ├── generate_bitmap.c
    │   └── test.c
    └── src
    │   ├── bitmap.c
    │   ├── bitmap.h
    │   ├── filter.c
    │   ├── filter.h
    │   ├── packfile.c
    │   ├── packfile.h
    │   ├── util.c
    │   └── util.h
├── build_deb.sh
├── debian
    ├── 4grep.links
    ├── changelog
    ├── compat
    ├── control
    ├── copyright
    ├── postinst
    ├── rules
    └── source
    │   ├── format
    │   └── options
├── description
├── disp_bitmap.py
├── docker_build
    ├── Dockerfile
    └── docker_build_deb.sh
├── img
    ├── example.gif
    └── zgrepvs4grep.png
├── test.py
└── tune.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | .*.swp
 2 | .*.swo
 3 | *.orig
 4 | *.DS_Store
 5 | **/*~
 6 | **/*.pyc
 7 | 4grepc
 8 | debian/4grep
 9 | debian/4grep.substvars
10 | debian/4grep.debhelper.log
11 | debian/files
12 | debian/debhelper-build-stamp
13 | compile_commands.json
14 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
 1 | [submodule "zstd"]
 2 | 	path = bitmap/lib/zstd
 3 | 	url = https://github.com/facebook/zstd
 4 | [submodule "bitmap/lib/xxHash"]
 5 | 	path = bitmap/lib/xxhash
 6 | 	url = https://github.com/Cyan4973/xxHash
 7 | 	ignore = dirty
 8 | [submodule "bitmap/lib/zstd"]
 9 | 	path = bitmap/lib/zstd
10 | 	url = https://github.com/facebook/zstd.git
11 | 


--------------------------------------------------------------------------------
/4grep:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | from __future__ import print_function
  3 | from distutils.spawn import find_executable
  4 | from multiprocessing.pool import ThreadPool
  5 | from ctypes.util import find_library
  6 | from contextlib import contextmanager
  7 | from collections import deque
  8 | from subprocess import PIPE
  9 | from Queue import Empty
 10 | 
 11 | import multiprocessing as mp
 12 | import subprocess
 13 | import threading
 14 | import itertools
 15 | import argparse
 16 | import tempfile
 17 | import getpass
 18 | import shutil
 19 | import signal
 20 | import ctypes as ct
 21 | import errno
 22 | import math
 23 | import time
 24 | import sys
 25 | import os
 26 | import re
 27 | 
# Minimum length a string must have to be usable as a filter string
# (see StringIndex and get_index).
NGRAM_CHARS = 5
# Directory that contains this script, with symlinks resolved.
TGREP_DIR = os.path.dirname(os.path.realpath(__file__))
# Candidate locations of the shared library next to the script.
MODULE_PATHS = [os.path.join(TGREP_DIR, module_name)
		for module_name in ("bitmap/4grep.so", "4grep.so")]
# Characters treated as regex metacharacters by get_index_from_regex, and
# the same set escaped for use inside a character class.
REGEX_METACHARACTERS = r".^$*+?{}[]\|()"
ESCAPED_REGEX_METACHARACTERS = re.escape(REGEX_METACHARACTERS)

# Pick the first candidate that exists on disk; otherwise fall back to the
# system library search and give up if that finds nothing either.
try:
	module_path = next(m for m in MODULE_PATHS if os.path.isfile(m))
except StopIteration:
	module_path = find_library("4grep")
	if module_path is None:
		# NOTE(review): this error goes to stdout, not stderr.
		print("4grep: Error: 4grep.so not found")
		sys.exit(-1)
 42 | 
# ctypes layout: an int length plus a pointer to ints. Used as the return
# type of strings_to_sorted_indices; presumably mirrors a struct declared in
# the C library -- verify against the C headers.
class intarray(ct.Structure):
	_fields_ = [("length", ct.c_int), ("data", ct.POINTER(ct.c_int))]
 45 | 
# ctypes layout: a row count plus a pointer to intarray rows. Passed by
# value as the first argument of start_filter.
class intarrayarray(ct.Structure):
	_fields_ = [("num_rows", ct.c_int), ("rows", ct.POINTER(intarray))]
 48 | 
# Load the shared library found above and declare the signatures of the C
# functions this script calls.
mymod = ct.cdll.LoadLibrary(module_path)

# Takes an array of C strings and its length; returns an intarray.
strings_to_sorted_indices = mymod.strings_to_sorted_indices
strings_to_sorted_indices.argtypes = [ct.POINTER(ct.c_char_p), ct.c_int]
strings_to_sorted_indices.restype = intarray

# Takes the index struct, a file name and the index directory; the integer
# result is interpreted in do_filter_and_grep.
start_filter = mymod.start_filter
start_filter.argtypes = [intarrayarray, ct.c_char_p, ct.c_char_p]
start_filter.restype = ct.c_int

# Takes the index directory. No restype is set, so ctypes assumes int.
pack = mymod.pack_loose_files
pack.argtypes = [ct.c_char_p]

# Called without arguments; returns the default index directory as a C string.
get_index_directory = mymod.get_index_directory
get_index_directory.restype = ct.c_char_p
 64 | 
 65 | HELP = '''\033[1m4grep\033[0m: fast grep using multiple cpus and 4gram filter
 66 | 
 67 | \033[1mSIMPLE USAGE\033[0m
 68 | 	4grep <regex> <filelist>
 69 | 	find <args> | 4grep <regex>
 70 | 
 71 | \033[1mADVANCED USAGE\033[0m
 72 | 	4grep --filter <filter string> <regex> <filelist>
 73 | 	4grep --filter <filter string1> --filter <filter string2> <regex> <filelist>
 74 | 	4grep <regex> <filelist> --cores N --indexdir path/to/index
 75 | 
 76 | \033[1mOPTIONAL ARGUMENTS\033[0m
 77 | 	--filter 		specify a filter string
 78 | 	--cores			limit number of cores used
 79 | 	--excludes		exclude files and directories by regex
 80 | 	--indexdir		specify directory to store index
 81 | 
 82 | \033[1mDESCRIPTION\033[0m
 83 | 	For standard use, 4grep takes in two parameters: a non-regex string
 84 | 	and the list of files. The string is first used to filter out files that
 85 | 	have no instances of the string anywhere in the file. It will the grep using
 86 | 	the string to find the lines where the string exists.
 87 | 
 88 | 	The more advanced uses breaks down the standard case. You can specify the
 89 | 	filter string and regex seperately. The filter string will be used to
 90 | 	filter any files that have no instances of the string anywhere in the file.
 91 | 	The regex is then passed onto grep for these subset of files and will give
 92 | 	you the lines which contain the regex.
 93 | 
 94 | 	Hence an advanced usage of 4grep may be that the list of files are first
 95 | 	filtered so that only files that have a certain string remain. Then the
 96 | 	regex will grep for lines that contain something else.
 97 | 
 98 | 	[--cores] was added to limit the number of cores that 4grep uses. If not
 99 | 	specified, or too large, the program will use the maximum number of cores -1.
100 | 
101 | \033[1mEXAMPLES\033[0m
102 | 	$ 4grep WARNING foo/bar/log.gz
103 | 	This will search for WARNING in the file 'log.gz', first filtering then grep
104 | 
105 | 	$ 4grep --filter WARNING [0-9] foo/bar/log.gz
106 | 	This is more advanced use. First the list of files will be filtered so that
107 | 	only files with the string 'WARNING' remain. Then the regex '[0-9]'
108 | 	will be used to grep and so any line that contains a number, in this subset
109 | 	of files, will be printed.
110 | 
111 | 	$ 4grep --exclude='bar|fizz' 'STACKTRACE' foo///.log
112 | 	This will search all of the files matching the regex
113 | 	.*foo.*/.*/.*/.*.log.* and exclude all directories named 'bar' or
114 | 	'fizz'.
115 | 
116 | \033[1mNOTES\033[0m
117 | 	- Filter strings are auto-detected from the regex in simple cases.
118 | 	- Filter strings must be at least 5 characters
119 | 	- Filter strings do not support regex yet and so are parsed as a
120 | 	  literal string
121 | 	'''
122 | 
def run_pack_process(*args):
	"""Target of the packing subprocess (see start_pack_process).

	Ignores SIGINT in this process, then forwards ``args`` unchanged to
	the C ``pack`` function.
	"""
	ignore_sigint()
	pack(*args)
126 | 
127 | 
# adapted from answers at https://stackoverflow.com/questions/5081657/
@contextmanager
def redirect(from_file, to_file):
	"""
	Redirects output to from_file's file descriptor into to_file's
	descriptor.

	For example, the following redirects stdout to stderr:

		import sys, os

		with redirect(sys.stdout, sys.stderr):
			print('Hello, world!')  # outputs to stderr
			os.system("echo hello world")  # also outputs to stderr

	from_file is flushed when entering (so earlier output still reaches
	its original destination) and again when exiting, before the
	descriptor is restored (so output written inside the context reaches
	to_file). to_file is flushed when exiting.

	You probably don't want to write to both from_file and to_file under
	this context.

	Both arguments must be file objects providing fileno() and flush().
	"""
	from_fd = from_file.fileno()

	# Anything buffered before entering belongs to the original target.
	from_file.flush()

	# backup from_file fd by dup'ing it
	dup_from_fd = os.dup(from_fd)
	try:
		# replace from_file fd with to_file dup fd
		os.dup2(to_file.fileno(), from_fd)
		try:
			yield
		finally:
			try:
				# Data written through from_file while redirected is
				# still sitting in its userspace buffer; push it out
				# while the descriptor still points at to_file.
				from_file.flush()
			finally:
				# restore original from_file fd
				os.dup2(dup_from_fd, from_fd)
				to_file.flush()
	finally:
		# get rid of backup from_file fd, even if the redirection failed
		os.close(dup_from_fd)
167 | 
def filter_and_grep_worker_func(in_queue, out_queue, options, regex, index,
		index_dir, quit_flag):
	"""Worker-process loop: take (position, filename) jobs and emit results.

	Jobs are read from ``in_queue``; a ``None`` job ends the worker. Each
	job runs do_filter_and_grep on a one-thread pool so that this loop can
	keep polling ``quit_flag`` while the search is in progress. Results
	are put on ``out_queue``. SIGINT is ignored in the worker.
	"""
	ignore_sigint()
	pool = ThreadPool(1)
	while True:
		if quit_flag.value:
			break
		try:
			job = in_queue.get(timeout=1)
			if job is None:
				return
			position, filename = job
			pending = pool.apply_async(
				do_filter_and_grep,
				(position, options, regex, filename, index, index_dir))
			# Poll once a second so an abort request is noticed promptly.
			while True:
				if pending.ready():
					break
				pending.wait(1.0)
				if quit_flag.value:
					pool.terminate()
					return
			out_queue.put(pending.get())
		except Empty:
			# Nothing queued within the timeout: re-check the quit flag.
			continue
189 | 
def do_filter_and_grep(i, options, regex, f, index=None, index_dir=None):
	"""Filter file ``f`` through the index, then grep it unless filtered out.

	Returns ``(i, output, err, (bitmapped, filtered))`` where ``output``
	and ``err`` are the collected stdout/stderr text, ``bitmapped`` and
	``filtered`` are booleans derived from start_filter's return code.
	When ``index`` is missing or empty the file is always grepped.
	"""
	# Return codes of start_filter that need handling here. The remaining
	# code, 3 ("no bitmap, match"), is the default and needs none.
	BTMP_MTCH = 1
	BTMP_NOMTCH = 2
	NOBTMP_NOMTCH = 4
	bitmapped = False
	filtered = False
	output = ""
	err = ""

	if index and not index.empty():
		assert index_dir is not None
		index_dir_char_p = ct.c_char_p(index_dir)
		c_filename = ct.c_char_p(f)
		filter_struct = index.get_index_struct()
		# The C code writes diagnostics to stderr; capture them in a
		# temporary file so they can be returned with the result.
		with tempfile.TemporaryFile() as captured:
			with redirect(sys.stderr, captured):
				ret = start_filter(
					filter_struct, c_filename, index_dir_char_p)
			captured.seek(0)
			err = captured.read()

		bitmapped = ret in (BTMP_MTCH, BTMP_NOMTCH)
		filtered = ret in (NOBTMP_NOMTCH, BTMP_NOMTCH)

	if filtered:
		return (i, output, err, (bitmapped, filtered))

	output, grep_err = do_grep(options, regex, f)
	err += grep_err
	return (i, output, err, (bitmapped, filtered))
218 | 
def do_grep(options, regex, f):
	"""Run ``zgrep`` with ``options`` for ``regex`` on file ``f``.

	Returns the pair ``(stdout, stderr)`` collected from the child.
	"""
	# see https://blog.nelhage.com/2010/02/a-very-subtle-bug/
	# or http://bugs.python.org/issue1652 for why we need to handle SIGPIPE
	def restore_default_sigpipe():
		signal.signal(signal.SIGPIPE, signal.SIG_DFL)

	command = ["zgrep"] + options
	command.extend(["--", regex, f])
	child = subprocess.Popen(
		command,
		stdout=subprocess.PIPE,
		stderr=subprocess.PIPE,
		preexec_fn=restore_default_sigpipe)
	captured_out, captured_err = child.communicate()
	return (captured_out, captured_err)
228 | 
def print_progress_bar(progress, done, tracelog):
	"""Write the status line for ``progress`` to stderr.

	Nothing is printed while no files are known. Before the first result
	arrives only the elapsed time is shown. When ``done`` is true the
	final summary is printed, the totals are copied onto ``tracelog`` and
	the run is recorded with print_to_log; otherwise ``tracelog`` is unused.
	"""
	total = progress.total_files
	processed = progress.count
	elapsed = time.time() - progress.init_time
	if total == 0:
		return
	mins, secs = divmod(elapsed, 60)

	if processed == 0:
		line = ('>{bold}Done:{end}{:5.1f}% of {}{}{end} '
			'{bold}Elapsed:{end}{:g}m{:>04.1f}s{}').format(
			0, progress.color, total,
			mins, secs, Color.CLEAR_END+Color.UP, bold=Color.BOLD,
			end=Color.END)
		print(line, file=sys.stderr)
		return

	# Estimate the remaining time from the average time per file so far.
	eta_mins, eta_secs = divmod((total-processed)*elapsed/processed, 60)
	bitmapped_p = progress.bitmapped*100.0/processed
	filtered_p = progress.filtered*100.0/processed

	if done:
		line = ('>{bold}{}Finished:{end}{} files '
			'{bold}Elapsed:{end}{:g}m{:>04.1f}s '
			'{bold}Bitmapped:{end}{:5.1f}% '
			'{bold}Filtered:{end}{:5.1f}%{}').format(
			Color.GREEN, total, mins, secs, bitmapped_p,
			filtered_p, Color.CLEAR_END, bold=Color.BOLD, end=Color.END)
		print(line, file=sys.stderr)
		tracelog.bitmapped = bitmapped_p
		tracelog.filtered = filtered_p
		tracelog.total_files = total
		tracelog.elapsed = elapsed
		print_to_log(tracelog)
		return

	# Round the percentage down to one decimal place.
	done_p = math.floor(processed*1000.0/total)/10
	line = ('>{bold}Done:{end}{:5.1f}% of {}{}{end} '
		'{bold}Elapsed:{end}{:g}m{:>04.1f}s '
		'{bold}Bitmapped:{end}{:5.1f}% '
		'{bold}Filtered:{end}{:5.1f}% '
		'{bold}ETA:{end}{:g}m{:>04.1f}s{}').format(
		done_p, progress.color, total, mins, secs,
		bitmapped_p, filtered_p, eta_mins, eta_secs,
		Color.CLEAR_END+Color.UP, bold=Color.BOLD, end=Color.END)
	print(line, file=sys.stderr)
273 | 
274 | 
def print_accumulated(progress):
	"""Print, in submission order, every result that is ready.

	``progress.gout`` maps a file's position to its grep output and
	``progress.printed`` is the next position to print. Output stops at
	the first position whose result has not arrived yet. The status line
	on stderr is cleared once before anything is printed.
	"""
	ready = progress.gout
	if progress.printed not in ready:
		return
	print(Color.CLEAR_LINE, end='', file=sys.stderr)
	while progress.printed in ready:
		print(ready[progress.printed], end='')
		progress.printed += 1
282 | 
def start_pack_process(progress, bitmap_store_dir_char_p):
	"""Start a new packing subprocess, waiting for the previous one first.

	The new process runs run_pack_process with the index directory and is
	stored on ``progress.pack_process``.
	"""
	previous = progress.pack_process
	if previous:
		previous.join()
	packer = mp.Process(
		target=run_pack_process,
		args=(bitmap_store_dir_char_p,))
	progress.pack_process = packer
	packer.start()
290 | 
def handle_results(result_queue, progress, index_dir):
	"""Fold every queued worker result into ``progress`` and refresh output."""
	while True:
		if result_queue.empty():
			break
		result = result_queue.get(timeout=1)
		update_progress(result, progress, index_dir)
	output_progress(progress)
295 | 
def update_progress(result, progress, bitmap_store_dir_char_p):
	"""Record one worker result in ``progress``.

	``result`` is the tuple built by do_filter_and_grep:
	``(position, output, err, (bitmapped, filtered))``. After every
	1000th result, and after the last one, a new packing process is
	started unless the current one is still running.
	"""
	i, output, err, b = result
	if err:
		progress.error_queue.append(err)
	progress.bitmapped += b[0]
	progress.filtered += b[1]
	progress.gout[i] = output
	progress.count += 1

	at_checkpoint = (progress.count % 1000 == 0
		or progress.count == progress.total_files)
	if at_checkpoint and not progress.pack_process.is_alive():
		start_pack_process(progress, bitmap_store_dir_char_p)
307 | 
308 | 
def output_progress(progress):
	"""Flush queued error text to stderr, then print results and status."""
	errors = progress.error_queue
	while errors:
		message = errors.popleft()
		print(Color.CLEAR_LINE, end='', file=sys.stderr)
		print(message, end='', file=sys.stderr)
	print_accumulated(progress)
	print_progress_bar(progress, False, None)
316 | 
def queue_generator(queue, generator):
	"""Append every item of ``generator`` to ``queue``, then a final ``None``.

	The trailing ``None`` tells the consumer that the input is exhausted.
	"""
	push = queue.append
	for entry in generator:
		push(entry)
	push(None)
321 | 
# ANSI terminal escape sequences used for the status output on stderr.
class Color:
	GREEN = '\033[92m'  # bright green foreground
	RED = '\033[91m'  # bright red foreground
	BOLD = '\033[1m'  # bold text
	END = '\033[0m'  # reset all attributes
	UP = '\033[F'  # move the cursor to the start of the previous line
	CLEAR_END = '\033[K'  # erase from the cursor to the end of the line
	CLEAR_LINE = '\x1b[2K'  # erase the whole current line
330 | 
class SearchProgress(object):
	"""Mutable bookkeeping shared by the search loop and its output helpers."""

	def __init__(self):
		# Start time of the run; smp_loop overwrites it from the trace log.
		self.init_time = 0
		# Counters: results received, how many of those were bitmapped or
		# filtered, results already printed, and files known so far.
		self.count = self.bitmapped = self.filtered = 0
		self.printed = self.total_files = 0
		# Grep output keyed by the file's position in submission order.
		self.gout = {}
		# Error text waiting to be written to stderr.
		self.error_queue = deque()
		# Colour of the file total in the status line.
		self.color = Color.RED + Color.BOLD
		# Most recently started packing subprocess, if any.
		self.pack_process = None
343 | 
344 | 
def ignore_sigint():
	"""Make the calling process ignore SIGINT (Ctrl-C) from now on."""
	signal.signal(signal.SIGINT, signal.SIG_IGN)
347 | 
def smp_loop(options, files, index, tracelog):
	"""Search ``files`` in parallel and print results in submission order.

	options:  list of extra command-line flags passed through to zgrep.
	files:    iterable of file names; consumed on a background thread.
	index:    StringIndex used to pre-filter files (may be empty).
	tracelog: TraceLog carrying the regex, the requested core count, the
	          index directory and the start time; it receives the final
	          statistics when the run completes.

	One worker process is started per core. The number of workers is
	``cpu_count() - 1`` capped by ``tracelog.cores``, but never less than
	one: with zero workers nothing would ever be searched.

	On Ctrl-C the workers are told to quit and the process exits with
	status 1.
	"""
	index_dir = tracelog.indexdir_abs
	progress = SearchProgress()
	progress.init_time = tracelog.init_time
	file_queue = deque()
	file_queueing_thread = threading.Thread(target=queue_generator,
		args=(file_queue, files))
	file_queueing_thread.daemon = True
	file_queueing_thread.start()
	# Leave one core to this coordinating process where possible, but keep
	# at least one worker on single-core hosts and for --cores values < 1.
	available = max(1, mp.cpu_count() - 1)
	cores = available
	if tracelog.cores:
		cores = max(1, min(available, tracelog.cores))
	print('{bold}using {} cores{end}\n'.format(cores, bold=Color.BOLD,
		end=Color.END), file=sys.stderr)
	filter_and_grep_work_input_queue = mp.Queue()
	output_queue = mp.Queue()
	quit_flag = mp.Value("i", 0)
	processes = [mp.Process(
		target=filter_and_grep_worker_func,
		args=(filter_and_grep_work_input_queue, output_queue, options,
			tracelog.regex, index, index_dir, quit_flag))
		for i in range(cores)]
	for p in processes:
		p.daemon = True
		p.start()
	try:
		start_pack_process(progress, index_dir)
		work_queued = 0
		while True:
			# Keep draining results while waiting for more file names.
			while len(file_queue) == 0:
				handle_results(output_queue, progress, index_dir)
				time.sleep(0.1)
			f = file_queue.popleft()
			if f is None:
				# queue_generator's end-of-input marker
				break
			if not os.path.exists(f):
				print(Color.CLEAR_LINE + '4grep: {}: No such file or directory'.format(f),
						file=sys.stderr)
				continue
			if os.path.isdir(f):
				print(Color.CLEAR_LINE + '4grep: {}: Is a directory'.format(f),
						file=sys.stderr)
				continue
			progress.total_files = len(file_queue) + work_queued
			filter_and_grep_work_input_queue.put((work_queued, f))
			handle_results(output_queue, progress, index_dir)
			work_queued += 1
		# One None per worker tells each of them to exit.
		for _ in range(cores):
			filter_and_grep_work_input_queue.put(None)
		progress.total_files = work_queued
		progress.color = Color.GREEN + Color.BOLD
		for p in processes:
			while p.is_alive():
				handle_results(output_queue, progress,
						index_dir)
				time.sleep(0.1)
		handle_results(output_queue, progress, index_dir)
		if progress.count != 0:
			print_progress_bar(progress, True, tracelog)
		else:
			print(Color.CLEAR_LINE + '4grep: no files found', file=sys.stderr)
	except KeyboardInterrupt:
		ignore_sigint()  # prevent interruption of interrupt handling
		print(file=sys.stderr)
		print(Color.END + "Aborting 4grep...", file=sys.stderr)
		quit_flag.value = 1
		empty_queue(filter_and_grep_work_input_queue)
		for p in processes:
			p.join()
		sys.exit(1)
417 | 
def empty_queue(queue):
	"""Discard every item currently waiting in ``queue``."""
	while True:
		if queue.empty():
			return
		try:
			queue.get_nowait()
		except Empty:
			# Another consumer took the item first; check again.
			continue
424 | 
class stdin_iter:
	"""Iterator over the file names piped in on standard input.

	Each line is one name, with surrounding whitespace removed. Blank
	lines are skipped; iteration ends only at end of input, so a stray
	empty line no longer drops the rest of the list.
	"""
	def __init__(self):
		pass

	def __iter__(self):
		return self

	def next(self):
		while True:
			line = sys.stdin.readline()
			if not line:
				# readline() returns '' only at end of input
				raise StopIteration
			name = line.strip()
			if name:
				return name

	# Python 3 spelling of the iterator protocol.
	__next__ = next
437 | 
class regex_iter:
	"""Iterator over files selected by a '/'-separated chain of regexes.

	``regex`` is split on '/'; component k is wrapped in ``.*`` on both
	sides and matched against the entry names at depth k below the
	current directory. Intermediate components select directories, the
	last one selects files. Results are produced depth-first in sorted
	order.

	``excludes`` is an optional regex. Below the top level an entry is
	dropped when the regex matches the start of its relative path or the
	start of its own name, so ``--exclude='bar|fizz'`` removes entries
	named 'bar...' or 'fizz...' at any depth as the help text promises.

	State: ``level`` is the current depth, ``height`` the index of the last
	component, and ``ls[k]`` the candidates still to visit at depth k.
	"""
	def __init__(self, regex, excludes):
		self.regex = [".*"+r+".*" for r in regex.split("/")]
		self.level = 0
		self.height = len(self.regex) - 1
		self.ls = ['' for r in self.regex]
		self.excludes = excludes
		top = sorted([f for f in os.listdir('.')
			if re.match(self.regex[0], f)])
		if self.height > 0:
			# Only directories can be descended into; a regular file that
			# happens to match would make os.listdir() raise later.
			top = [f for f in top if os.path.isdir(f)]
		self.ls[0] = top

	def __iter__(self):
		return self

	def _is_excluded(self, path):
		# True when the exclude regex matches the path or its last component.
		if not self.excludes:
			return False
		return bool(re.match(self.excludes, path)
			or re.match(self.excludes, os.path.basename(path)))

	def walkup(self):
		"""Climb back to the deepest level that still has candidates.

		Raises StopIteration when nothing is left at any level.
		"""
		while self.level and not self.ls[self.level]:
			self.level -= 1
		if self.level == 0 and not self.ls[0]:
			raise StopIteration

	def next(self):
		self.walkup()
		while self.level < self.height:
			# Descend into the next pending directory of this level.
			parent = self.ls[self.level].pop(0)
			self.level += 1
			pattern = self.regex[self.level]
			children = sorted([parent+'/'+ f for f in os.listdir(parent)
				if re.match(pattern, f)])
			children = [c for c in children if not self._is_excluded(c)]

			# Directories while descending, plain files at the last level.
			if self.level < self.height:
				wanted = os.path.isdir
			else:
				wanted = os.path.isfile
			self.ls[self.level] = [c for c in children if wanted(c)]
			self.walkup()
		return self.ls[self.level].pop(0)

	# Python 3 spelling of the iterator protocol.
	__next__ = next
481 | 
def intersect(a, b):
	"""Return a list of the distinct items present in both ``a`` and ``b``.

	The order of the result is unspecified.
	"""
	common = set(a)
	common &= set(b)
	return list(common)
484 | 
def get_index_from_regex(regex):
	"""
	Derive a StringIndex from a regex without fully parsing it.

	Three simple shapes are recognised, tried in this order:
	1. literals joined only by '.', '.*' or '+': every literal that is
	   long enough must occur, so they are ANDed together;
	2. several expressions of shape 1 separated by '|': the per-branch
	   indices are ORed, and the result is empty if any branch is empty;
	3. anything else without '|', '?', '*', '{,' or '{0': the literal
	   runs at the very start and end of the regex are used.
	An empty index is returned when none of these apply.
	"""
	literal_char = "[^{}]".format(ESCAPED_REGEX_METACHARACTERS)
	wildcard = r"(\.\*?|\+)"
	# a token is anything we can parse an ANDed index from
	token = "({}|{})".format(literal_char, wildcard)

	# shape 1: only literals and simple wildcards
	if re.match("^{}+$".format(token), regex):
		pieces = re.split("{}+".format(wildcard), regex)
		return StringIndex([[p for p in pieces if len(p) >= NGRAM_CHARS]])

	# shape 2: alternatives, each of shape 1
	if re.match(r"^{0}+(\|{0}+)+$".format(token), regex):
		branches = tuple(get_index_from_regex(part)
			for part in regex.split("|"))
		for branch in branches:
			if branch.empty():
				return empty_index()
		return StringIndex(tuple(branch.strings[0] for branch in branches))

	# shape 3: literal prefix and suffix, unless optional parts could
	# make them absent from a match
	for risky in ('|', '?', '*', '{,', '{0'):
		if risky in regex:
			return empty_index()
	head = re.match("^{}+".format(literal_char), regex)
	tail = re.search("{}+$".format(literal_char), regex)
	if head and tail:
		head_text = head.group()
		tail_text = tail.group()
		if head_text == tail_text:
			return StringIndex([[head_text]])
		return StringIndex([[head_text, tail_text]])
	if head:
		return StringIndex([[head.group()]])
	if tail:
		return StringIndex([[tail.group()]])
	return empty_index()
521 | 
class StringIndex(object):
	""" Represents a search index.

	strings: an iterable of iterables of strings. Each inner group is
	ANDed together and the groups are ORed with each other.

	For example,
		StringIndex([["alpha", "bravo"], ["charlie"]])
	represents the index matching ("alpha" AND "bravo") OR "charlie".

	Strings shorter than NGRAM_CHARS are dropped. If that leaves any
	group without strings, the whole index becomes empty.
	"""
	def __init__(self, strings):
		groups = []
		for sublist in tuple(strings):
			groups.append(tuple(
				s for s in sublist if len(s) >= NGRAM_CHARS))
		# One empty group empties the whole index.
		self.strings = tuple(groups) if all(groups) else ()

	def empty(self):
		""" Returns True when the index holds no strings at all. """
		return len(self.strings) == 0

	def get_index_struct(self):
		""" Returns a struct suitable for passing to our C code as an
		index. Must not be called on an empty index.
		"""
		rows = []
		assert not self.empty()
		for group in self.strings:
			assert len(group) != 0
			assert all(len(s) >= NGRAM_CHARS for s in group)
			c_strings = (ct.c_char_p * len(group)) (*group)
			rows.append(strings_to_sorted_indices(c_strings, len(group)))
		struct = intarrayarray()
		struct.num_rows = len(rows)
		struct.rows = (intarray * len(rows)) (*rows)
		return struct

	def __str__(self):
		return ' OR '.join(' AND '.join(group) for group in self.strings)

	def __eq__(self, other):
		if type(self) is not type(other):
			return False
		return self.strings == other.strings

	def __ne__(self, other):
		return not self.__eq__(other)

	def __repr__(self):
		return "StringIndex({})".format(str(self))
574 | 
def empty_index():
	""" Returns a StringIndex with no strings, i.e. one that cannot filter. """
	return StringIndex([])
577 | 
def get_index(args):
	""" Returns a StringIndex parsed from the args.

	With --filter, the filter strings that are long enough are ANDed
	together; if none is long enough a notice is printed and an empty
	index is returned. Without --filter the index is derived from
	args.regex, and a notice on stderr says what is filtered on or that
	no filter could be detected.
	"""
	if args.filter is None:
		index = get_index_from_regex(args.regex)
		if index.empty():
			print("{bold}4grep: cannot detect filter for '{}' {end} "
					.format(args.regex, bold=Color.BOLD, end=Color.END), file=sys.stderr, end='')
		else:
			print('{bold}4grep filtering on {} {end}'.format(index, bold=Color.BOLD,
				end=Color.END), file=sys.stderr, end='')
		return index

	# NOTE(review): unlike the regex case, an explicit filter prints no
	# "filtering on" notice -- confirm whether that is intended.
	usable = [s for s in args.filter if len(s) >= NGRAM_CHARS]
	if len(usable) != 0:
		return StringIndex([usable])
	print("{bold}4grep: cannot filter on {} (too short) {end} "
			.format(args.filter, bold=Color.BOLD, end=Color.END), file=sys.stderr, end='')
	return empty_index()
602 | 
class TraceLog(object):
	"""Record of one 4grep run, written out by print_to_log."""

	def __init__(self):
		# Who ran the search and when it started.
		self.user = getpass.getuser()
		self.init_time = time.time()
		# Timing and statistics, filled in when the run completes.
		self.end_time = self.elapsed = 0
		self.filtered = self.bitmapped = self.total_files = 0
		# Command-line values, copied in by main().
		self.regex = self.exclude = self.cores = None
		self.filter = self.indexdir = None
		# Absolute path of the index directory actually used.
		self.indexdir_abs = None
618 | 
def print_to_log(tracelog):
	"""Append one tab-separated record for this run to the index log.

	The log lives at ``<indexdir_abs>/.4grep.log``. When the file does not
	exist yet, a header line is written first and the file is made
	readable and writable by everyone afterwards.
	"""
	# Keep .4grep.log hidden or will be packed
	log_path = tracelog.indexdir_abs + "/.4grep.log"
	is_new = not os.path.exists(log_path)
	stamp = time.gmtime(tracelog.init_time)
	with open(log_path, 'a') as log:
		if is_new:
			log.write("t_end\tuser\tt_elapsed\ttotal_files\t%_filtered\t%_bitmapped"
				"\tregex\t--exclude\t--cores\t--filter\t--indexdir\n")
		# Timestamp fields are month/day/year hour:minute:second in UTC.
		record = "{}/{}/{} {}:{}:{}\t{}\t{:.2f}\t{:.0f}\t{:.2f}\t{:.2f}\t{}\t{}\t{}\t{}\t{}\n".format(
			stamp[1], stamp[2], stamp[0], stamp[3], stamp[4], stamp[5],
			tracelog.user, tracelog.elapsed, tracelog.total_files,
			tracelog.filtered, tracelog.bitmapped,
			tracelog.regex, tracelog.exclude, tracelog.cores,
			tracelog.filter, tracelog.indexdir)
		log.write(record)
	if is_new:
		os.chmod(log_path, 0o666)
636 | 
637 | 
def main():
	"""Parse the command line, build the index and run the search.

	Options that argparse does not know are passed through to zgrep.
	Files come from the command line, from a '/'-separated regex when the
	single file argument is not an existing file, or from standard input
	when no file argument is given.
	"""
	tracelog = TraceLog()

	parser = argparse.ArgumentParser("4grep", usage=HELP, add_help=False)
	parser.add_argument('regex', metavar='REGEX', type=str)
	parser.add_argument('files', metavar='FILE', type=str, nargs='*')
	parser.add_argument('--exclude', type=str)
	parser.add_argument('--cores', type=int)
	parser.add_argument('--filter', action='append', type=str)
	parser.add_argument('--indexdir', type=str)
	parser.add_argument('--help', action="help")
	args, options = parser.parse_known_args()

	# Remember the invocation for the trace log.
	for field in ("regex", "exclude", "cores", "filter", "indexdir"):
		setattr(tracelog, field, getattr(args, field))

	filelist = args.files
	# hack to handle mixed flags and filenames, because argparse doesn't
	flags = []
	for opt in options:
		if opt[0] == '-':
			flags.append(opt)
		else:
			filelist.append(opt)
	options = flags

	index = get_index(args)
	if args.indexdir is not None:
		raw_index_dir = args.indexdir
	else:
		raw_index_dir = get_index_directory()
	tracelog.indexdir_abs = os.path.abspath(
		os.path.expanduser(os.path.expandvars(raw_index_dir)))

	# smart default for -h vs. -H, unless explicitly set by caller
	filename_flags = ("-h", "-H", "--with-filename", "--no-filename")
	if not intersect(filename_flags, options):
		single_file = len(filelist) == 1 and os.path.isfile(filelist[0])
		options.append("-h" if single_file else "-H")

	if not filelist:
		# read filelist from stdin instead
		filelist = stdin_iter()
	elif len(filelist) == 1 and not os.path.isfile(filelist[0]):
		filelist = regex_iter(filelist[0], args.exclude)

	smp_loop(options, filelist, index, tracelog)
682 | 
if __name__ == "__main__":
	try:
		main()
	except IOError as e:
		# A closed output pipe (e.g. `4grep ... | head`) is a normal way
		# for the run to end. Any other I/O error is a real failure and
		# must not be hidden.
		if e.errno != errno.EPIPE:
			raise
	except KeyboardInterrupt:
		pass
691 | 
692 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    Copyright 2017 Pure Storage
179 | 
180 |    Licensed under the Apache License, Version 2.0 (the "License");
181 |    you may not use this file except in compliance with the License.
182 |    You may obtain a copy of the License at
183 | 
184 |        http://www.apache.org/licenses/LICENSE-2.0
185 | 
186 |    Unless required by applicable law or agreed to in writing, software
187 |    distributed under the License is distributed on an "AS IS" BASIS,
188 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
189 |    See the License for the specific language governing permissions and
190 |    limitations under the License.
191 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | all:
 2 | 	@$(MAKE) -C bitmap
 3 | 
 4 | install: all
 5 | 	install -D -m 0755 4grep -t $(DESTDIR)/usr/bin
 6 | 	install -D -m 0644 bitmap/4grep.so -t $(DESTDIR)/usr/lib
 7 | 
 8 | test:
 9 | 	@$(MAKE) -C bitmap
10 | 	@./bitmap/exec/test
11 | 	@python ./test.py
12 | 
13 | clean:
14 | 	@$(MAKE) -C bitmap clean
15 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # 4grep
  2 | 
  3 | ![alt tag](https://github.com/purestorage/4grep/blob/master/img/example.gif)
  4 | 
  5 | 4grep is a tool developed by interns [Viveque Ramji](https://github.com/vivequeramji) and [Matthew Pfeiffer](https://github.com/Spferical) during Summer 2017 at Pure Storage to extend the functionality of zgrep. It makes searching log files faster by having a stored 5gram index file and conducting a 'pre-search'. 4grep works by looking at this stored index file (creates one if it doesn’t exist) and skips the grep process altogether if the search string definitely doesn’t exist. It uses multiple processors to run zgrep concurrently.
  6 | 
  7 | It also has a *really* fancy progress bar.
  8 | 
  9 | 4grep also allows you to use the same flags that grep does. The 4grep program passes these on to grep internally if/when a search occurs.
 10 | 
 11 | ## Contents
 12 | * [How the Indexing Works](#how-the-indexing-works)
 13 |   * [More Nuance](#more-nuance)
 14 | * [How to Get it](#how-to-get-it)
 15 | * [Usage](#usage)
 16 |   * [Simple](#simple)
 17 |   * [Advanced Options](#advanced-options)
 18 | * [Progress Bar](#progress-bar)
 19 | * [Limitations](#limitations)
 20 | * [Where is the Index Saved?](#where-is-the-index-saved)
 21 | * [4grep Log](#4grep-log)
 22 | * [Other Tools Used by 4grep](#other-tools-used-by-4grep)
 23 |   * [Zstandard](#zstandard)
 24 |   * [xxHash](#xxhash)
 25 | * [License](#license)
 26 | 
 27 | ![alt tag](https://github.com/purestorage/4grep/blob/master/img/zgrepvs4grep.png)
 28 | 
 29 | ## How the Indexing Works
 30 | 
 31 | A file is indexed whenever it is first encountered. The index is keyed on the file's full, expanded, de-symlinked path, and once a file has been indexed, it will never be re-indexed. The index stores the existence of all 5-grams in a file (sequences of 5 characters).
 32 | 
 33 | When searching, 4grep will first parse 5-grams from the regex parameter. If filter strings are given via `--filter`, 5-grams will be generated from them instead. Then, 4grep filters out files that, based on the index, do not contain all of the 5-grams from the parameters. A "normal" search is performed on the files that pass this 5-gram filtering step.
 34 | 
 35 | ### More Nuance
 36 | 
 37 | For every character in a 5-gram, 4grep will apply a 4-bit mask. This drastically reduces the number of possible 5-grams from 2^40 to 2^20, making the index much smaller. It also means that there are collisions. For example, the 5-grams "AAAAA" and "aaaaa" are considered the same. There is a balance between filtering files out more effectively and filtering files out faster, and 5-grams with 4 bits per character happen to be very effective on our log files.
 38 | 
 39 | 
 40 | ## How to Get It
 41 | 
 42 | *Insert instructions here*
 43 | 
 44 | 
 45 | ## Usage
 46 | 
 47 | ### Simple
 48 | ```bash
 49 | $ 4grep <regex> <filelist>
 50 | $ find <args> | 4grep <regex>
 51 | ```
 52 | 
 53 | #### Example
 54 | 
 55 | ```bash
 56 | # searches for files that contain 'STACK'
 57 | $ find ~/Desktop/logs/* | ./4grep STACK
 58 | 4grep filtering on 'STACK'
 59 | ```
 60 | 
 61 | ### Advanced Options
 62 | **--cores**
 63 | ```bash
 64 | $ 4grep <regex> <filelist> --cores N
 65 | ```
 66 | `--cores` limits the number of cores that 4grep uses. If it is not specified, or is too large, the program will use one fewer than the number of available cores.
 67 | 
 68 | **--indexdir**
 69 | ```bash
 70 | $ 4grep <regex> <filelist> --indexdir=<location>
 71 | ```
 72 | This option specifies where 4grep stores its index. See [Where is the Index Saved?](#where-is-the-index-saved) for the default index locations.
 73 | 
 74 | **--filter**
 75 | 
 76 | 4grep tries to parse string literals from the provided regex. In the pre-filtering step, it uses its index files to filter out files that don't contain all of these string literals. For example, the regex "Overslept by [0-9]{3}" can only match in files that contain the string literal "Overslept by ". So, 4grep will detect "Overslept by" as a filter string and filter out files that don't contain it in the pre-filtering step.
 77 | ```bash
 78 | $ 4grep STACKTRACE logs/core.log
 79 | 4grep filtering on 'STACKTRACE'
 80 | ```
 81 | ```bash
 82 | $ 4grep "STACK.*TRACE" logs/core.log
 83 | 4grep filtering on ['STACK', 'TRACE']
 84 | ```
 85 | ```bash
 86 | $ 4grep "(STACK|TRACE)" logs/core.log
 87 | 4grep: cannot detect filter for '(STACK|TRACE)'
 88 | ```
 89 | This auto-detection works for regexes that have their literals at their start and/or end. However, 4grep only does really basic regex parsing, and in some cases it may help to manually specify string literals to index with. We will call these string literals "filter strings." This can be done with the `--filter` option:
 90 | ```bash
 91 | $ 4grep --filter <filter_string> <regex> <filelist>
 92 | ```
 93 | You can specify the filter string and regex separately. The filter string will be used to filter out any files that have no instances of the string anywhere in the file. The regex is then passed on to grep for this subset of files, which will give you the lines that match the regex.
 94 | ```bash
 95 | $ 4grep --filter <filter_string_1> --filter <filter_string_2> <regex> <filelist>
 96 | ```
 97 | 
 98 | * **You can also specify multiple filter strings, which can be used to create a smaller subset of filtered files**
 99 | * **Filter strings must be at least 5 characters**
100 | 
101 | 
102 | * **The filter string must be a literal string, not a regex**
103 | 
104 | 
105 | #### Advanced Examples
106 | ```bash
107 | # Filters files that contain 'WARNING' anywhere then prints out lines that contain a number
108 | $ find ~/Desktop/logs/* | ./4grep --filter WARNING [0-9]
109 | 4grep filtering on 'WARNING'
110 | ```
111 | ```bash
112 | # Filters files that contain 'WARNING' and 'STACK' anywhere then prints out lines that contain a number
113 | $ find ~/Desktop/logs/* | ./4grep --filter WARNING --filter STACK [0-9]
114 | 4grep filtering on ['WARNING','STACK']
115 | ```
116 | ```bash
117 | # searches for files that contain 'STACK' with at most 10 cores
118 | $ find ~/Desktop/logs/* | ./4grep STACK --cores 10
119 | 4grep filtering on 'STACK'
120 | ```
121 | ```bash
122 | # searches for files that contain 'STACK' whilst storing index files in ~/.4grep
123 | $ find ~/Desktop/logs/* | ./4grep STACK --indexdir=~/.4grep
124 | 4grep filtering on 'STACK'
125 | 
126 | ```
127 | 
128 | 
129 | 
130 | ## Progress Bar
131 | 
132 | 4grep also includes a progress bar, a feature that has become super useful. Here is an example of the progress that is printed to stderr as you run 4grep:
133 | ```bash
134 | >Done: 15.8% of 54752 Elapsed:0m02.4s Bitmapped:100.0% Filtered: 99.9% ETA:0m12.5s
135 | ```
136 | | Output        | Meaning       |
137 | | ------------- | ------------- |
138 | | Done          | Indicates the percentage of files that have been searched from the number of files found.|
139 | | File Count    | The number of files found has two colours. Green if all of the files are found, and red if files are still being piped into stdin.|
140 | | Elapsed       | The time since the program began.|
141 | | Bitmapped     | Indicates the proportion of files that had already been indexed. If this is the first time 4grep has seen any of the files (and so no index files exist) this will be 0%. The higher this is, the fewer files 4grep will have to index, and the faster 4grep will be.|
142 | | Filtered      | Indicates the percentage of files that have skipped the grepping process because of the filter. The higher this number, the faster 4grep should be than tgrep since fewer files will be searched.|
143 | | ETA           | Gives an estimate on how long the program will take to finish. This is calculated from the files already searched and so is only an estimate.|
144 | 
145 | 
146 | ## Limitations
147 | 4grep does not handle file modification. When it filters files out of the search with its filter string, 4grep will consider the state of the file as it was when it was first indexed. If a file is modified to contain a string that is then used as a filter string, 4grep may wrongly filter the file out of the search and not report matches within the file. There is not currently any way to re-index a file or directory.
148 | 
149 | 4grep can be bottlenecked by the speed of the filesystem that the index file is stored on, say, NFS.
150 | The smaller the files being searched over, the more significant 4grep's overhead becomes, and the less of a performance improvement it will give.
151 | 
152 | As described above, 4grep does not parse regular expressions. It only autodetects a filter string for the easy case where string literals are on the left and/or right of the regex.
153 | 
154 | Strings less than 5 characters cannot be indexed on. Longer strings are best for filtering. The longer the filter string(s), the higher percentage of files should be filtered, and the faster 4grep will go.
155 | 4grep wants to go as fast as possible. One process per core going through files as fast as it can may bring some machines to their knees. We've had one report of 4grep freezing up a machine searching through a checkout of purity with 40 cores.
156 |  
157 |  
158 | ## Where is the Index Saved?
159 | For minimal overhead, we want a global index file shared by everyone. Ideally, log files can be automatically indexed as they are added to fuse, but we do not do this (yet). For now, 4grep has a ranking of directories it would like to store the index in. This ranking goes:
160 | 1. `/4gram` (to be used when we get a proper distribution method)
161 | 1. `~/.cache/4gram` (the fallback when running in most other places)
162 | 
163 | The index is designed to be persistent, multi-process, multi-user, and multi-machine-on-NFS safe. That said, you should be able to just `rm -r` it if the index is stored in your home directory and 4grep is not running.
164 | The index location may be overridden with the `--indexdir` option.
165 |  
166 |  
167 | ## 4grep Log
168 | When 4grep finishes its search, certain statistics will be recorded in 4grep's own log file. This will help to spot patterns between searches and hopefully optimize for the 90% case in the future. This logfile is stored in /path/to/index/.4grep.log.
169 |  
170 |  
171 | ## Other Tools Used by 4grep
172 | 
173 | ### Zstandard
174 | When storing the index files, Zstandard was chosen as the compression algorithm. Zstandard outperformed gzip significantly for compression ratios and decompression speeds on our index files. We also kept the compression level down at 8 (current max = 22) since we found that for our data, which is small data with mostly 0's, this performed best. More info at: [https://github.com/facebook/zstd](https://github.com/facebook/zstd).
175 | 
176 | ### xxHash
177 | To store the index file, we decided to hash its original name into something more uniform. xxHash, developed by the same author as Zstd (Yann Collet), seemed to be the fastest and easiest to use for our program. More info at: [https://github.com/Cyan4973/xxHash](https://github.com/Cyan4973/xxHash)
178 | 
179 | 
180 | ## License
181 | 
182 | The full license for the project can be found [here](https://github.com/purestorage/4grep/blob/master/LICENSE).
183 | 
184 | This project is licensed under the terms of the Apache-2.0 license.
185 | 


--------------------------------------------------------------------------------
/bitmap/.gitignore:
--------------------------------------------------------------------------------
1 | packfile
2 | test
3 | *.o
4 | *.so
5 | 


--------------------------------------------------------------------------------
/bitmap/Makefile:
--------------------------------------------------------------------------------
 1 | CC=gcc
 2 | CFLAGS=-Wall -std=gnu11 -O3 -fPIC
 3 | LIBS=-lz ./lib/zstd/lib/libzstd.a -llockfile -lpthread
 4 | INCLUDES = -I./src -I./lib -I./lib/xxhash -I./lib/zstd/lib
 5 | HEADERS := $(shell find ./src -name "*.h")
 6 | 
 7 | SRCS_FILES := $(shell find ./src -name "*.c")
 8 | SRCS_FILES += ./lib/xxhash/xxhash.c
 9 | EXEDIR=exec
10 | MAINDIR=main
11 | UNAME_S := $(shell uname -s)
12 | LIBFLAGS=
13 | ifeq ($(UNAME_S),Linux)
14 | 	LIBFLAGS += -Wl,-soname,4grep.so
15 | endif
16 | 
17 | ZSTD_STATIC=./lib/zstd/lib/libzstd.a
18 | 
19 | all: $(EXEDIR)/test $(EXEDIR)/generate_bitmap 4grep.so
20 | 
21 | SRCS_OBJECTS := $(patsubst %.c, %.o, $(SRCS_FILES))
22 | 
23 | %.o: %.c $(HEADERS)
24 | 	@$(CC) $(CFLAGS) $(INCLUDES) -c $< -o $@
25 | 
26 | $(ZSTD_STATIC):
27 | 	@$(MAKE) -s -C ./lib/zstd/lib CFLAGS="-fPIC -O3" libzstd.a
28 | 
29 | 4grep.so: $(SRCS_OBJECTS) $(ZSTD_STATIC)
30 | 	@$(CC) $(CFLAGS) $(LIBFLAGS) -o 4grep.so $(SRCS_OBJECTS) -shared $(LIBS)
31 | 
32 | $(EXEDIR)/generate_bitmap: $(MAINDIR)/generate_bitmap.o $(SRCS_OBJECTS) $(ZSTD_STATIC)
33 | 	@$(CC) $(CFLAGS) $(SRCS_OBJECTS) $(MAINDIR)/generate_bitmap.o -o \
34 | 		$(EXEDIR)/generate_bitmap $(LIBS)
35 | 
36 | $(EXEDIR)/test: $(MAINDIR)/test.o $(SRCS_OBJECTS) $(ZSTD_STATIC)
37 | 	@$(CC) $(CFLAGS) $(SRCS_OBJECTS) $(MAINDIR)/test.o -o $(EXEDIR)/test $(LIBS)
38 | 
39 | clean:
40 | 	@$(RM) $(EXEDIR)/generate_bitmap $(EXEDIR)/4gram_filter $(EXEDIR)/test */*.o 4grep.so $(ZSTD_STATIC) ./lib/xxhash/*.o
41 | 	@$(MAKE) -C ./lib/zstd clean
42 | 
43 | .PHONY: all clean
44 | 


--------------------------------------------------------------------------------
/bitmap/exec/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
3 | 


--------------------------------------------------------------------------------
/bitmap/lib/minunit.h:
--------------------------------------------------------------------------------
/** MinUnit
 * As taken from http://www.jera.com/techinfo/jtns/jtn002.html
 * License: You may use the code in this tech note for any purpose, with
 * the understanding that it comes with NO WARRANTY. */
/* mu_assert: when `test` evaluates false, return `message` from the
 * enclosing function; otherwise fall through. */
#define mu_assert(message, test) do { if (!(test)) return message; } while (0)
/* mu_run_test: call `test()`, increment tests_run, and return the test's
 * message from the enclosing function if that message is non-null. */
#define mu_run_test(test) do { char *message = test(); tests_run++; \
                                  if (message) return message; } while (0)
/* Counter incremented by mu_run_test; the including program must define it. */
extern int tests_run;
9 | 


--------------------------------------------------------------------------------
/bitmap/lib/portable_endian.h:
--------------------------------------------------------------------------------
// From https://gist.github.com/panzi/6856583
// "License": Public Domain
// I, Mathias Panzenböck, place this file hereby into the public domain. Use it at your own risk for whatever you like.
// In case there are jurisdictions that don't support putting things in the public domain you can also consider it to
// be "dual licensed" under the BSD, MIT and Apache licenses, if you want to. This code is trivial anyway. Consider it
// an example on how to get the endian conversion functions on different platforms.
//
// Each platform branch below either includes the system header that supplies
// the htobe*/htole*/be*toh/le*toh macros or defines them itself; platforms
// matching no branch stop the build with an #error.

#ifndef PORTABLE_ENDIAN_H__
#define PORTABLE_ENDIAN_H__

// Collapse the various Windows detection macros into a single __WINDOWS__.
#if (defined(_WIN16) || defined(_WIN32) || defined(_WIN64)) && !defined(__WINDOWS__)

#	define __WINDOWS__

#endif

// Linux / Cygwin: the system header is included as-is.
#if defined(__linux__) || defined(__CYGWIN__)

#	include <endian.h>

// macOS: map each conversion onto the OSSwap* macros from libkern.
#elif defined(__APPLE__)

#	include <libkern/OSByteOrder.h>

#	define htobe16(x) OSSwapHostToBigInt16(x)
#	define htole16(x) OSSwapHostToLittleInt16(x)
#	define be16toh(x) OSSwapBigToHostInt16(x)
#	define le16toh(x) OSSwapLittleToHostInt16(x)
 
#	define htobe32(x) OSSwapHostToBigInt32(x)
#	define htole32(x) OSSwapHostToLittleInt32(x)
#	define be32toh(x) OSSwapBigToHostInt32(x)
#	define le32toh(x) OSSwapLittleToHostInt32(x)
 
#	define htobe64(x) OSSwapHostToBigInt64(x)
#	define htole64(x) OSSwapHostToLittleInt64(x)
#	define be64toh(x) OSSwapBigToHostInt64(x)
#	define le64toh(x) OSSwapLittleToHostInt64(x)

#	define __BYTE_ORDER    BYTE_ORDER
#	define __BIG_ENDIAN    BIG_ENDIAN
#	define __LITTLE_ENDIAN LITTLE_ENDIAN
#	define __PDP_ENDIAN    PDP_ENDIAN

#elif defined(__OpenBSD__)

#	include <sys/endian.h>

// NOTE(review): the betoh16-style names used below are believed to be the
// historical OpenBSD spelling, while FreeBSD/NetBSD are believed to provide
// be16toh() and friends directly in <sys/endian.h> -- confirm these remaps
// are correct for this branch before relying on it.
#elif defined(__NetBSD__) || defined(__FreeBSD__) || defined(__DragonFly__)

#	include <sys/endian.h>

#	define be16toh(x) betoh16(x)
#	define le16toh(x) letoh16(x)

#	define be32toh(x) betoh32(x)
#	define le32toh(x) letoh32(x)

#	define be64toh(x) betoh64(x)
#	define le64toh(x) letoh64(x)

// Windows: build the macros from the winsock byte-order helpers (little-endian
// hosts) or from compiler byte-swap builtins (big-endian hosts).
#elif defined(__WINDOWS__)

#	include <winsock2.h>
#	include <sys/param.h>

#	if BYTE_ORDER == LITTLE_ENDIAN

#		define htobe16(x) htons(x)
#		define htole16(x) (x)
#		define be16toh(x) ntohs(x)
#		define le16toh(x) (x)
 
#		define htobe32(x) htonl(x)
#		define htole32(x) (x)
#		define be32toh(x) ntohl(x)
#		define le32toh(x) (x)
 
#		define htobe64(x) htonll(x)
#		define htole64(x) (x)
#		define be64toh(x) ntohll(x)
#		define le64toh(x) (x)

#	elif BYTE_ORDER == BIG_ENDIAN

		/* that would be xbox 360 */
#		define htobe16(x) (x)
#		define htole16(x) __builtin_bswap16(x)
#		define be16toh(x) (x)
#		define le16toh(x) __builtin_bswap16(x)
 
#		define htobe32(x) (x)
#		define htole32(x) __builtin_bswap32(x)
#		define be32toh(x) (x)
#		define le32toh(x) __builtin_bswap32(x)
 
#		define htobe64(x) (x)
#		define htole64(x) __builtin_bswap64(x)
#		define be64toh(x) (x)
#		define le64toh(x) __builtin_bswap64(x)

#	else

#		error byte order not supported

#	endif

#	define __BYTE_ORDER    BYTE_ORDER
#	define __BIG_ENDIAN    BIG_ENDIAN
#	define __LITTLE_ENDIAN LITTLE_ENDIAN
#	define __PDP_ENDIAN    PDP_ENDIAN

#else

#	error platform not supported

#endif

#endif
120 | 


--------------------------------------------------------------------------------
/bitmap/main/generate_bitmap.c:
--------------------------------------------------------------------------------
 1 | #include <unistd.h>
 2 | 
 3 | #include "../src/bitmap.h"
 4 | #include "../src/util.h"
 5 | 
 6 | int main(int argc, char **argv){
 7 |   FILE *f;
 8 | 
 9 |   if (argc == 2) {
10 |     f = fopen(argv[1], "r");
11 |   } else if (argc < 2 && !isatty(fileno(stdin))) {
12 |     f = stdin;
13 |   } else {
14 |     printf("Usage: \n"
15 |            " %s <logfile>\n"
16 | 	   " echo <string> | %s\n", argv[0], argv[0]);
17 |     return 1;
18 |   }
19 | 
20 |   if(f == NULL) {
21 |       perror("Error: File not opened");
22 |       return(-1);
23 |   }
24 | 
25 |   uint8_t *bitmap = init_bitmap();
26 |   int ret = apply_file_to_bitmap(bitmap, f);
27 |   if (ret == GZ_TRUNCATED) {
28 |     fprintf(stderr, "gzip stream truncated\n");
29 |     return GZ_TRUNCATED;
30 |   }
31 | 
32 |   write_bitmap(bitmap, stdout);
33 | 
34 |   return 0;
35 | }
36 | 


--------------------------------------------------------------------------------
/bitmap/main/test.c:
--------------------------------------------------------------------------------
  1 | #include <stdlib.h>
  2 | #include <unistd.h>
  3 | #include <string.h>
  4 | #include <limits.h>
  5 | #include <sys/file.h>
  6 | #include <sys/wait.h>
  7 | #include <zstd.h>
  8 | #include <dirent.h>
  9 | #include <lockfile.h>
 10 | 
 11 | #include "../lib/minunit.h"
 12 | #include "../src/filter.h"
 13 | #include "../src/bitmap.h"
 14 | #include "../src/util.h"
 15 | #include "../src/packfile.h"
 16 | #include "portable_endian.h"
 17 | 
 18 | /*--------------------------------------------------------------------*/
 19 | 
 20 | int tests_run = 0;
 21 | 
 22 | /*--------------------------------------------------------------------*/
 23 | 
 24 | /*
 25 |  * Returns a file descriptor pointing towards a pipe containing solely the
 26 |  * passed-in string.
 27 |  *
 28 |  * Works by forking the current process and writing the string to a pipe in the
 29 |  * child process.
 30 |  */
 31 | FILE *get_pipe(char *string) {
 32 |   int p[2];
 33 |   if (pipe(p) != 0) {
 34 |     perror("Error: pipe failed");
 35 |     exit(-1);
 36 |   }
 37 |   if (!fork()) {
 38 |     int len = strlen(string);
 39 |     for (int written = 0; len > written;) {
 40 |       int result = write(p[1], string + written, len - written);
 41 |       if (result < 0) {
 42 |         perror("Error converting string to pipe\n");
 43 |         exit(1);
 44 |       }
 45 |       written += result;
 46 |     }
 47 |     exit(0);
 48 |   }
 49 | 
 50 |   close(p[1]);
 51 |   FILE *f = fdopen(p[0], "r");
 52 |   return f;
 53 | }
 54 | 
 55 | /*--------------------------------------------------------------------*/
 56 | /**
 57 |  * Sets the bits in the bitmap for all the 4grams stored in the string.
 58 |  */
 59 | void apply_string_to_bitmap(uint8_t *bitmap, char *string) {
 60 |   FILE *file = get_pipe(string);
 61 |   apply_file_to_bitmap(bitmap, file);
 62 |   fclose(file);
 63 | }
 64 | 
 65 | int bitmaps_are_the_same(uint8_t *bitmap1, uint8_t *bitmap2) {
 66 |   for (int i = 0; i < SIZEOF_BITMAP; i++) {
 67 |     if (bitmap1[i] != bitmap2[i]) {
 68 |       return 0;
 69 |     }
 70 |   }
 71 |   return 1;
 72 | }
 73 | 
 74 | /*--------------------------------------------------------------------*/
 75 | 
 76 | static char *test_init_bitmap() {
 77 |   uint8_t *bitmap = init_bitmap();
 78 |   for (size_t i = 0; i < SIZEOF_BITMAP; i++) {
 79 |     mu_assert("Initialized bitmap has nonzero byte", bitmap[i] == 0);
 80 |   }
 81 |   free(bitmap);
 82 |   return 0;
 83 | }
 84 | 
 85 | /*--------------------------------------------------------------------*/
 86 | 
 87 | static char *test_set_bit() {
 88 |   uint8_t *bitmap = init_bitmap();
 89 |   set_bit(bitmap, 0);
 90 |   mu_assert("Bitmap bit set failed", bitmap[0] == 1);
 91 |   set_bit(bitmap, 1);
 92 |   mu_assert("Bitmap bit set failed", bitmap[0] == 0b11);
 93 |   set_bit(bitmap, 15);
 94 |   mu_assert("Bitmap bit set failed", bitmap[1] == 0b10000000);
 95 |   set_bit(bitmap, 8 * SIZEOF_BITMAP - 1);
 96 |   mu_assert("Bitmap bit set failed", bitmap[SIZEOF_BITMAP - 1] == 0b10000000);
 97 |   free(bitmap);
 98 |   return 0;
 99 | }
100 | 
101 | static char *test_string_to_bitmap_empty() {
102 |   uint8_t *bitmap = init_bitmap();
103 |   apply_string_to_bitmap(bitmap, "");
104 |   for (size_t i = 0; i < SIZEOF_BITMAP; i++) {
105 |     mu_assert("Extra bits added in string to bitmap", bitmap[i] == 0);
106 |   }
107 |   free(bitmap);
108 |   return 0;
109 | }
110 | 
111 | static char *test_string_to_bitmap_tiny() {
112 |   uint8_t *bitmap = init_bitmap();
113 |   apply_string_to_bitmap(bitmap, "as");
114 |   for (size_t i = 0; i < SIZEOF_BITMAP; i++) {
115 |     mu_assert("Extra bits added in string to bitmap", bitmap[i] == 0);
116 |   }
117 |   free(bitmap);
118 |   return 0;
119 | }
120 | 
121 | static char *test_string_to_bitmap_nchars() {
122 |   uint8_t *bitmap = init_bitmap();
123 |   char str[NGRAM_CHARS + 1];
124 |   int n = 0;
125 |   for (int i = 0; i < NGRAM_CHARS; i++) {
126 |     str[i] = 'a';
127 |     n = (n << NGRAM_CHAR_BITS) + ('a' & CHAR_MASK);
128 |   }
129 |   str[NGRAM_CHARS] = '\0';
130 |   apply_string_to_bitmap(bitmap, str);
131 |   mu_assert("test_string_to_bitmap_nchars: bit unset", bitmap[n / 8] == 1 << (n % 8));
132 |   for (size_t i = 0; i < SIZEOF_BITMAP; i++) {
133 |     if (i != n / 8) {
134 |       mu_assert("test_string_to_bitmap_nchars: extra bit set", bitmap[i] == 0);
135 |     }
136 |   }
137 |   free(bitmap);
138 |   return 0;
139 | }
140 | 
141 | static char *test_string_to_bitmap_long() {
142 |   uint8_t *bitmap = init_bitmap();
143 |   apply_string_to_bitmap(bitmap, "aaaaaaaaaaaaaaaaaaaz");
144 | 
145 |   int n = 0;
146 |   for (int i = 0; i < NGRAM_CHARS; i++) {
147 |     n = (n << NGRAM_CHAR_BITS) + ('a' & CHAR_MASK);
148 |   }
149 |   int m = 0;
150 |   for (int i = 0; i < NGRAM_CHARS - 1; i++) {
151 |     m = (m << NGRAM_CHAR_BITS) + ('a' & CHAR_MASK);
152 |   }
153 |   m = (m << NGRAM_CHAR_BITS) + ('z' & CHAR_MASK);
154 | 
155 |   mu_assert("test_string_to_bitmap_long: n unset", bitmap[n / 8] == 1 << (n % 8));
156 |   mu_assert("test_string_to_bitmap_long: m unset", bitmap[m / 8] == 1 << (m % 8));
157 |   for (size_t i = 0; i < SIZEOF_BITMAP; i++) {
158 |     if (i != n / 8 && i != m / 8) {
159 |       mu_assert("test_string_to_bitmap_long: extra bit set", bitmap[i] == 0);
160 |     }
161 |   }
162 |   free(bitmap);
163 |   return 0;
164 | }
165 | 
166 | static char *test_string_to_bitmap() {
167 |   mu_run_test(test_string_to_bitmap_empty);
168 |   mu_run_test(test_string_to_bitmap_tiny);
169 |   mu_run_test(test_string_to_bitmap_nchars);
170 |   mu_run_test(test_string_to_bitmap_long);
171 |   return 0;
172 | }
173 | 
174 | static char *test_compress_bitmap() {
175 |   // create a simple bitmap
176 |   uint8_t *bitmap = init_bitmap();
177 |   set_bit(bitmap, 0);
178 |   set_bit(bitmap, 8);
179 | 
180 |   // pick a file path to pretend we're compressing
181 |   char *fake_tmpfile_path = "/tmp/asdf";
182 |   int64_t fake_mtime = 0;
183 | 
184 |   char bitmap_tmpfile_path[PATH_MAX] = "/tmp/4gramtmpfile.XXXXXX";
185 |   FILE *bitmap_file = fdopen(mkstemp(bitmap_tmpfile_path), "w");
186 |   // compress to a file in the bitmap store
187 |   mu_assert("Error writing to tmpfile", bitmap_file != NULL);
188 |   compress_to_fp(bitmap, bitmap_file, fake_tmpfile_path, fake_mtime);
189 |   fclose(bitmap_file);
190 | 
191 |   // try decompressing it
192 |   uint8_t *decompressed = init_bitmap();
193 |   int decompress_ret = decompress_file(decompressed, bitmap_tmpfile_path);
194 |   mu_assert("Decompress error", decompress_ret == 0);
195 | 
196 |   // check that it decompressed correctly
197 |   for (int i = 0; i < SIZEOF_BITMAP; i++) {
198 |     // printf("%d %d\n", bitmap[i], decompressed[i]);
199 |     mu_assert("Decompressed bitmap not the same", bitmap[i] == decompressed[i]);
200 |   }
201 | 
202 |   free(bitmap);
203 |   free(decompressed);
204 |   return 0;
205 | }
206 | 
207 | static char *test_compress_to_file_no_collision() {
208 |   uint8_t *bitmap = init_bitmap();
209 |   char *file_path = "/tmp/nonexistent";
210 |   char template[] = "/tmp/4gramtmpdir.XXXXXX";
211 |   char *store = mkdtemp(template);
212 |   mu_assert("Could not create tmpdir", store != NULL);
213 | 
214 |   int ret = compress_to_file(bitmap, file_path, 0, store);
215 |   mu_assert("Compress to file failed", ret == 0);
216 | 
217 |   char hashed_filename[21];
218 |   get_hash(file_path, strlen(file_path), hashed_filename);
219 |   strcat(hashed_filename, "_000");
220 |   char *path_to_bitmap_file = add_path_parts(store, hashed_filename);
221 | 
222 |   mu_assert("Compressed bitmap file doesn't exist",
223 |             access(path_to_bitmap_file, F_OK) == 0);
224 |   uint8_t *decompressed = init_bitmap();
225 |   ret = decompress_file(decompressed, path_to_bitmap_file);
226 |   mu_assert("Error occurred in decompression", ret == 0);
227 |   for (int i = 0; i < SIZEOF_BITMAP; i++) {
228 |     mu_assert("Decompressed bitmap not the same", bitmap[i] == decompressed[i]);
229 |   }
230 |   free(bitmap);
231 |   free(decompressed);
232 |   free(path_to_bitmap_file);
233 |   return 0;
234 | }
235 | 
236 | static char *test_compress_to_file_with_collision() {
237 |   uint8_t *bitmap = init_bitmap();
238 |   char *file_path = "/tmp/nonexistent";
239 |   char template[] = "/tmp/4gramtmpdir.XXXXXX";
240 |   char *store = mkdtemp(template);
241 |   mu_assert("Could not create tmpdir", store != NULL);
242 |   int num_files = 3;
243 | 
244 |   for (int i = 0; i < num_files; i++) {
245 |     int ret = compress_to_file(bitmap, file_path, 0, store);
246 |     mu_assert("Compress to file failed", ret == 0);
247 | 
248 |     char cache_file_name[21];
249 |     char num_extension[5];
250 |     sprintf(num_extension, "_%.3d", i);
251 |     get_hash(file_path, strlen(file_path), cache_file_name);
252 |     strcat(cache_file_name, num_extension);
253 |     char *path_to_bitmap_file = add_path_parts(store, cache_file_name);
254 | 
255 |     mu_assert("Compressed bitmap file doesn't exist",
256 |               access(path_to_bitmap_file, F_OK) == 0);
257 |     uint8_t *decompressed = init_bitmap();
258 |     ret = decompress_file(decompressed, path_to_bitmap_file);
259 |     mu_assert("Error occurred in decompression", ret == 0);
260 |     for (int i = 0; i < SIZEOF_BITMAP; i++) {
261 |       mu_assert("Decompressed bitmap not the same", bitmap[i] == decompressed[i]);
262 |     }
263 |     free(decompressed);
264 |     free(path_to_bitmap_file);
265 |   }
266 |   free(bitmap);
267 |   return 0;
268 | }
269 | 
270 | static char *test_compress_to_file() {
271 |   mu_run_test(test_compress_to_file_no_collision);
272 |   mu_run_test(test_compress_to_file_with_collision);
273 |   return 0;
274 | }
275 | 
276 | static char *test_file_packing_single_file() {
277 |   char template[] = "/tmp/4gramtmpdir.XXXXXX";
278 |   char template2[] = "/tmp/4gramtmpdir.XXXXXX";
279 |   char *store = mkdtemp(template);
280 |   char *tmpfile_dir = mkdtemp(template2);
281 |   mu_assert("Could not create tmpdir", store != NULL);
282 |   char *tmpfile_path = add_path_parts(tmpfile_dir, "1.txt");
283 | 
284 |   // write a small temporary file
285 |   FILE *tmpfile = fopen(tmpfile_path, "w");
286 |   mu_assert("Could not create tmpfile", tmpfile != NULL);
287 |   fputs("asdf", tmpfile);
288 |   fclose(tmpfile);
289 | 
290 |   // create a bitmap for the file
291 |   uint8_t *bitmap = init_bitmap();
292 |   tmpfile = fopen(tmpfile_path, "r");
293 |   apply_file_to_bitmap(bitmap, tmpfile);
294 |   fclose(tmpfile);
295 |   int64_t mtime = get_mtime(tmpfile_path);
296 | 
297 |   // figure out the hash of the file
298 |   char loose_file_name[PATH_MAX];
299 |   get_hash(tmpfile_path, strlen(tmpfile_path), loose_file_name);
300 | 
301 |   // compress the bitmap to a file in the store
302 |   int ret = compress_to_file(bitmap, tmpfile_path, mtime, store);
303 |   mu_assert("Error compressing", ret == 0);
304 | 
305 |   // make sure it exists
306 |   mu_assert("Compressed bitmap file doesn't exist",
307 |       access(loose_file_name, F_OK));
308 | 
309 |   pack_loose_files_in_subdir(store);
310 | 
311 |   uint8_t *read_bitmap = read_from_packfile(tmpfile_path, mtime, store);
312 |   mu_assert("Could not find bitmap in packfile", read_bitmap != NULL);
313 |   for (size_t i = 0; i < SIZEOF_BITMAP; i++) {
314 |     mu_assert("Wrong bitmap returned", (bitmap[i] == read_bitmap[i]));
315 |   }
316 | 
317 |   // make sure loose file was removed
318 |   DIR *dir = opendir(store);
319 |   mu_assert("Error opening bitmap store directory", dir != NULL);
320 |   struct dirent *entry;
321 |   while ((entry = readdir(dir))) {
322 |     if (strcmp(entry->d_name, PACKFILE_NAME) == 0
323 |         || strcmp(entry->d_name, PACKFILE_INDEX_NAME) == 0
324 |         || strcmp(entry->d_name, ".") == 0
325 |         || strcmp(entry->d_name, "..") == 0) {
326 |       continue;
327 |     }
328 |     mu_assert("Loose file still in bitmap store directory", 0);
329 |   }
330 |   closedir(dir);
331 | 
332 |   free(tmpfile_path);
333 |   free(bitmap);
334 |   free(read_bitmap);
335 |   return 0;
336 | }
337 | 
338 | static char *test_file_packing_multiple_files() {
339 |   char template[] = "/tmp/4gramtmpdir.XXXXXX";
340 |   char template2[] = "/tmp/4gramtmpdir.XXXXXX";
341 |   char *store = mkdtemp(template);
342 |   char *tmpfile_dir = mkdtemp(template2);
343 |   mu_assert("Could not create tmpdir", store != NULL);
344 |   mu_assert("Could not create tmpdir", tmpfile_dir != NULL);
345 |   int num_files = 10;
346 |   uint8_t *bitmaps[num_files];
347 |   char *tmpfile_paths[num_files];
348 |   for (int i = 0; i < num_files; i++) {
349 |     char name[PATH_MAX];
350 |     sprintf(name, "%d.txt", i);
351 |     tmpfile_paths[i] = add_path_parts(tmpfile_dir, name);
352 |     FILE *tmpfile = fopen(tmpfile_paths[i], "w");
353 |     mu_assert("Could not create tmpfile", tmpfile != NULL);
354 |     fprintf(tmpfile, "%d", i * 1000);
355 |     fclose(tmpfile);
356 |     tmpfile = fopen(tmpfile_paths[i], "r");
357 |     bitmaps[i] = init_bitmap();
358 |     apply_file_to_bitmap(bitmaps[i], tmpfile);
359 |     fclose(tmpfile);
360 |     char loose_file_name[PATH_MAX];
361 |     get_hash(tmpfile_paths[i], strlen(tmpfile_paths[i]), loose_file_name);
362 |     int64_t mtime = get_mtime(tmpfile_paths[i]);
363 |     int ret = compress_to_file(bitmaps[i], tmpfile_paths[i], mtime, store);
364 |     mu_assert("Error compressing", ret == 0);
365 |   }
366 |   pack_loose_files_in_subdir(store);
367 |   for (int i = 0; i < num_files; i++) {
368 |     int64_t mtime = get_mtime(tmpfile_paths[i]);
369 |     uint8_t *read_bitmap = read_from_packfile(tmpfile_paths[i], mtime, store);
370 |     mu_assert("Could not find bitmap in packfile", read_bitmap != NULL);
371 |     for (size_t j = 0; j < SIZEOF_BITMAP; j++) {
372 |       mu_assert("Wrong bitmap returned", bitmaps[i][j] == read_bitmap[j]);
373 |     }
374 |     free(read_bitmap);
375 |     free(bitmaps[i]);
376 |     free(tmpfile_paths[i]);
377 |   }
378 | 
379 |   // make sure loose file was removed
380 |   DIR *dir = opendir(store);
381 |   mu_assert("Error opening bitmap store directory", dir != NULL);
382 |   struct dirent *entry;
383 |   while ((entry = readdir(dir))) {
384 |     if (strcmp(entry->d_name, PACKFILE_NAME) == 0
385 |         || strcmp(entry->d_name, PACKFILE_INDEX_NAME) == 0
386 |         || strcmp(entry->d_name, ".") == 0
387 |         || strcmp(entry->d_name, "..") == 0) {
388 |       continue;
389 |     }
390 |     mu_assert("Loose file still in bitmap store directory", 0);
391 |   }
392 |   closedir(dir);
393 | 
394 |   return 0;
395 | }
396 | 
397 | static char *test_file_packing_existing_packfile() {
398 |   char template[] = "/tmp/4gramtmpdir.XXXXXX";
399 |   char template2[] = "/tmp/4gramtmpdir.XXXXXX";
400 |   char *store = mkdtemp(template);
401 |   char *tmpfile_dir = mkdtemp(template2);
402 |   mu_assert("Could not create tmpdir", store != NULL);
403 |   mu_assert("Could not create tmpdir", tmpfile_dir != NULL);
404 |   int num_files = 20;
405 |   uint8_t *bitmaps[num_files];
406 |   char *tmpfile_paths[num_files];
407 |   for (int i = 0; i < num_files; i++) {
408 |     char name[PATH_MAX];
409 |     sprintf(name, "%d.txt", i);
410 |     tmpfile_paths[i] = add_path_parts(tmpfile_dir, name);
411 |     FILE *tmpfile = fopen(tmpfile_paths[i], "w");
412 |     mu_assert("Could not create tmpfile", tmpfile != NULL);
413 |     fprintf(tmpfile, "%d", i * 1000);
414 |     fclose(tmpfile);
415 |     tmpfile = fopen(tmpfile_paths[i], "r");
416 |     bitmaps[i] = init_bitmap();
417 |     apply_file_to_bitmap(bitmaps[i], tmpfile);
418 |     fclose(tmpfile);
419 |     char loose_file_name[PATH_MAX];
420 |     get_hash(tmpfile_paths[i], strlen(tmpfile_paths[i]), loose_file_name);
421 |     int64_t mtime = get_mtime(tmpfile_paths[i]);
422 |     int ret = compress_to_file(bitmaps[i], tmpfile_paths[i], mtime, store);
423 |     mu_assert("Error compressing", ret == 0);
424 |     if (i == num_files / 2) {
425 |       pack_loose_files_in_subdir(store);
426 |     }
427 |   }
428 |   pack_loose_files_in_subdir(store);
429 |   for (int i = 0; i < num_files; i++) {
430 |     int64_t mtime = get_mtime(tmpfile_paths[i]);
431 |     uint8_t *read_bitmap = read_from_packfile(tmpfile_paths[i], mtime, store);
432 |     mu_assert("Could not find bitmap in packfile", read_bitmap != NULL);
433 |     for (size_t j = 0; j < SIZEOF_BITMAP; j++) {
434 |       mu_assert("Wrong bitmap returned", bitmaps[i][j] == read_bitmap[j]);
435 |     }
436 |     free(read_bitmap);
437 |     free(bitmaps[i]);
438 |     free(tmpfile_paths[i]);
439 |   }
440 | 
441 |   // make sure loose file was removed
442 |   DIR *dir = opendir(store);
443 |   mu_assert("Error opening bitmap store directory", dir != NULL);
444 |   struct dirent *entry;
445 |   while ((entry = readdir(dir))) {
446 |     if (strcmp(entry->d_name, PACKFILE_NAME) == 0
447 |         || strcmp(entry->d_name, PACKFILE_INDEX_NAME) == 0
448 |         || strcmp(entry->d_name, ".") == 0
449 |         || strcmp(entry->d_name, "..") == 0) {
450 |       continue;
451 |     }
452 |     mu_assert("Loose file still in bitmap store directory", 0);
453 |   }
454 |   closedir(dir);
455 | 
456 |   return 0;
457 | }
458 | 
459 | static char *test_file_packing() {
460 |   mu_run_test(test_file_packing_single_file);
461 |   mu_run_test(test_file_packing_multiple_files);
462 |   mu_run_test(test_file_packing_existing_packfile);
463 |   return 0;
464 | }
465 | 
/**
 * With an empty store, neither check_loose_files() nor check_pack_files()
 * may report a hit for a file that was never indexed.
 */
static char *test_filter_checks_emptydir() {
  char template[] = "/tmp/4gramtmpdir.XXXXXX";
  char template2[] = "/tmp/4gramtmpdir.XXXXXX";
  char *store = mkdtemp(template);
  char *tmpfile_dir = mkdtemp(template2);
  mu_assert("Could not create tmpdir", store != NULL);
  mu_assert("Could not create tmpdir", tmpfile_dir != NULL);

  // the file exists on disk but nothing about it is written to the store
  char *source_path = add_path_parts(tmpfile_dir, "1.txt");
  FILE *source = fopen(source_path, "w");
  mu_assert("Could not create tmpfile", source != NULL);
  fputs("asdf", source);
  fclose(source);
  int64_t mtime = get_mtime(source_path);

  uint8_t *scratch = init_bitmap();
  mu_assert("Should not detect loose file",
            check_loose_files(source_path, mtime, scratch, store) != 0);
  mu_assert("Should not detect entry in pack file",
            check_pack_files(source_path, mtime, scratch, store) != 0);

  free(source_path);
  free(scratch);
  return NULL;
}
491 | 
/**
 * After a file's bitmap is stored as a loose file (and not packed),
 * check_loose_files() must find it and check_pack_files() must not.
 */
static char *test_filter_checks_loose_file() {
  char template[] = "/tmp/4gramtmpdir.XXXXXX";
  char template2[] = "/tmp/4gramtmpdir.XXXXXX";
  char *store = mkdtemp(template);
  char *tmpfile_dir = mkdtemp(template2);
  mu_assert("Could not create 4gramtmpdir", store != NULL);
  mu_assert("Could not create 4gramtmpdir", tmpfile_dir != NULL);

  // write a small temporary file
  char *tmpfile_path = add_path_parts(tmpfile_dir, "1.txt");
  FILE *tmpfile = fopen(tmpfile_path, "w");
  mu_assert("Could not create tmpfile", tmpfile != NULL);
  fputs("asdf", tmpfile);
  fclose(tmpfile);
  int64_t mtime = get_mtime(tmpfile_path);

  // compress it to loose file
  uint8_t *bitmap = init_bitmap();
  tmpfile = fopen(tmpfile_path, "r");
  mu_assert("Could not open tmpfile", tmpfile != NULL);
  apply_file_to_bitmap(bitmap, tmpfile);
  fclose(tmpfile);
  int ret = compress_to_file(bitmap, tmpfile_path, mtime, store);
  mu_assert("Error compressing", ret == 0);

  // look the file up using a separate output buffer so the bitmap that was
  // stored is not the one being overwritten by the lookup
  uint8_t *read_bitmap = init_bitmap();
  mu_assert("Should detect loose file",
            check_loose_files(tmpfile_path, mtime, read_bitmap, store) == 0);
  mu_assert("Should not detect entry in pack file",
            check_pack_files(tmpfile_path, mtime, read_bitmap, store) != 0);
  free(bitmap);
  free(read_bitmap);
  free(tmpfile_path);
  return 0;
}
526 | 
/**
 * After a file's bitmap is stored and the store is packed,
 * check_loose_files() must no longer find it and check_pack_files() must.
 */
static char *test_filter_checks_packfile() {
  char template[] = "/tmp/4gramtmpdir.XXXXXX";
  char template2[] = "/tmp/4gramtmpdir.XXXXXX";
  char *store = mkdtemp(template);
  char *tmpfile_dir = mkdtemp(template2);
  mu_assert("Could not create 4gramtmpdir", store != NULL);
  mu_assert("Could not create 4gramtmpdir", tmpfile_dir != NULL);

  // write a small temporary file
  char *tmpfile_path = add_path_parts(tmpfile_dir, "1.txt");
  FILE *tmpfile = fopen(tmpfile_path, "w");
  mu_assert("Could not create tmpfile", tmpfile != NULL);
  fputs("asdf", tmpfile);
  fclose(tmpfile);
  int64_t mtime = get_mtime(tmpfile_path);

  // compress it to loose file
  uint8_t *bitmap = init_bitmap();
  tmpfile = fopen(tmpfile_path, "r");
  mu_assert("Could not open tmpfile", tmpfile != NULL);
  apply_file_to_bitmap(bitmap, tmpfile);
  fclose(tmpfile);
  int ret = compress_to_file(bitmap, tmpfile_path, mtime, store);
  mu_assert("Error compressing", ret == 0);

  // move the loose file into the packfile
  pack_loose_files_in_subdir(store);

  // look the file up using a separate output buffer so the bitmap that was
  // stored is not the one being overwritten by the lookup
  uint8_t *read_bitmap = init_bitmap();
  mu_assert("Should not detect loose file",
            check_loose_files(tmpfile_path, mtime, read_bitmap, store) != 0);
  mu_assert("Should detect entry in pack file",
            check_pack_files(tmpfile_path, mtime, read_bitmap, store) == 0);
  free(bitmap);
  free(read_bitmap);
  free(tmpfile_path);
  return 0;
}
563 | 
564 | static char *test_filter_checks() {
565 |   mu_run_test(test_filter_checks_emptydir);
566 |   mu_run_test(test_filter_checks_loose_file);
567 |   mu_run_test(test_filter_checks_packfile);
568 |   return 0;
569 | }
570 | 
571 | static char *test_packfile_locking() {
572 |   uint8_t *bitmap = init_bitmap();
573 |   char *file_path = "/tmp/nonexistent";
574 |   char template[] = "/tmp/4gramtmpdir.XXXXXX";
575 |   char *store = mkdtemp(template);
576 |   mu_assert("Could not create tmpdir", store != NULL);
577 | 
578 |   int ret = compress_to_file(bitmap, file_path, 0, store);
579 |   mu_assert("Compress to file failed", ret == 0);
580 | 
581 |   char hashed_filename[21];
582 |   get_hash(file_path, strlen(file_path), hashed_filename);
583 |   strcat(hashed_filename, "_000");
584 |   char *path_to_bitmap_file = add_path_parts(store, hashed_filename);
585 |   mu_assert("Loose file not created\n", access(path_to_bitmap_file, F_OK) == 0);
586 | 
587 |   char *packfile_path = add_path_parts(store, PACKFILE_NAME);
588 | 
589 |   mu_assert("Could not lock packfile", lockfile_create(packfile_path, 0, 0) == 0);
590 | 
591 |   if (fork() == 0) {
592 |     pack_loose_files_in_subdir(store);
593 |     exit(0);
594 |   }
595 |   int wait_status;
596 |   wait(&wait_status);
597 | 
598 |   mu_assert("Loose files were packed despite lock",
599 |       access(path_to_bitmap_file, F_OK) == 0);
600 |   lockfile_remove(packfile_path);
601 |   free(path_to_bitmap_file);
602 |   free(packfile_path);
603 |   free(bitmap);
604 |   return 0;
605 | }
606 | 
607 | static char *test_get_4gram_indices() {
608 |   char *strings[] = {
609 |     "qwertyuiop",
610 |     "asdfghjkl",
611 |     "zxcvbnm!@#$%^&*()",
612 |   };
613 |   for (int i = 0; i < 3; i++) {
614 |     uint8_t *bitmap = init_bitmap();
615 |     apply_string_to_bitmap(bitmap, strings[i]);
616 |     int *indices = get_4gram_indices(strings[i]);
617 |     int len = strlen(strings[i]) - NGRAM_CHARS + 1;
618 |     for (int j = 0; j < len; j++) {
619 |       int k = indices[j];
620 |       mu_assert("Invalid 4gram indices", get_bit(bitmap, k));
621 |     }
622 |     free(bitmap);
623 |     free(indices);
624 |   }
625 |   return 0;
626 | }
627 | 
628 | static char *test_corruption_size() {
629 |   uint8_t *bitmap = init_bitmap();
630 |   apply_string_to_bitmap(bitmap, "hello");
631 |   char *orig_filename = "should be 12";
632 |   FILE *temp = fopen("tmp.txt", "w");
633 | 
634 |   uint16_t len = strlen(orig_filename);
635 |   void* compressed = malloc(131616);
636 |   uint32_t compressed_size = ZSTD_compress(compressed, 131616,
637 |                                            bitmap, SIZEOF_BITMAP, 3);
638 | 
639 |   uint16_t len_be = htobe16(len);
640 |   uint32_t compressed_size_be = htobe32(compressed_size);
641 |   fwrite(&len_be, sizeof(uint16_t), 1, temp);
642 |   fwrite(orig_filename, len, 1, temp);
643 |   fwrite(&compressed_size_be, sizeof(uint32_t), 1, temp);
644 |   fwrite(compressed, compressed_size, 1, temp);
645 |   fclose(temp);
646 | 
647 |   FILE *temp2 = fopen("tmp.txt", "r");
648 |   fseek(temp2, 0, SEEK_END);
649 |   unsigned long written_size = ftell(temp2);
650 |   rewind(temp2);
651 |   fclose(temp2);
652 |   remove("tmp.txt");
653 |   mu_assert("Size of file not same as written size",
654 |             (len + compressed_size + 6 == written_size));
655 |   free(compressed);
656 |   free(bitmap);
657 |   return 0;
658 | }
659 | 
660 | static char *test_loose_file_locking() {
661 |   uint8_t *bitmap = init_bitmap();
662 |   char *filename = "/tmp/nonexistent";
663 |   char template[] = "/tmp/4gramtmpdir.XXXXXX";
664 |   char *store = mkdtemp(template);
665 |   mu_assert("Could not create tmpdir", store != NULL);
666 |   compress_to_file(bitmap, filename, 0, store);
667 | 
668 |   char hash[21];
669 |   uint16_t len = strlen(filename);
670 |   get_hash(filename, len, hash);
671 |   char loose_file_name[PATH_MAX];
672 |   strcpy(loose_file_name, hash);
673 |   strcat(loose_file_name, "_000");
674 |   char *lockfile_path = get_lock_path(store, loose_file_name);
675 |   mu_assert("Could not lock file", lockfile_create(lockfile_path, 0, 0) == 0);
676 | 
677 |   if (fork() == 0) {
678 |     pack_loose_files_in_subdir(store);
679 |     exit(0);
680 |   }
681 |   int wait_status;
682 |   wait(&wait_status);
683 | 
684 |   char *loose_file_path = add_path_parts(store, loose_file_name);
685 |   mu_assert("Loose file was packed despite lock",
686 |       access(loose_file_path, F_OK) == 0);
687 |   free(loose_file_path);
688 |   lockfile_remove(lockfile_path);
689 |   free(lockfile_path);
690 |   free(bitmap);
691 | 
692 |   return 0;
693 | }
694 | 
695 | static char *test_strings_to_sorted_indices() {
696 |   char *strings[] = {
697 |     "qwertyuiop",
698 |     "asdfghjkl",
699 |     "zxcvbnm!@#$%^&*()",
700 |   };
701 |   uint8_t *bitmap = init_bitmap();
702 |   for (int i = 0; i < 3; i++) {
703 |     apply_string_to_bitmap(bitmap, strings[i]);
704 |   }
705 |   struct intarray indices;
706 |   indices = strings_to_sorted_indices(strings, 3);
707 |   for (int i = 0; i < POSSIBLE_NGRAMS; i++) {
708 |     if (get_bit(bitmap, i)) {
709 |       int contained = 0;
710 |       for (int j = 0; j < indices.length; j++) {
711 |         if (indices.data[j] == i) {
712 |           contained = 1;
713 |         }
714 |       }
715 |       mu_assert("strings_to_sorted_indices: Index not found", contained);
716 |     }
717 |   }
718 |   for (int i = 1; i < indices.length; i++) {
719 |     mu_assert("strings_to_sorted_indices: unsorted",
720 |         indices.data[i-1] <= indices.data[i]);
721 |   }
722 |   free(bitmap);
723 |   free_intarray(indices);
724 |   return 0;
725 | }
726 | 
727 | static char *test_mtime() {
728 |   char template[] = "/tmp/4gramtmpdir.XXXXXX";
729 |   char template2[] = "/tmp/4gramtmpdir.XXXXXX";
730 |   char *store = mkdtemp(template);
731 |   char *tmpfile_dir = mkdtemp(template2);
732 |   mu_assert("Could not create tmpdir", store != NULL);
733 |   char *tmpfile_path = add_path_parts(tmpfile_dir, "1.txt");
734 | 
735 |   // write a small temporary file
736 |   FILE *tmpfile = fopen(tmpfile_path, "w");
737 |   mu_assert("Could not create tmpfile", tmpfile != NULL);
738 |   fputs("qwertyuiop", tmpfile);
739 |   fclose(tmpfile);
740 |   int64_t mtime = 0;
741 | 
742 |   // create a bitmap for the file
743 |   uint8_t *bitmap = init_bitmap();
744 |   tmpfile = fopen(tmpfile_path, "r");
745 |   apply_file_to_bitmap(bitmap, tmpfile);
746 |   fclose(tmpfile);
747 | 
748 |   // figure out the hash of the file
749 |   char loose_file_name[PATH_MAX];
750 |   get_hash(tmpfile_path, strlen(tmpfile_path), loose_file_name);
751 | 
752 |   // compress the bitmap to a file in the store
753 |   int ret = compress_to_file(bitmap, tmpfile_path, mtime, store);
754 |   mu_assert("Error compressing", ret == 0);
755 | 
756 |   // make sure it exists
757 |   mu_assert("Compressed bitmap file doesn't exist",
758 |       access(loose_file_name, F_OK));
759 | 
760 |   uint8_t *bitmap1 = init_bitmap();
761 |   // make sure we can access the file with our mtime
762 |   mu_assert("Could not access loose_file with mtime 0",
763 |       check_loose_files(tmpfile_path, mtime, bitmap1, store) == 0);
764 |   mu_assert("Didn't get same bitmap back",
765 |       bitmaps_are_the_same(bitmap, bitmap1));
766 |   free(bitmap1);
767 |   bitmap1 = init_bitmap();
768 |   mu_assert("Got bitmap with invalid mtime",
769 |       check_loose_files(tmpfile_path, 123, bitmap1, store) != 0);
770 |   free(bitmap1);
771 | 
772 |   pack_loose_files_in_subdir(store);
773 | 
774 |   uint8_t *read_bitmap = read_from_packfile(tmpfile_path, mtime, store);
775 |   mu_assert("Could not find bitmap in packfile", read_bitmap != NULL);
776 |   mu_assert("Didn't get same bitmap back",
777 |       bitmaps_are_the_same(bitmap, read_bitmap));
778 |   mu_assert("Got bitmap with invalid mtime",
779 |       read_from_packfile(tmpfile_path, 1, store) == NULL);
780 | 
781 |   free(tmpfile_path);
782 |   free(bitmap);
783 |   free(read_bitmap);
784 |   return 0;
785 | }
786 | 
/**
 * Checks the "<indexdir>/<year>_<month>" path returned by
 * get_index_subdirectory() for four timestamps: the epoch, one second before
 * it, a date in 2017 and 2^31.
 */
static char *test_get_index_subdirectory() {
  char *indexdir = "/4gram";
  char *subdir;
  int matches;

  // timestamp 0
  subdir = get_index_subdirectory(indexdir, 0);
  matches = strcmp(subdir, "/4gram/1970_01") == 0;
  mu_assert("get_index_subdirectory epoch failed", matches);
  free(subdir);

  // negative timestamp
  subdir = get_index_subdirectory(indexdir, -1);
  matches = strcmp(subdir, "/4gram/1969_12") == 0;
  mu_assert("get_index_subdirectory negative failed", matches);
  free(subdir);

  // ordinary timestamp
  subdir = get_index_subdirectory(indexdir, 1502920742);
  matches = strcmp(subdir, "/4gram/2017_08") == 0;
  mu_assert("get_index_subdirectory normal date failed", matches);
  free(subdir);

  // first value that no longer fits in a signed 32-bit integer
  subdir = get_index_subdirectory(indexdir, 1L << 31);
  matches = strcmp(subdir, "/4gram/2038_01") == 0;
  mu_assert("get_index_subdirectory overflow test failed", matches);
  free(subdir);

  return 0;
}
807 | 
808 | 
809 | static char *run_tests() {
810 |   mu_run_test(test_init_bitmap);
811 |   mu_run_test(test_set_bit);
812 |   mu_run_test(test_string_to_bitmap);
813 |   mu_run_test(test_compress_to_file);
814 |   mu_run_test(test_compress_bitmap);
815 |   mu_run_test(test_file_packing);
816 |   mu_run_test(test_filter_checks);
817 |   mu_run_test(test_packfile_locking);
818 |   mu_run_test(test_get_4gram_indices);
819 |   mu_run_test(test_corruption_size);
820 |   mu_run_test(test_loose_file_locking);
821 |   mu_run_test(test_strings_to_sorted_indices);
822 |   mu_run_test(test_mtime);
823 |   mu_run_test(test_get_index_subdirectory);
824 |   return 0;
825 | }
826 | 
827 | int main() {
828 |   char *result = run_tests();
829 |   if (result != 0) {
830 |     printf("%s\n", result);
831 |   } else {
832 |     printf("All tests passed!\n");
833 |   }
834 |   if (system("rm -rf /tmp/4gramtmpdir.*") != 0) {
835 |     printf("Warning: error cleaning temporary directories.");
836 |   }
837 |   if (system("rm -rf /tmp/4gramtmpfile.*") != 0) {
838 |     printf("Warning: error cleaning temporary files.");
839 |   }
840 |   printf("Tests run: %d\n", tests_run);
841 |   return result != 0;
842 | }
843 | 


--------------------------------------------------------------------------------
/bitmap/src/bitmap.c:
--------------------------------------------------------------------------------
  1 | #include <unistd.h>
  2 | #include <errno.h>
  3 | #include <stdlib.h>
  4 | #include <immintrin.h>
  5 | #include <zstd.h>
  6 | #include <zlib.h>
  7 | #include <dirent.h>
  8 | #include <limits.h>
  9 | #include <fcntl.h>
 10 | #include <stdint.h>
 11 | #include <string.h>
 12 | #include <lockfile.h>
 13 | 
 14 | #include "bitmap.h"
 15 | #include "xxhash.h"
 16 | #include "util.h"
 17 | #include "portable_endian.h"
 18 | 
 19 | /*--------------------------------------------------------------------*/
 20 | 
 21 | #define ESTIMATED_ZSTD_SIZE (ZSTD_compressBound(SIZEOF_BITMAP))
 22 | 
 23 | /*--------------------------------------------------------------------*/
 24 | 
 25 | /**
 26 |  * Initalizes memory for bitmap
 27 |  */
 28 | uint8_t *init_bitmap(){
 29 |   uint8_t *bitmap = calloc(SIZEOF_BITMAP, 1);
 30 |   if (bitmap == NULL){
 31 |   	perror("Error: Bitmap not initialized");
 32 |     return(NULL);
 33 |   }
 34 |   return bitmap;
 35 | }
 36 | 
 37 | /*--------------------------------------------------------------------*/
 38 | 
 39 | void set_bit(uint8_t *bitmap, int bit_index){
 40 |   bitmap[bit_index / 8] |= (1 << bit_index % 8);
 41 | }
 42 | 
 43 | /*--------------------------------------------------------------------*/
 44 | 
 45 | uint8_t get_bit(uint8_t *bitmap, int bit_index) {
 46 |   return (bitmap[bit_index / 8] >> (bit_index % 8)) & 1;
 47 | }
 48 | 
 49 | /*--------------------------------------------------------------------*/
 50 | 
 51 | void write_bitmap(uint8_t *bitmap, FILE *file){
 52 |   fwrite(bitmap, 1, SIZEOF_BITMAP, file);
 53 | }
 54 | 
 55 | /*--------------------------------------------------------------------*/
 56 | 
 57 | char *get_lock_path(char *directory, char *filename) {
 58 |   char lock_filename[PATH_MAX];
 59 |   sprintf(lock_filename, ".%s.lock", filename);
 60 |   return add_path_parts(directory, lock_filename);
 61 | }
 62 | 
 63 | /*--------------------------------------------------------------------*/
 64 | 
 65 | /**
 66 |  * Finds the first path of the form "directory/filename_XXX" that doesn't
 67 |  * already exist, counting up from 000 to 999.
 68 |  *
 69 |  * When found, it creates a new file with mode 0666 and returns a file
 70 |  * descriptor.
 71 |  */
 72 | int available_name(char *filename, char *directory){
 73 |   char tmp[21];
 74 |   int i = 0;
 75 |   //Max hash collision will be _999
 76 |   while(i < 1000){
 77 |     sprintf(tmp, "%s_%.3d", filename, i);
 78 |     char *full_path = add_path_parts(directory, tmp);
 79 |     int fd = open(full_path, O_WRONLY | O_CREAT | O_EXCL, 0666);
 80 |     free(full_path);
 81 |     if(fd != -1) {//exists
 82 |       char *lock_path = get_lock_path(directory, tmp);
 83 |       int a = lockfile_create(lock_path, 0, 0);
 84 |       free(lock_path);
 85 |       if(a != 0){
 86 |         i++;
 87 |         continue;
 88 |       }
 89 |       strcpy(filename, tmp);
 90 |       return fd;
 91 |     }
 92 |     i++;
 93 |   }
 94 |   return(-1);
 95 | }
 96 | 
 97 | /*--------------------------------------------------------------------*/
 98 | 
 99 | /**
100 |  * Fucnction gets the xxhash of the filename and stores it
101 |  * in hash_hex_str
102 |  */
103 | int get_hash(char *filename, size_t len, char *hash_hex_str){
104 |   XXH64_canonical_t* dst = malloc(sizeof(XXH64_canonical_t));
105 |   if (dst == NULL){
106 |     perror("Error: Memory not allocated");
107 |     return(-1);
108 |   }
109 |   uint64_t hashed = XXH64(filename, len, HASH_SEED);
110 |   XXH64_canonicalFromHash(dst, hashed);
111 |   for(int i = 0; i < 8; i++){
112 |     sprintf((hash_hex_str+2*i), "%02X", dst->digest[i]);
113 |   }
114 |   free(dst);
115 |   return 0;
116 | }
117 | 
118 | /*--------------------------------------------------------------------*/
119 | 
120 | /**
121 |  * Function will write the bitmap that has been compressed in filename to
122 |  * decompressed.
123 |  * Saved data comprises of length of filename, filename, compressed data,
124 |  * decompressed size.
125 |  */
126 | int decompress_file(uint8_t *decompressed, char *full_path){
127 |   uint16_t len;
128 |   uint32_t compressed_size;
129 |   int ret_val = -1;
130 |   FILE *f = fopen(full_path, "r");
131 |   if(f == NULL) {
132 |     if (errno != ENOENT) {
133 |       perrorf("Error: File not opened: %s", full_path);
134 |     }
135 |     return ret_val;
136 |   }
137 |   if (fread(&len, sizeof(uint16_t), 1, f) != 1) {
138 |     perrorf("Error in reading file size: %s", full_path);
139 |     fclose(f);
140 |     return ret_val;
141 |   }
142 |   len = be16toh(len);
143 |   char orig_filename[len+1];
144 |   if (fread(orig_filename, len, 1, f) != 1){
145 |     perrorf("Error in reading filename: %s", full_path);
146 |     fclose(f);
147 |     return ret_val;
148 |   }
149 |   int64_t mtime;
150 |   if (fread(&mtime, sizeof(int64_t), 1, f) != 1) {
151 |     perrorf("Error in reading mtime: %s", full_path);
152 |     fclose(f);
153 |     return ret_val;
154 |   }
155 |   mtime = be64toh(mtime);
156 |   if (fread(&compressed_size, sizeof(uint32_t), 1, f) != 1){
157 |     perrorf("Error in reading decompressed size: %s", full_path);
158 |     fclose(f);
159 |     return ret_val;
160 |   }
161 |   compressed_size = be32toh(compressed_size);
162 |   char stream[compressed_size+1];
163 |   if (fread(&stream, compressed_size, 1, f) != 1){
164 |     perrorf("Error in reading decompressed file: %s", full_path);
165 |     goto OUT1;
166 |   }
167 |   size_t decompressed_size = ZSTD_decompress(decompressed, SIZEOF_BITMAP,
168 |                                              stream,compressed_size);
169 |   if(ZSTD_isError(decompressed_size) == 1){
170 |     perrorf("Error in decompression of %s: %s",
171 |             full_path, ZSTD_getErrorName(decompressed_size));
172 |   	goto OUT1;
173 |   }
174 |   ret_val = 0;
175 |   goto OUT1;
176 | 
177 |   OUT1:
178 |     fclose(f);
179 |     return ret_val;
180 | }
181 | 
182 | /*--------------------------------------------------------------------*/
183 | 
184 | /**
185 |  * Compresses the bitmap into the file described by fp using ZSTD
186 |  * The original filename's length is stored followed by the filename, followed
187 |  * by the compressed size, followed by the actual compressed data.
188 |  */
189 | int compress_to_fp(uint8_t *bitmap, FILE *fp, char *orig_filename,
190 |     int64_t mtime) {
191 |   uint16_t len = strlen(orig_filename);
192 |   void* compressed = malloc(ESTIMATED_ZSTD_SIZE);
193 |   int ret_val = -1;
194 |   if (compressed == NULL){
195 |   	perror("Error: Memory not allocated");
196 |     return ret_val;
197 |   }
198 | 
199 |   uint32_t compressed_size = ZSTD_compress(compressed, ESTIMATED_ZSTD_SIZE,
200 |                                            bitmap, SIZEOF_BITMAP, 8);
201 | 
202 |   if(ZSTD_isError(compressed_size) == 1) {
203 |   	perror("Error in compression");
204 |     goto OUT2;
205 |   }
206 |   uint16_t len_be = htobe16(len);
207 |   if (fwrite(&len_be, sizeof(uint16_t), 1, fp) != 1){
208 |     goto OUT2;
209 |   }
210 |   if (fwrite(orig_filename, len, 1, fp) != 1){
211 |     perror("Error: Filename not written");
212 |     goto OUT2;
213 |   }
214 |   int64_t mtime_be = htobe64(mtime);
215 |   if (fwrite(&mtime_be, sizeof(int64_t), 1, fp) != 1){
216 |     perror("Error: mtime not written");
217 |     goto OUT2;
218 |   }
219 |   uint32_t compressed_size_be = htobe32(compressed_size);
220 |   if (fwrite(&compressed_size_be, sizeof(uint32_t), 1, fp) != 1){
221 |     perror("Error: Compressed size not written");
222 |     goto OUT2;
223 |   }
224 |   if (fwrite(compressed, compressed_size, 1, fp) != 1){
225 |     perror("Error: Compressed file not written");
226 |     goto OUT2;
227 |   }
228 |   ret_val = 0;
229 |   goto OUT2;
230 | 
231 |   OUT2:
232 |     free(compressed);
233 |     return ret_val;
234 | }
235 | 
236 | /*--------------------------------------------------------------------*/
237 | 
/**
 * Compresses the bitmap into a new loose file inside indexdir. The file is
 * named after the hash of filename plus a three-digit collision counter
 * (chosen by available_name).
 *
 * The lock file taken by available_name is removed before returning,
 * whether or not the write succeeded.
 *
 * Returns 0 on success and -1 on failure.
 */
int compress_to_file(uint8_t *bitmap, char *filename, int64_t mtime,
    char *indexdir) {
  char hashed_filename[21], lock[27];
  int ret = -1;
  uint16_t len = strlen(filename);
  if (get_hash(filename, len, hashed_filename) != 0) {
    return(-1);
  }

  int fd = available_name(hashed_filename, indexdir);
  if (fd == -1) {
    // No file was created and no lock is held, so there is nothing to undo.
    perrorf("Error: File not opened: %s", hashed_filename);
    return(-1);
  }

  FILE *fp = fdopen(fd, "wb");
  if(fp == NULL) {
    perrorf("Error: File not opened: %s", hashed_filename);
    close(fd);
  } else {
    ret = compress_to_fp(bitmap, fp, filename, mtime);
    // Push the data to disk before the lock is dropped.
    if (fflush(fp) != 0) {
      ret = -1;
    }
    fsync(fd);
    if (fclose(fp) != 0) {
      ret = -1;
    }
  }

  // Always release the lock, otherwise the slot stays locked after a failure.
  sprintf(lock, ".%s.lock", hashed_filename);
  char *lock_path = add_path_parts(indexdir, lock);
  lockfile_remove(lock_path);
  free(lock_path);
  return ret;
}
263 | 
264 | /*--------------------------------------------------------------------*/
265 | 
266 | __attribute__ ((target("bmi2")))
267 | int init_4gram_state_bmi2(char *text) {
268 |   int n = 0;
269 |   for (int i = 0; i < NGRAM_CHARS; i++){
270 |     int tmp = text[i] & CHAR_MASK;
271 |     n = _pdep_u32(n, NGRAM_SHIFT_LEFT_MASK) + tmp;
272 |   }
273 |   return n;
274 | }
275 | 
276 | /*--------------------------------------------------------------------*/
277 | 
278 | int init_4gram_state_slow(char *text) {
279 |   int n = 0;
280 |   for (int i = 0; i < NGRAM_CHARS; i++){
281 |     int tmp = text[i] & CHAR_MASK;
282 |     n = ((n << NGRAM_CHAR_BITS) & NGRAM_MASK) + tmp;
283 |   }
284 |   return n;
285 | }
286 | 
287 | /*--------------------------------------------------------------------*/
288 | 
/**
 * Returns the ngram index of the first ngram in text, dispatching to the
 * BMI2 implementation when supports_bmi2() reports it is available.
 */
int init_4gram_state(char *text) {
  return supports_bmi2() ? init_4gram_state_bmi2(text)
                         : init_4gram_state_slow(text);
}
299 | 
300 | /*--------------------------------------------------------------------*/
301 | 
302 | __attribute__ ((target("bmi2")))
303 | int apply_to_bitmap_bmi2(uint8_t *bitmap, char *buf, int len, int n) {
304 |   for (int i = 0; i < len / sizeof(char); i++) {
305 |     int tmp = buf[i] & CHAR_MASK;
306 |     n = _pdep_u32(n, NGRAM_SHIFT_LEFT_MASK) + tmp;
307 |     set_bit(bitmap, n);
308 |   }
309 |   return n;
310 | }
311 | 
312 | /*--------------------------------------------------------------------*/
313 | 
314 | int apply_to_bitmap_slow(uint8_t *bitmap, char *buf, int len, int n) {
315 |   for (int i = 0; i < len / sizeof(char); i++) {
316 |     int tmp = buf[i] & CHAR_MASK;
317 |     n = ((n << NGRAM_CHAR_BITS) & NGRAM_MASK) + tmp;
318 |     set_bit(bitmap, n);
319 |   }
320 |   return n;
321 | }
322 | 
323 | /*--------------------------------------------------------------------*/
/**
 * Applies all of the ngrams in buf to bitmap, starting from state n, and
 * returns the resulting state.
 *
 * Uses the BMI2 implementation when supports_bmi2() reports it is
 * available and the portable one otherwise.
 */
int apply_to_bitmap(uint8_t *bitmap, char *buf, int len, int n) {
  return supports_bmi2() ? apply_to_bitmap_bmi2(bitmap, buf, len, n)
                         : apply_to_bitmap_slow(bitmap, buf, len, n);
}
337 | 
338 | /*--------------------------------------------------------------------*/
339 | /**
340 |  * Scans the file at filename and writes bits for its 4grams to bitmap.
341 |  * Decompresses the file to read it if the file is gzip-compressed.
342 |  * Returns GZ_TRUNCATED if the given file was gzip-compressed and the
343 |  * last read ended in the middle of the gzip stream.
344 |  */
345 | int apply_file_to_bitmap(uint8_t *bitmap, FILE *f){
346 |   int n = 0;
347 |   int ret_val = -1;
348 |   int fd = fileno(f);
349 |   char buf[BUFSIZE];
350 |   int read_amount;
351 | 
352 |   // open file with a dup fd so closing gzf doesn't close the file descriptor
353 |   int dup_fd = dup(fd);
354 |   if (dup_fd < 0) {
355 |     perror("Error duplicating fd");
356 |     return ret_val;
357 |   }
358 |   gzFile gzf = gzdopen(dup_fd, "r");
359 |   if (gzf == NULL) {
360 |     perror("Error opening gzip stream");
361 |     goto OUT1;
362 |   }
363 | 
364 |   // read first four characters to initialize 4gram
365 |   read_amount = gzread(gzf, buf, NGRAM_CHARS * sizeof(char));
366 |   if (read_amount < 0) {
367 |     fprintf(stderr, "gzread error: %s\n", gzerror(gzf, &read_amount));
368 |     gzclose(gzf);
369 |     goto OUT1;
370 |   }
371 |   if (read_amount == NGRAM_CHARS) {
372 |     n = init_4gram_state(buf);
373 |     set_bit(bitmap, n);
374 |   }
375 | 
376 |   // read rest of file
377 |   do {
378 |     read_amount = gzread(gzf, buf, BUFSIZE);
379 |     if (read_amount < 0) {
380 |       fprintf(stderr, "gzread error: %s\n", gzerror(gzf, &read_amount));
381 |       gzclose(gzf);
382 |       goto OUT1;
383 |     }
384 |     n = apply_to_bitmap(bitmap, buf, read_amount, n);
385 |   } while (read_amount > 0 || (read_amount < 0 && errno == EINTR));
386 | 
387 |   int gzclose_ret = gzclose(gzf);
388 |   if (gzclose_ret == Z_BUF_ERROR) {
389 |     return GZ_TRUNCATED;
390 |   } else if (gzclose_ret != Z_OK) {
391 |     perror("Error closing .gz file");
392 |     goto OUT1;
393 |   }
394 |   ret_val = 0;
395 |   goto OUT1;
396 | 
397 |   OUT1:
398 |     close(dup_fd);
399 |     return ret_val;
400 | }
401 | 
402 | /*--------------------------------------------------------------------*/
403 | 
404 | uint8_t *b_or_b(uint8_t *bitmap1, uint8_t *bitmap2){
405 |   uint8_t *b1_or_b2 = init_bitmap();
406 |   for (int i = 0 ; i < SIZEOF_BITMAP; i++) {
407 |     uint8_t b1 = get_bit(bitmap1, i);
408 |     uint8_t b2 = get_bit(bitmap2, i);
409 |     set_bit(b1_or_b2, (b1 | b2));
410 |   }
411 |   return b1_or_b2;
412 | }
413 | 
414 | /*--------------------------------------------------------------------*/
415 | 
416 | 
417 | 
418 | 
419 | 
420 | 


--------------------------------------------------------------------------------
/bitmap/src/bitmap.h:
--------------------------------------------------------------------------------
 1 | #ifndef BITMAP_INCLUDED
 2 | #define BITMAP_INCLUDED
 3 | 
 4 | /*--------------------------------------------------------------------*/
 5 | 
 6 | #include <stdint.h>
 7 | #include <stdio.h>
 8 | 
 9 | /*--------------------------------------------------------------------*/
10 | 
11 | uint8_t *init_bitmap();
12 | 
13 | void set_bit(uint8_t *bitmap, int bit_index);
14 | 
15 | uint8_t get_bit(uint8_t *bitmap, int bit_index);
16 | 
17 | void write_bitmap(uint8_t *bitmap, FILE *file);
18 | 
19 | int get_hash(char *filename, size_t len, char *hash_hex_str);
20 | 
21 | int decompress_file(uint8_t *decompressed, char *full_path);
22 | 
23 | int compress_to_fp(uint8_t *bitmap, FILE *fp, char *orig_filename, int64_t mtime);
24 | 
25 | int compress_to_file(uint8_t *bitmap, char *filename, int64_t mtime, char *indexdir);
26 | 
27 | int apply_file_to_bitmap(uint8_t *bitmap, FILE *f);
28 | 
29 | uint8_t *b_or_b(uint8_t *bitmap1, uint8_t *bitmap2);
30 | 
31 | /*--------------------------------------------------------------------*/
32 | 
33 | #endif
34 | 


--------------------------------------------------------------------------------
/bitmap/src/filter.c:
--------------------------------------------------------------------------------
  1 | #include <sys/stat.h>
  2 | #include <sys/types.h>
  3 | #include <stdio.h>
  4 | #include <string.h>
  5 | #include <stdlib.h>
  6 | #include <limits.h>
  7 | #include <stdint.h>
  8 | #include <dirent.h>
  9 | #include <fcntl.h>
 10 | #include <unistd.h>
 11 | #include <immintrin.h>
 12 | #include <lockfile.h>
 13 | #include <errno.h>
 14 | 
 15 | #include "bitmap.h"
 16 | #include "filter.h"
 17 | #include "packfile.h"
 18 | #include "util.h"
 19 | #include "xxhash.h"
 20 | #include "portable_endian.h"
 21 | 
 22 | /*--------------------------------------------------------------------*/
 23 | 
 24 | #define BITMAP_CREATED 2
 25 | 
 26 | /*--------------------------------------------------------------------*/
 27 | 
 28 | /**
 29 |  * Checks packfiles for filename.
 30 |  *
 31 |  * If an entry with the given filename and mtime is found in a packfile, it is
 32 |  * applied to the given bitmap.
 33 |  */
 34 | int check_pack_files(char *filename, int64_t mtime, uint8_t *bitmap,
 35 |     char *dir){
 36 |   errno = 0;
 37 |   uint8_t *read_bitmap = read_from_packfile(filename, mtime, dir);
 38 |   if(read_bitmap == NULL) {
 39 |     if (errno == ESTALE) {
 40 |       // retry once on stale NFS file handle
 41 |       read_bitmap = read_from_packfile(filename, mtime, dir);
 42 |       if (read_bitmap == NULL) {
 43 |         if (errno == ESTALE) {
 44 |           perrorf("Error checking packfile for %s", filename);
 45 |         }
 46 |         return(-1);
 47 |       }
 48 |     }
 49 |     return(-1);
 50 |   }
 51 |   memcpy(bitmap, read_bitmap, SIZEOF_BITMAP);
 52 |   free(read_bitmap);
 53 |   return 0;
 54 | }
 55 | 
 56 | /*--------------------------------------------------------------------*/
 57 | 
 58 | /**
 59 |  * Checks the loosefiles in the directory to see if the bitmap exists.
 60 |  *
 61 |  * If an entry with the given filename and mtime is found in a loose file, it
 62 |  * is applied to the given bitmap.
 63 |  */
 64 | int check_loose_files(char *filename, int64_t mtime, uint8_t *bitmap, char *directory){
 65 |   int ret_val = -1;
 66 |   uint16_t orig_len;
 67 |   char hashed_filename[21];
 68 |   char tmp[27];
 69 |   DIR *dir;
 70 |   FILE *possible;
 71 | 
 72 |   uint16_t len = strlen(filename);
 73 |   get_hash(filename, len, hashed_filename);
 74 | 
 75 |   if ((dir = opendir(directory)) == NULL) {
 76 |     perrorf("Error opening directory: %s", directory);
 77 |     return ret_val;
 78 |   }
 79 | 
 80 |   int i = 0;
 81 |   char *tmp_real_path;
 82 | 
 83 |   while(i < 1000){
 84 |     sprintf(tmp, "%s_%.3d", hashed_filename, i);
 85 |     tmp_real_path = add_path_parts(directory, tmp);
 86 | 
 87 |     possible = fopen(tmp_real_path, "r");
 88 |     if (possible == NULL) {
 89 |       free(tmp_real_path);
 90 |       break;
 91 |     }
 92 | 
 93 |     char *lock_path = get_lock_path(directory, tmp);
 94 |     int ret = lockfile_check(lock_path, 0);
 95 |     free(lock_path);
 96 |     if(ret == 0){
 97 |       free(tmp_real_path);
 98 |       break;
 99 |     }
100 | 
101 |     if(remove_if_corrupted(possible, tmp_real_path)) {
102 |       i++;
103 |       free(tmp_real_path);
104 |       fclose(possible);
105 |       continue;
106 |     }
107 | 
108 |     if (fread(&orig_len, 2, 1, possible) != 1) {
109 |       perrorf("Error in reading file size: %s", tmp_real_path);
110 |       goto OUT1;
111 |     }
112 |     orig_len = be16toh(orig_len);
113 |     char orig_filename[orig_len];
114 |     if (fread(orig_filename, orig_len, 1, possible) != 1){
115 |       perrorf("Error in reading filename: %s", tmp_real_path);
116 |       goto OUT1;
117 |       
118 |     }
119 |     if(strncmp(orig_filename, filename, len) != 0){
120 |       goto OUT1;
121 |     }
122 | 
123 |     int64_t loose_mtime;
124 |     if (fread(&loose_mtime, sizeof(int64_t), 1, possible) != 1) {
125 |       perrorf("Error in reading mtime: %s", tmp_real_path);
126 |       goto OUT1;
127 |     }
128 |     loose_mtime = be64toh(loose_mtime);
129 |     if (loose_mtime != mtime) {
130 |       goto OUT1;
131 |     }
132 | 
133 |     if(decompress_file(bitmap, tmp_real_path) == 0){
134 |       ret_val = 0;
135 |       goto OUT1;
136 |     }
137 |     
138 |     fclose(possible);
139 |     free(tmp_real_path);
140 |     i++;
141 |   }
142 |   closedir(dir);
143 |   return ret_val;
144 | 
145 |   OUT1:
146 |     closedir(dir);
147 |     free(tmp_real_path);
148 |     fclose(possible);
149 |     return ret_val;
150 | 
151 | }
152 | 
153 | /*--------------------------------------------------------------------*/
154 | 
155 | /**
156 |  * Scans the file at filename and writes bits for its 4grams to bitmap.
157 |  * Decompresses the file to read it if the file is gzip-compressed.
158 |  *
159 |  * If the bitmap is cached in the index directory, the bitmap is read from the
160 |  * cache and the file at filename is ignored.
161 |  *
162 |  * Returns 0 upon success.
163 |  * Returns GZ_TRUNCATED if the given file was gzip-compressed and the
164 |  * last read ended in the middle of the gzip stream.
165 |  * Returns 3 if the given file does not exist.
166 |  */
167 | int get_bitmap_for_file(uint8_t *bitmap, char *filename, char *indexdir) {
168 |   char *real_path = realpath(filename, NULL);
169 |   if (real_path == NULL) {
170 |     return 3;
171 |   }
172 |   int64_t mtime = get_mtime(real_path);
173 |   char *index_subdir = get_index_subdirectory(indexdir, mtime);
174 |   int ret_val = 0;
175 |   //check loosefiles
176 |   if (check_loose_files(real_path, mtime, bitmap, index_subdir) == 0){
177 |     goto OUT2;
178 |   }
179 |   // not in loosefiles so check packfiles
180 |   if(check_pack_files(real_path, mtime, bitmap, index_subdir) == 0){
181 |     goto OUT2;
182 |   }
183 | 
184 |   FILE *file = fopen(real_path, "r");
185 |   if (file == NULL) {
186 |     perrorf("Could not open file %s", real_path);
187 |     ret_val = 1;
188 |     goto OUT2;
189 |   }
190 | 
191 |   int ret = apply_file_to_bitmap(bitmap, file);
192 |   fclose(file);
193 |   if (ret != 0) {
194 |     ret_val = ret;
195 |     goto OUT2;
196 |   }
197 |   compress_to_file(bitmap, real_path, mtime, index_subdir);
198 |   return BITMAP_CREATED;
199 | 
200 |   OUT2:
201 |     free(real_path);
202 |     return ret_val;
203 | }
204 | 
205 | /*--------------------------------------------------------------------*/
206 | 
207 | int *get_4gram_indices_slow(char *string) {
208 |   int len = strlen(string);
209 |   int n = 0;
210 |   if (len <= 0)
211 |     return NULL;
212 | 
213 |   if (len < NGRAM_CHARS) {
214 |     int *indices = malloc(sizeof(int));
215 |     for (int i = 0; i < len; i++){
216 |       int tmp = string[i] & CHAR_MASK;
217 |       n = ((n << NGRAM_CHAR_BITS) & NGRAM_MASK) + tmp;
218 |     }
219 |     indices[0] = n;
220 |     return indices;
221 |   }
222 |   int *indices = malloc((strlen(string) - NGRAM_CHARS + 1) * sizeof(int));
223 |   for (int i = 0; i < NGRAM_CHARS - 1; i++){
224 |     int tmp = string[i] & CHAR_MASK;
225 |     n = ((n << NGRAM_CHAR_BITS) & NGRAM_MASK) + tmp;
226 |   }
227 |   for (int i = NGRAM_CHARS - 1; i < len; i++) {
228 |     int tmp = string[i] & CHAR_MASK;
229 |     n = ((n << NGRAM_CHAR_BITS) & NGRAM_MASK) + tmp;
230 |     indices[i - NGRAM_CHARS + 1] = n;
231 |   }
232 |   return indices;
233 | }
234 | 
235 | /*--------------------------------------------------------------------*/
236 | 
237 | __attribute__ ((target("bmi2")))
238 | int *get_4gram_indices_bmi2(char *string) {
239 |   int len = strlen(string);
240 |   int n = 0;
241 |   if (len <= 0)
242 |     return NULL;
243 |   if (len < NGRAM_CHARS) {
244 |     int *indices = malloc(sizeof(int));
245 |     for (int i = 0; i < len; i++){
246 |       int tmp = string[i] & CHAR_MASK;
247 |       n = ((n << NGRAM_CHAR_BITS) & NGRAM_MASK) + tmp;
248 |     }
249 |     indices[0] = n;
250 |     return indices;
251 |   }
252 |   int *indices = malloc((strlen(string) + 1 - NGRAM_CHARS) * sizeof(int));
253 |   for (int i = 0; i < NGRAM_CHARS - 1; i++){
254 |     int tmp = string[i] & CHAR_MASK;
255 |     n = _pdep_u32(n, NGRAM_SHIFT_LEFT_MASK) + tmp;
256 |   }
257 |   for (int i = NGRAM_CHARS - 1; i < len; i++) {
258 |     int tmp = string[i] & CHAR_MASK;
259 |     n = _pdep_u32(n, NGRAM_SHIFT_LEFT_MASK) + tmp;
260 |     indices[i - NGRAM_CHARS + 1] = n;
261 |   }
262 |   return indices;
263 | }
264 | 
265 | /*--------------------------------------------------------------------*/
266 | 
/**
 * Returns an array of ngram indices found in the provided string, using
 * the BMI2 implementation when supports_bmi2() reports it is available.
 */
int *get_4gram_indices(char *string) {
  return supports_bmi2() ? get_4gram_indices_bmi2(string)
                         : get_4gram_indices_slow(string);
}
277 | 
278 | /*--------------------------------------------------------------------*/
279 | 
/**
 * qsort-style comparator for ints: returns -1, 0 or 1 when *a is less
 * than, equal to or greater than *b.
 */
int compare_ints(const void *a, const void *b) {
  int lhs = *(const int *) a;
  int rhs = *(const int *) b;
  if (lhs < rhs) {
    return -1;
  }
  if (lhs > rhs) {
    return 1;
  }
  return 0;
}
285 | 
286 | /*--------------------------------------------------------------------*/
287 | 
/**
 * Merges the sorted int arrays arr1 and arr2 into result, which must have
 * room for arr1size + arr2size elements. When the two heads are equal the
 * element from arr2 is written first.
 */
void two_finger_merge_int(int *arr1, int arr1size,
                          int *arr2, int arr2size,
                          int *result) {
  int total = arr1size + arr2size;
  int taken1 = 0;
  int taken2 = 0;
  int out = 0;
  while (out < total) {
    int take_first = 0;
    if (taken1 < arr1size) {
      take_first = (taken2 >= arr2size) || (arr1[taken1] < arr2[taken2]);
    }
    if (take_first) {
      result[out] = arr1[taken1];
      taken1++;
    } else {
      result[out] = arr2[taken2];
      taken2++;
    }
    out++;
  }
}
308 | 
309 | /*--------------------------------------------------------------------*/
310 | 
311 | /**
312 |  * Gets the indices of the grams in the given index string, puts them in an
313 |  * array, sorts them, and returns them.
314 |  */
315 | struct intarray string_to_sorted_indices(char *index_string){
316 |   int len_index_string = strlen(index_string);
317 |   int *index_string_4gram_indices = get_4gram_indices(index_string);
318 |   struct intarray arr = {
319 |     .length = len_index_string - NGRAM_CHARS + 1,
320 |     .data = index_string_4gram_indices,
321 |   };
322 |   qsort(arr.data, arr.length, sizeof(int), compare_ints);
323 |   return arr;
324 | }
325 | 
326 | /*--------------------------------------------------------------------*/
327 | /**
328 |  * Returns a sorted list of all ngram indices found in the index strings.
329 |  */
330 | struct intarray strings_to_sorted_indices(char **index_strings,
331 |                                           int num_index_strings) {
332 | 
333 |   struct intarray indices;
334 |   indices = string_to_sorted_indices(index_strings[0]);
335 |   for (int i = 1; i < num_index_strings; i++) {
336 |     struct intarray old_indices = indices;
337 |     struct intarray new_indices;
338 |     new_indices = string_to_sorted_indices(index_strings[i]);
339 |     indices.length = old_indices.length + new_indices.length;
340 |     indices.data = malloc(indices.length * sizeof(int));
341 |     two_finger_merge_int(old_indices.data, old_indices.length,
342 |         new_indices.data, new_indices.length, indices.data);
343 |     free_intarray(old_indices);
344 |     free_intarray(new_indices);
345 |   }
346 |   return indices;
347 | }
348 | 
349 | /**
350 |  * Returns 1 if file_bitmap does not match filter.
351 |  *
352 |  * filter is a 'sum of products' array of arrays of ngram indices. The indices
353 |  * in each subarray are anded together, and each subarray is orred together.
354 |  * Put another way, we filter out files that don't contain all the ngrams in at
355 |  * least one subarray.
356 |  *
357 |  */
358 | int should_filter_out_file(uint8_t *file_bitmap, struct intarrayarray filter) {
359 |   int contained = 0;
360 |   for (int i = 0; i < filter.num_rows; i++) {
361 |     int ngrams_in_subarray_all_present = 1;
362 |     for (int j = 0; j < filter.rows[i].length; j++) {
363 |       if (!get_bit(file_bitmap, filter.rows[i].data[j])) {
364 |         ngrams_in_subarray_all_present = 0;
365 |         break;
366 |       }
367 |     }
368 |     if (ngrams_in_subarray_all_present) {
369 |       contained = 1;
370 |       break;
371 |     }
372 |   }
373 |   return !contained;
374 | }
375 | 
376 | /*--------------------------------------------------------------------*/
377 | /**
378 |  * Function that is called by 4grep to start filtering using search strings
379 |  *
380 |  * See should_filter_out_file for details on ngram_filter.
381 |  *
382 |  * Returns -1 upon failure, 1 if bitmap is found and indices
383 |  * match, 2 if bitmap found but does not match, 3 if no bitmap found and
384 |  * matches, 4 if did not have bitmap and has no match.
385 |  */
386 | int start_filter(struct intarrayarray ngram_filter,
387 |                  char *filename, char *indexdir){
388 | 
389 |   int ret = -1, MTCH = 1, NO_MTCH = 2;
390 |   mode_t old_umask = umask(0);
391 | 
392 |   // now start filtering files
393 |   uint8_t *file_bitmap = init_bitmap();
394 | 
395 |   int bitmap_ret = get_bitmap_for_file(file_bitmap, filename,
396 |                                        indexdir);
397 |   if (bitmap_ret != 0 && bitmap_ret != 2) {
398 |     goto OUT1;
399 |   }
400 | 
401 |   int filtered = should_filter_out_file(file_bitmap, ngram_filter);
402 | 
403 |   if (!filtered)
404 |     ret = MTCH;
405 |   else
406 |     ret = NO_MTCH;
407 | 
408 |   if (bitmap_ret == BITMAP_CREATED)
409 |     ret += BITMAP_CREATED;
410 | 
411 |   OUT1:
412 |     free(file_bitmap);
413 |     umask(old_umask);
414 |     return ret;
415 | }
416 | 


--------------------------------------------------------------------------------
/bitmap/src/filter.h:
--------------------------------------------------------------------------------
 1 | #ifndef FILTER_INCLUDED
 2 | #define FILTER_INCLUDED
 3 | 
 4 | /*--------------------------------------------------------------------*/
 5 | 
 6 | #include <stdint.h>
 7 | #include <stdio.h>
 8 | #include <util.h>
 9 | 
10 | /*--------------------------------------------------------------------*/
11 | 
12 | int check_pack_files(char *filename, int64_t mtime, uint8_t *bitmap, char *dir);
13 | 
14 | int check_loose_files(char *filename, int64_t mtime, uint8_t *bitmap, char *directory);
15 | 
16 | int *get_4gram_indices(char *string);
17 | 
18 | struct intarray strings_to_sorted_indices(char **index_strings,
19 |                                           int num_index_strings);
20 | 
21 | struct intarrayarray strings_to_filter_anded(char **index_strings,
22 |                                         int num_index_strings);
23 | 
24 | struct intarrayarray strings_to_filter_orred(char **index_strings,
25 |                                         int num_index_strings);
26 | 
27 | int should_filter_out_file(uint8_t *file_bitmap, struct intarrayarray filter);
28 | /*--------------------------------------------------------------------*/
29 | 
30 | #endif
31 | 


--------------------------------------------------------------------------------
/bitmap/src/packfile.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <sys/file.h>
  3 | #include <sys/stat.h>
  4 | #include <sys/mman.h>
  5 | #include <stdlib.h>
  6 | #include <unistd.h>
  7 | #include <limits.h>
  8 | #include <string.h>
  9 | #include <dirent.h>
 10 | #include <stdint.h>
 11 | #include <assert.h>
 12 | #include <zstd.h>
 13 | #include <errno.h>
 14 | #include <lockfile.h>
 15 | #include <time.h>
 16 | #include <pthread.h>
 17 | 
 18 | #include "bitmap.h"
 19 | #include "util.h"
 20 | #include "xxhash.h"
 21 | #include "packfile.h"
 22 | #include "portable_endian.h"
 23 | 
 24 | /*--------------------------------------------------------------------*/
 25 | 
/** An entry in the index.
 * Note: in order to allow mmaping the index file, this struct stores
 * packfile_offset as big-endian!
 */
struct index_entry {
  // XXH64 hash of the indexed filename; compared directly against hashes
  // computed at lookup time.
  uint64_t hash;
  // Big-endian byte offset of the entry inside the packfile; convert with
  // be64toh before seeking.
  uint64_t packfile_offset;
};
 34 | 
 35 | /*--------------------------------------------------------------------*/
 36 | 
 37 | /**
 38 |  * Checks to see if loosefile was interrupted when writing by checking
 39 |  * actual size vs expected size
 40 |  * Returns 0 if no error, returns EMPTY_FILE if loose file is empty,
 41 |  * returns -1 if other error or file corrupted.
 42 |  */
 43 | int is_corrupted(FILE* loosefile) {
 44 |   uint16_t len;
 45 |   uint32_t compressed_size;
 46 |   int64_t mtime;
 47 | 
 48 |   struct stat loosefile_stat;
 49 |   fstat(fileno(loosefile), &loosefile_stat);
 50 |   off_t loosefile_size = loosefile_stat.st_size;
 51 | 
 52 |   if (loosefile_size == 0) {
 53 |     return EMPTY_FILE;
 54 |   }
 55 | 
 56 |   if (fread(&len, sizeof(uint16_t), 1, loosefile) != 1) {
 57 |     perror("Error in reading filename size");
 58 |     return(-1);
 59 |   }
 60 |   len = be16toh(len);
 61 |   if (fseek(loosefile, len, SEEK_CUR) != 0) {
 62 |     perror("Error reading loose file");
 63 |     return(-1);
 64 |   }
 65 |   if (fread(&mtime, sizeof(int64_t), 1, loosefile) != 1) {
 66 |     perror("Error in reading mtime");
 67 |     return(-1);
 68 |   }
 69 |   mtime = be64toh(mtime);
 70 |   if (fread(&compressed_size, sizeof(uint32_t), 1, loosefile) != 1){
 71 |     perror("Error in reading decompressed size");
 72 |     return(-1);
 73 |   }
 74 |   compressed_size = be32toh(compressed_size);
 75 | 
 76 |   if((len + compressed_size + sizeof(uint16_t) + sizeof(uint32_t) +
 77 |         sizeof(int64_t)) != loosefile_size){
 78 |     fprintf(stderr, "Corrupted file: l:%u, cs:%u, filesize:%lld\n",
 79 |             len, compressed_size, (long long) loosefile_size);
 80 |     return(-1);
 81 |   }
 82 | 
 83 |   rewind(loosefile);
 84 |   return 0;
 85 | }
 86 | 
 87 | /*--------------------------------------------------------------------*/
 88 | 
 89 | /**
 90 |  * Returns 1 if file was detected as corrupted, deleted and should be skipped.
 91 |  * Returns 2 if file was empty and should be skipped, but wasn't deleted.
 92 |  * Returns 0 otherwise.
 93 |  */
 94 | int remove_if_corrupted(FILE *file, char *file_path) {
 95 |   int corrupt_status = is_corrupted(file);
 96 |   if(corrupt_status != 0){
 97 |     // ignore empty files because they could mean we acquired a read lock
 98 |     // on the file before the writing process could acquire a write lock
 99 |     // TODO: remove empty files that are very old
100 |     if (corrupt_status != EMPTY_FILE) {
101 |       remove(file_path);
102 |       return 1;
103 |     }
104 |     return 2;
105 |   }
106 |   return 0;
107 | }
108 | 
109 | /*--------------------------------------------------------------------*/
110 | 
111 | /**
112 |  * Returns the index into the packfile index entries where the first entry with
113 |  * the given hash is located, or -1 if the hash does not exist in the index.
114 |  */
115 | size_t find_hash_in_index(struct index_entry *index,
116 |     size_t num_entries, uint64_t hash) {
117 | 
118 |   size_t left = 0;
119 |   size_t right = num_entries - 1;
120 |   while (left != right) {
121 |     size_t middle = (right + left) / 2;
122 |     if (index[middle].hash < hash) {
123 |       left = middle + 1;
124 |     } else {
125 |       right = middle;
126 |     }
127 |   }
128 |   if (index[left].hash != hash) {
129 |     return -1;
130 |   } else {
131 |     return left;
132 |   }
133 | }
134 | 
135 | /*--------------------------------------------------------------------*/
136 | 
137 | /**
138 |  * Calculates the number of existing entries in the packfile index based on the
139 |  * size of the packfile index file.
140 |  *
141 |  * Returns -1 on error.
142 |  */
143 | size_t get_num_index_entries(FILE *packfile_index) {
144 |   if (fseek(packfile_index, 0, SEEK_END) != 0) {
145 |     return(-1);
146 |   }
147 |   long index_size = ftell(packfile_index);
148 |   rewind(packfile_index);
149 |   return index_size / sizeof(struct index_entry);
150 | }
151 | 
152 | /*--------------------------------------------------------------------*/
153 | 
154 | /**
155 |  * Reads the data stored in the packfile with the given name.
156 |  * Assumes the data is (the size of a) bitmap when decompressed.
157 |  *
158 |  * filename: name of file to search for in the packfile
159 |  * mtime: mtime of file to search for in the packfile
160 |  * indexdir: index directory
161 |  */
162 | uint8_t *read_from_packfile(char *filename, int64_t mtime, char *indexdir) {
163 |   
164 |   char *packfile_path = add_path_parts(indexdir, PACKFILE_NAME);
165 |   FILE *packfile = fopen(packfile_path, "r");
166 |   uint8_t *packed_file = NULL;
167 |   free(packfile_path);
168 |   if(packfile == NULL) {
169 |     if (errno != ENOENT) {
170 |       perror("Error: could not open packfile");
171 |     }
172 |     return(NULL);
173 |   }
174 |   char *packfile_index_path = add_path_parts(indexdir, PACKFILE_INDEX_NAME);
175 |   FILE *packfile_index = fopen(packfile_index_path, "r");
176 |   free(packfile_index_path);
177 |   if(packfile_index == NULL) {
178 |     fclose(packfile);
179 |     if (errno != ENOENT)
180 |       perror("Error: could not open packfile index");
181 |     return(NULL);
182 |   }
183 | 
184 |   uint64_t hashed = XXH64(filename, strlen(filename), HASH_SEED);
185 |   size_t num_index_entries = get_num_index_entries(packfile_index);
186 |   if (num_index_entries <= 0)
187 |     goto OUT2;
188 |   size_t index_filesize = num_index_entries * sizeof(struct index_entry);
189 |   struct index_entry *index = mmap(NULL, index_filesize, PROT_READ,
190 |                                    MAP_PRIVATE, fileno(packfile_index), 0);
191 |   if(index == MAP_FAILED){
192 |     perror("Error: could not mmap packfile index");
193 |     goto OUT1;
194 |   }
195 |   size_t first_identical_hash_loc = find_hash_in_index(
196 |       index, num_index_entries, hashed);
197 |   if (first_identical_hash_loc == -1) {
198 |     goto OUT1;
199 |   }
200 |   // now to see if any of the identical hashes map to the same filename
201 |   // we need to read the packfile for this
202 |   for (size_t i = first_identical_hash_loc; index[i].hash == hashed; i++) {
203 |     size_t offset = be64toh(index[i].packfile_offset);
204 |     uint16_t name_len;
205 |     fseek(packfile, offset, SEEK_SET);
206 |     if (fread(&name_len, sizeof(uint16_t), 1, packfile) != 1) {
207 |       perror("Error in packfile fread");
208 |       goto OUT1;
209 |     }
210 |     name_len = be16toh(name_len);
211 |     char packed_filename[name_len];
212 |     if (fread(packed_filename, name_len, 1, packfile) != 1) {
213 |       perror("Error in packfile fread");
214 |       goto OUT1;
215 |     }
216 |     if (strncmp(packed_filename, filename, name_len) != 0) {
217 |       continue;
218 |     }
219 |     int64_t packed_mtime;
220 |     if (fread(&packed_mtime, sizeof(int64_t), 1, packfile) != 1) {
221 |       perror("Error in packfile fread");
222 |       goto OUT1;
223 |     }
224 |     packed_mtime = be64toh(packed_mtime);
225 |     if (packed_mtime != mtime) {
226 |       continue;
227 |     }
228 |     uint32_t packed_file_len;
229 |     // we found an entry with the same filename!
230 |     // now we may read the file
231 |     if (fread(&packed_file_len, sizeof(uint32_t), 1, packfile) != 1) {
232 |       perror("Error in packfile fread");
233 |       goto OUT1;
234 |     }
235 |     packed_file_len = be32toh(packed_file_len);
236 |     uint8_t compressed_file[packed_file_len];
237 |     if (fread(&compressed_file, packed_file_len, 1, packfile) != 1) {
238 |       perror("Error in packfile fread");
239 |       goto OUT1;
240 |     }
241 |     // now decompress it
242 |     packed_file = malloc(SIZEOF_BITMAP);
243 |     if (packed_file == NULL){
244 |       perror("Error: Memory not allocated");
245 |       goto OUT1;
246 |     }
247 | 
248 |     size_t s = ZSTD_decompress(packed_file, SIZEOF_BITMAP,
249 |         compressed_file, packed_file_len);
250 |     if(ZSTD_isError(s) == 1){
251 |       fprintf(stderr, "Error in packfile decompression: %s\n",
252 |               ZSTD_getErrorName(s));
253 |       free(packed_file);
254 |       packed_file = NULL;
255 |       goto OUT1;
256 |     }
257 |   }
258 | 
259 |   OUT1:
260 |     if (munmap(index, index_filesize) == -1) {
261 |       perror("Error in packfile index munmap");
262 |     }
263 |   OUT2:
264 |     fclose(packfile);
265 |     fclose(packfile_index);
266 |     return packed_file;
267 | 
268 | }
269 | 
270 | /*--------------------------------------------------------------------*/
271 | 
/**
 * Makes sure a file exists at the given path, creating it with mode 0666
 * (subject to the process umask) when it is missing.
 *
 * Returns 0 on success, or -1 after reporting the error when the file can
 * be neither opened nor created.
 */
int create_file_if_nonexistent(char *path) {
  const int descriptor = open(path, O_CREAT, 0666);
  if (descriptor != -1) {
    // Only the existence of the file matters; the descriptor is not needed.
    close(descriptor);
    return 0;
  }
  perrorf("Error creating file: %s", path);
  return(-1);
}
285 | 
286 | /*--------------------------------------------------------------------*/
287 | 
/**
 * Writes `size` bytes from `data` at the packfile's current position.
 *
 * Returns the stream offset at which the data starts (as reported by ftell
 * before the write), or -1 if the write fails.
 */
long write_data_to_packfile(void *data, size_t size, FILE *packfile) {
  const long start_offset = ftell(packfile);
  // The payload is written as a single item, so fwrite yields 1 on success.
  if (fwrite(data, size, 1, packfile) == 1) {
    return start_offset;
  }
  perror("Error writing to packfile");
  return -1;
}
301 | 
302 | /**
303 |  * Adds the file at the given path to the packfile opened in append-mode.
304 |  * Returns the offset into the packfile at which the new file is written.
305 |  */
306 | long add_file_to_packfile(char *filename, char *indexdir, FILE *packfile) {
307 |   char tmp[27];
308 |   int ret_val = -1;
309 |   sprintf(tmp, ".%s.lock", filename);
310 |   char *lock_path = add_path_parts(indexdir, tmp);
311 |   int ret = lockfile_check(lock_path, 0);
312 |   free(lock_path);
313 |   if(ret == 0){
314 |     return ret_val;
315 |   }
316 | 
317 |   char *file_path = add_path_parts(indexdir, filename);
318 |   FILE *f = fopen(file_path, "r");
319 |   if (f == NULL) {
320 |     perrorf("Could not open %s", file_path);
321 |     goto OUT1;
322 |   }
323 | 
324 |   if (remove_if_corrupted(f, file_path)) {
325 |     fclose(f);
326 |     goto OUT1;
327 |   }
328 |   char buf[BUFSIZE];
329 |   long packfile_offset = ftell(packfile);
330 | 
331 |   int read_amount;
332 |   while ((read_amount = fread(buf, 1, BUFSIZE * sizeof(char), f)) > 0) {
333 |     int write_amount = fwrite(buf, 1, read_amount, packfile);
334 |     if (write_amount != read_amount) {
335 |       perror("Error writing to packfile");
336 |     }
337 |   }
338 |   fclose(f);
339 |   if (read_amount < 0) {
340 |     perrorf("Error reading from %s", file_path);
341 |     goto OUT1;
342 |   }
343 |   ret_val = packfile_offset;
344 |   goto OUT1;
345 | 
346 |   OUT1:
347 |     free(file_path);
348 |     return ret_val;
349 | 
350 | }
351 | 
352 | /*--------------------------------------------------------------------*/
353 | 
354 | /**
355 |  * Comparison function for sorting index entries.
356 |  */
357 | int compare_index_entries(const void *a, const void *b) {
358 |   const struct index_entry *iea = (const struct index_entry *)a;
359 |   const struct index_entry *ieb = (const struct index_entry *)b;
360 |   return (iea->hash > ieb->hash) - (iea->hash < ieb->hash);
361 | }
362 | 
363 | /*--------------------------------------------------------------------*/
364 | 
365 | /**
366 |  * Counts files in the directory to be added to packfile
367 |  */
368 | int count_loose_files(char *dir_path) {
369 |   DIR *dir = opendir(dir_path);
370 |   if (dir == NULL){
371 |     perrorf("Error in opening directory: %s", dir_path);
372 |     return(-1);
373 |   }
374 |   int num_loose = 0;
375 |   struct dirent *entry;
376 |   while ((entry = readdir(dir))) {
377 |     if (strcmp(entry->d_name, PACKFILE_NAME) == 0
378 |         || strcmp(entry->d_name, PACKFILE_INDEX_NAME) == 0
379 |         || entry->d_name[0] == '.' ) {
380 |       continue;
381 |     }
382 |     num_loose++;
383 |   }
384 |   closedir(dir);
385 |   return num_loose;
386 | }
387 | 
388 | /*--------------------------------------------------------------------*/
389 | 
390 | /**
391 |  * Writes new_index to a temporary file, then renames it to the index file,
392 |  * replacing the old one atomically.
393 |  */
394 | int write_new_index(struct index_entry *new_index,
395 |                      int new_index_length, char *file_path) {
396 |   // write the new index to a tmp file
397 |   if(create_file_if_nonexistent(file_path) == -1) {
398 |     return -1;
399 |   }
400 |   FILE *file = fopen(file_path, "w");
401 |   if (file == NULL) {
402 |     perror("Error creating tempfile");
403 |     return(-1);
404 |   }
405 |   int write_amount = fwrite(new_index, new_index_length,
406 |       sizeof(struct index_entry), file);
407 |   fclose(file);
408 |   if (write_amount < 0) {
409 |     perror("Error writing tempfile");
410 |     return(-1);
411 |   }
412 |   return 0;
413 | }
414 | 
415 | /*--------------------------------------------------------------------*/
416 | 
417 | /**
418 |  * Gets hash from the saved string
419 |  */
420 | uint64_t string_to_hash(char *filename){
421 |   XXH64_canonical_t *canonical = malloc(sizeof(XXH64_canonical_t));
422 |   char *hash_str = filename;
423 |   for(int i = 0; i < 8; i++){
424 |     sscanf((hash_str+2*i), "%02hhx", &(canonical->digest[i]));
425 |   }
426 |   uint64_t ret_hash = XXH64_hashFromCanonical(canonical);
427 |   free(canonical);
428 |   return ret_hash;
429 | }
430 | 
431 | /*--------------------------------------------------------------------*/
432 | 
/**
 * Outcome of reading one loose file, as produced by read_file.
 */
struct read_file_result {
  // 0 if nothing went wrong; an errno value (> 0) when a system call failed;
  // or the negated return value of remove_if_corrupted (< 0).
  int error;
  // malloc'd copy of the file contents, or NULL when nothing was read.
  void *data;
  // Number of bytes in data; 0 when data is NULL.
  size_t length;
};
438 | 
/**
 * Argument bundle handed to read_file. read_file frees the struct itself
 * but not the strings it points to.
 */
struct read_file_args {
  // Name of the file to read, relative to indexdir.
  char *filename;
  // Directory containing the file.
  char *indexdir;
};
443 | 
444 | void *read_file(void *args) {
445 |   struct read_file_args *real_args = args;
446 |   char *filename = real_args->filename;
447 |   char *indexdir = real_args->indexdir;
448 |   free(args);
449 |   struct read_file_result *result = malloc(sizeof(struct read_file_result));
450 |   result->error = 0;
451 | 
452 |   char *path = add_path_parts(indexdir, filename);
453 |   char tmp[27];
454 |   sprintf(tmp, ".%s.lock", filename);
455 |   char *lock_path = add_path_parts(indexdir, tmp);
456 |   int ret = lockfile_check(lock_path, 0);
457 |   free(lock_path);
458 |   if(ret == 0){
459 |     goto OUT4;
460 |   }
461 | 
462 |   FILE *file = fopen(path, "r");
463 |   if (file == NULL) {
464 |     result->error = errno;
465 |     goto OUT4;
466 |   }
467 |   int corrupt = remove_if_corrupted(file, path);
468 |   if (corrupt) {
469 |     result->error = -corrupt;
470 |     goto OUT3;
471 |   }
472 |   if (fseek(file, 0, SEEK_END) != 0) {
473 |     result->error = errno;
474 |     goto OUT3;
475 |   }
476 |   long size = ftell(file);
477 |   if (size < 0) {
478 |     result->error = errno;
479 |     goto OUT3;
480 |   }
481 |   rewind(file);
482 |   void *buff = malloc(size);
483 |   if (buff == NULL) {
484 |     result->error = errno;
485 |     goto OUT3;
486 |   }
487 |   int read_amount = fread(buff, size, 1, file);
488 |   if (read_amount != 1) {
489 |     result->error = errno;
490 |     free(buff);
491 |     goto OUT3;
492 |   } else {
493 |     result->data = buff;
494 |     result->length = size;
495 |   }
496 |   fclose(file);
497 |   free(path);
498 |   return result;
499 | 
500 |   OUT3:
501 |   fclose(file);
502 |   OUT4:
503 |   free(path);
504 |   result->data = NULL;
505 |   result->length = 0;
506 |   return result;
507 | }
508 | 
509 | /**
510 |  * Reads many files in parallel, starting a separate thread per file.
511 |  */
512 | struct read_file_result *read_files_in_parallel(char **filenames, int num,
513 |     char *indexdir) {
514 |   struct read_file_result *results =
515 |     malloc(num * sizeof(struct read_file_result));
516 |   pthread_t threads[num];
517 |   int threads_created = 0;
518 |   for (int i = 0; i < num; i++) {
519 |     struct read_file_args *args = malloc(sizeof(*args));
520 |     args->filename = filenames[i];
521 |     args->indexdir = indexdir;
522 |     if (pthread_create(&threads[i], NULL, read_file, args) != 0) {
523 |       perror("Could not create thread");
524 |       break;
525 |     }
526 |     threads_created++;
527 |   }
528 |   for (int t = 0; t < threads_created; t++) {
529 |     struct read_file_result *result;
530 |     pthread_join(threads[t], (void **)&result);
531 |     results[t] = *result;
532 |     free(result);
533 |   }
534 |   if (threads_created < num) {
535 |     for (int i = 0; i < num; i++) {
536 |       free(results[i].data);
537 |     }
538 |     free(results);
539 |     return NULL;
540 |   }
541 |   return results;
542 | }
543 | 
544 | /**
545 |  * Appends all file data from results to the packfile, writing the new index
546 |  * entries to new_entries.
547 |  */
548 | int write_to_packfile(
549 |     struct read_file_result *results,
550 |     int num_results,
551 |     struct index_entry *new_entries,
552 |     char *added_file_paths[],
553 |     FILE *packfile,
554 |     char *indexdir,
555 |     char *filenames[]) {
556 |   int files_added = 0;
557 |   for (int i = 0; i < num_results; i++) {
558 |     struct read_file_result result = results[i];
559 |     if (result.length > 0) {
560 |       long offset = write_data_to_packfile(
561 |           result.data, result.length, packfile);
562 |       if (offset < 0) {
563 |         continue;
564 |       }
565 |       new_entries[files_added].hash = string_to_hash(filenames[i]);
566 |       new_entries[files_added].packfile_offset = htobe64(offset);
567 |       added_file_paths[files_added] = add_path_parts(indexdir, filenames[i]);
568 |       files_added++;
569 |     } else {
570 |       if (result.error > 0 && result.error != EACCES) {
571 |         fprintf(stderr, "Error reading file %s: %s\n", filenames[i],
572 |             strerror(result.error));
573 |       } else if (result.error == -1) {
574 |         char *path = add_path_parts(indexdir, filenames[i]);
575 |         fprintf(stderr, "File was corrupted and removed: %s", path);
576 |         free(path);
577 |       }
578 |     }
579 |   }
580 |   return files_added;
581 | }
582 | 
583 | /**
584 |  * Adds num_loose loose files to the packfile.
585 |  * Returns a pointer to index entries for the now-packed files.
586 |  */
587 | struct index_entry *add_loose_files_to_packfile(
588 |     int *num_loose, char *indexdir, char *file_paths[],
589 |     FILE *packfile, char* lock_path) {
590 |   static const int parallel_reads = 50;
591 | 
592 |   struct index_entry *new_entries = malloc(
593 |       sizeof(struct index_entry) * *num_loose);
594 |   if (new_entries == NULL){
595 |     perror("Error: Memory not allocated");
596 |     return(NULL);
597 |   }
598 |   time_t last_lockfile_touch = time(NULL);
599 | 
600 |   DIR *dir = opendir(indexdir);
601 |   if (dir == NULL){
602 |     perror("Error in opening directory");
603 |     free(new_entries);
604 |     return NULL;
605 |   }
606 | 
607 |   struct dirent *entry;
608 |   int files_added = 0;
609 |   char *filenames_buffer[parallel_reads];
610 |   int buffer_size = 0;
611 |   while (1) {
612 |     entry = readdir(dir);
613 |     if (entry != NULL) {
614 |       if (strcmp(entry->d_name, PACKFILE_NAME) == 0
615 |           || strcmp(entry->d_name, PACKFILE_INDEX_NAME) == 0
616 |           || entry->d_name[0] == '.') {
617 |         continue;
618 |       }
619 | 
620 |       filenames_buffer[buffer_size] = malloc(strlen(entry->d_name) + 1);
621 |       strcpy(filenames_buffer[buffer_size], entry->d_name);
622 |       buffer_size++;
623 |     }
624 |     time_t curr_time = time(NULL);
625 |     if (curr_time > last_lockfile_touch + 60) {
626 |       lockfile_touch(lock_path);
627 |       last_lockfile_touch = curr_time;
628 |     }
629 |     int buffer_full = buffer_size == parallel_reads;
630 |     int enough_files = entry == NULL || files_added + buffer_size == *num_loose;
631 |     if (buffer_full || enough_files) {
632 |       // read some files
633 |       struct read_file_result *results = read_files_in_parallel(
634 |           filenames_buffer, buffer_size, indexdir);
635 |       files_added += write_to_packfile(
636 |           results, buffer_size, new_entries + files_added, file_paths +
637 |           files_added, packfile, indexdir, filenames_buffer);
638 |       for (int i = 0; i < buffer_size; i++) {
639 |         free(filenames_buffer[i]);
640 |         free(results[i].data);
641 |       }
642 |       free(results);
643 |       buffer_size = 0;
644 |     }
645 |     if (enough_files) {
646 |       break;
647 |     }
648 |   }
649 |   *num_loose = files_added;
650 |   closedir(dir);
651 |   return new_entries;
652 | }
653 | 
654 | /*--------------------------------------------------------------------*/
655 | 
656 | /**
657 |  * Merges the two sorted arrays arr1 and arr2 and stores the result in result.
658 |  */
659 | void two_finger_merge(struct index_entry *arr1, int arr1size,
660 |                       struct index_entry *arr2, int arr2size,
661 |                       struct index_entry *result) {
662 |   int i1 = 0;
663 |   int i2 = 0;
664 |   for (int r = 0; r < arr1size + arr2size; r++) {
665 |     if (i1 < arr1size
666 |         && (i2 >= arr2size || arr1[i1].hash < arr2[i2].hash)) {
667 |       result[r] = arr1[i1];
668 |       i1++;
669 |     } else {
670 |       result[r] = arr2[i2];
671 |       i2++;
672 |     }
673 |   }
674 | }
675 | 
676 | /*--------------------------------------------------------------------*/
677 | 
678 | /**
679 |  * Adds all of the new index entries to the packfile index.
680 |  * Re-sorts as needed.
681 |  */
682 | int add_entries_to_index(struct index_entry *new_entries,
683 |     int num_new_entries, char *indexdir) {
684 |   // read old index into new index buffer
685 |   int ret_val = -1;
686 |   char *packfile_index_path = add_path_parts(
687 |       indexdir, PACKFILE_INDEX_NAME);
688 |   create_file_if_nonexistent(packfile_index_path);
689 |   FILE *packfile_index = fopen(packfile_index_path, "r");
690 |   if (packfile_index == NULL) {
691 |     perror("Error opening packfile index");
692 |     free(packfile_index_path);
693 |     return ret_val;
694 |   }
695 | 
696 |   size_t num_existing = get_num_index_entries(packfile_index);
697 |   size_t new_index_length = num_existing + num_new_entries;
698 |   struct index_entry *new_index = malloc(new_index_length *
699 |                                          sizeof(struct index_entry));
700 |   struct index_entry *old_index = malloc(num_existing *
701 |                                          sizeof(struct index_entry));
702 |   int read_amount = fread(old_index, sizeof(struct index_entry),
703 |       num_existing, packfile_index);
704 |   fclose(packfile_index);
705 | 
706 |   if (read_amount < 0) {
707 |     perror("Error reading index file");
708 |     goto OUT1;
709 |   }
710 |   assert(read_amount == num_existing);
711 |   
712 |   qsort(new_entries, num_new_entries, sizeof(struct index_entry),
713 |       compare_index_entries);
714 |   two_finger_merge(old_index, num_existing, new_entries, num_new_entries,
715 |                    new_index);
716 |   char *tmpfile_path = add_path_parts(indexdir,
717 |                                       TEMP_PACKFILE_INDEX_NAME);
718 |   write_new_index(new_index, new_index_length, tmpfile_path);
719 |   rename(tmpfile_path, packfile_index_path);
720 | 
721 |   free(tmpfile_path);
722 |   ret_val = 0;
723 |   goto OUT1;
724 | 
725 |   OUT1:
726 |     free(packfile_index_path);
727 |     free(new_index);
728 |     free(old_index);
729 |     return ret_val;
730 | }
731 | 
732 | /*--------------------------------------------------------------------*/
733 | 
/**
 * Work description for one deletion thread (see delete_files_thread_work).
 */
struct delete_files_thread_args {
  // First of num_to_delete consecutive path strings to remove.
  char **file_paths;
  // Number of paths, starting at file_paths, that this thread removes.
  int num_to_delete;
};
738 | 
739 | void *delete_files_thread_work(void *args) {
740 |   struct delete_files_thread_args *actual_args = args;
741 |   for (int i = 0; i < actual_args->num_to_delete; i++) {
742 |     remove(actual_args->file_paths[i]);
743 |   }
744 |   free(actual_args);
745 |   return 0;
746 | }
747 | 
748 | /**
749 |  * Delete the files that were in directory but now in packfile.
750 |  *
751 |  * Files are deleted in parallel across 50 threads.
752 |  */
753 | void delete_loose_files(char *file_paths[], int num_loose){
754 |   if (num_loose == 0) {
755 |     return;
756 |   }
757 |   int num_threads = num_loose > 50 ? 50 : num_loose;
758 |   pthread_t threads[num_threads];
759 |   int base_files_per_thread = num_loose / num_threads;
760 |   int deletes_delegated = 0;
761 |   int threads_created = 0;
762 |   while (deletes_delegated < num_loose) {
763 |     struct delete_files_thread_args *args = malloc(sizeof(*args));
764 |     int num_to_delete;
765 |     if (threads_created < num_loose % num_threads) {
766 |       num_to_delete = base_files_per_thread + 1;
767 |     } else {
768 |       num_to_delete = base_files_per_thread;
769 |     }
770 |     args->file_paths = &file_paths[deletes_delegated];
771 |     args->num_to_delete = num_to_delete;
772 |     if (pthread_create(&threads[threads_created], NULL, delete_files_thread_work, args)) {
773 |       free(args);
774 |       perror("Error starting deletion thread");
775 |       goto OUT2;
776 |     }
777 |     deletes_delegated += num_to_delete;
778 |     threads_created++;
779 |   }
780 | 
781 |   OUT2:
782 |   for (int t = 0; t < threads_created; t++) {
783 |     pthread_join(threads[t], NULL);
784 |   }
785 | }
786 | 
787 | /*--------------------------------------------------------------------*/
788 | 
789 | /**
790 |  * Scans the index directory for files not in the packfile.
791 |  * Each found file is read, inserted into the packfile, and deleted.
792 |  * The packfile index is updated as well.
793 |  */
794 | int pack_loose_files_in_subdir(char *index_subdir) {
795 |   // add all the loose files to the packfile and
796 |   // create index entries for them
797 |   int ret_val = -1;
798 |   mode_t old_umask = umask(0);
799 | 
800 |   char *packfile_path = add_path_parts(index_subdir, PACKFILE_NAME);
801 |   create_file_if_nonexistent(packfile_path);
802 | 
803 |   char *packfile_lock = add_path_parts(index_subdir, PACKFILE_LOCK_NAME);
804 |   int ret = lockfile_create(packfile_lock, 0, 0);
805 |   if(ret != 0){
806 |     free(packfile_lock);
807 |     free(packfile_path);
808 |     return(ret_val);
809 |   }
810 | 
811 |   FILE *packfile = fopen(packfile_path, "a");
812 |   free(packfile_path);
813 |   if (packfile == NULL) {
814 |     lockfile_remove(packfile_lock);
815 |     free(packfile_lock);
816 |     perror("Error opening packfile");
817 |     return(ret_val);
818 |   }
819 | 
820 |   // figure our how many loose files there are
821 |   int num_loose = count_loose_files(index_subdir);
822 |   char *file_paths[num_loose];
823 | 
824 |   if (num_loose == 0) {
825 |     goto OUT1;
826 |   }
827 |   struct index_entry *new_entries = add_loose_files_to_packfile(
828 |       &num_loose, index_subdir, file_paths, packfile, packfile_lock);
829 | 
830 |   if (new_entries == NULL){
831 |     goto OUT1;
832 |   } else if (num_loose == 0) {
833 |     free(new_entries);
834 |     goto OUT1;
835 |   }
836 | 
837 |   fflush(packfile);
838 |   int fd = fileno(packfile);
839 |   fsync(fd);
840 | 
841 |   add_entries_to_index(new_entries, num_loose, index_subdir);
842 |   free(new_entries);
843 |   delete_loose_files(file_paths, num_loose);
844 |   for (int i = 0; i < num_loose; i++) {
845 |     free(file_paths[i]);
846 |   }
847 | 
848 |   ret_val = 0;
849 |   goto OUT1;
850 | 
851 |   OUT1:
852 |     fclose(packfile);
853 |     lockfile_remove(packfile_lock);
854 |     free(packfile_lock);
855 |     umask(old_umask);
856 |     return(ret_val);
857 | }
858 | 
/**
 * Runs pack_loose_files_in_subdir on every directory directly inside
 * indexdir whose name does not start with '.'.
 *
 * Returns -1 if indexdir cannot be opened, 0 otherwise; the results of the
 * individual subdirectories are not reflected in the return value.
 */
int pack_loose_files(char *indexdir) {
  DIR *root = opendir(indexdir);
  if (root == NULL) {
    perrorf("Error in opening directory: %s", indexdir);
    return(-1);
  }
  for (;;) {
    struct dirent *child = readdir(root);
    if (child == NULL) {
      break;
    }
    if (child->d_name[0] != '.') {
      char *child_path = add_path_parts(indexdir, child->d_name);
      if (is_dir(child_path)) {
        pack_loose_files_in_subdir(child_path);
      }
      free(child_path);
    }
  }
  closedir(root);

  return 0;
}
880 | 


--------------------------------------------------------------------------------
/bitmap/src/packfile.h:
--------------------------------------------------------------------------------
 1 | #ifndef PACKFILE_INCLUDED
 2 | #define PACKFILE_INCLUDED
 3 | 
 4 | #include <stdio.h>
 5 | #include <stdint.h>
 6 | 
 7 | /*--------------------------------------------------------------------*/
 8 | 
/* File name of the packfile inside an index directory. */
#define PACKFILE_NAME "packfile"
/* File name of the packfile index inside an index directory. */
#define PACKFILE_INDEX_NAME "packfile_index"
/* Temporary file the new index is written to before it is renamed over
 * PACKFILE_INDEX_NAME. */
#define TEMP_PACKFILE_INDEX_NAME ".packfile_index.tmp"
/* Lock file taken while loose files are being packed. */
#define PACKFILE_LOCK_NAME ".packfile.lock"
/* Corruption status that remove_if_corrupted does not delete the file for
 * (it returns 2 instead of removing the file). */
#define EMPTY_FILE 1
14 | 
15 | /*--------------------------------------------------------------------*/
16 | 
/* Checks a loose file for corruption.
 * NOTE(review): presumably returns 0 for an intact file and a nonzero status
 * such as EMPTY_FILE otherwise -- confirm against the definition. */
int is_corrupted(FILE* loosefile);

/* Returns the malloc'd, decompressed bitmap stored in the packfile for the
 * file with the given name and mtime, or NULL if there is none.
 * `store` is the index directory containing the packfile. */
uint8_t *read_from_packfile(char *filename, int64_t mtime, char *store);

/* Packs the loose files of every subdirectory of indexdir.
 * Returns -1 if indexdir cannot be opened, 0 otherwise. */
int pack_loose_files(char *indexdir);

/* Moves the loose files of one index subdirectory into its packfile and
 * updates the packfile index. Returns 0 on success, -1 otherwise. */
int pack_loose_files_in_subdir(char *index_subdir);

/* Returns 1 after removing a corrupted file, 2 for a file whose status is
 * EMPTY_FILE (left in place), and 0 otherwise.
 * NOTE(review): the condition guarding the nonzero cases is presumably
 * is_corrupted(file) -- confirm against the definition. */
int remove_if_corrupted(FILE *file, char *file_path);
26 | 
27 | /*--------------------------------------------------------------------*/
28 | 
29 | #endif
30 | 


--------------------------------------------------------------------------------
/bitmap/src/util.c:
--------------------------------------------------------------------------------
#include <sys/stat.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <unistd.h>
#include <fcntl.h>
#include <cpuid.h>
#include <stdarg.h>
#include <stdint.h>
#include <time.h>
 12 | 
 13 | #include "util.h"
 14 | 
 15 | /*--------------------------------------------------------------------*/
 16 | 
 17 | /**
 18 |  * Allocates a new string consisting of dir + '/' + filename.
 19 |  * The combined strings must not exceed PATH_MAX-1 in length.
 20 |  */
 21 | char *add_path_parts(char *dir, char *filename) {
 22 |   char *path = malloc(PATH_MAX);
 23 |   strcpy(path, dir);
 24 |   strcat(path, "/");
 25 |   strcat(path, filename);
 26 |   return path;
 27 | }
 28 | 
 29 | /*--------------------------------------------------------------------*/
 30 | 
 31 | /**
 32 |  * Returns whether we can read from and write to the given directory.
 33 |  */
 34 | int is_directory_readwritable(char *path) {
 35 |   return (access(path, R_OK) == 0
 36 |       && access(path, W_OK) == 0);
 37 | }
 38 | 
 39 | /*--------------------------------------------------------------------*/
 40 | 
 41 | /**
 42 |  * Returns directory where bitmaps are currently stored, by checking
 43 |  * the first available directory from the list.
 44 |  */
 45 | char *get_index_directory() {
 46 |   static char *indexdir = NULL;
 47 |   if (indexdir != NULL) {
 48 |     return indexdir;
 49 |   }
 50 |   if (is_directory_readwritable("/4gram/")) {
 51 |     return (indexdir = "/4gram");
 52 |   }
 53 |   char *home_cache_dir = add_path_parts(getenv("HOME"), ".cache");
 54 |   char *home_4gram_dir = add_path_parts(home_cache_dir, "4gram");
 55 |   mkdir(home_cache_dir, 0700);
 56 |   mkdir(home_4gram_dir, 0777);
 57 |   if (is_directory_readwritable(home_4gram_dir)) {
 58 |     return (indexdir = home_4gram_dir);
 59 |   }
 60 |   perror("Could not find readwritable directory to cache 4grams\n");
 61 |   return(NULL);
 62 | }
 63 | 
 64 | /*--------------------------------------------------------------------*/
 65 | 
 66 | /**
 67 |  * Returns what index subdirectory we should store the index for a file with
 68 |  * the given timestamp.
 69 |  *
 70 |  * These subdirectories are of the form "indexdir/YYYY_MM"
 71 |  */
 72 | char *get_index_subdirectory(char *indexdir, int64_t timestamp) {
 73 |   struct tm *gmt = gmtime(&timestamp);
 74 |   char date_string[8];
 75 |   strftime(date_string, sizeof(date_string), "%Y_%m", gmt);
 76 |   char *index_subdir = add_path_parts(indexdir, date_string);
 77 |   mkdir(index_subdir, 0777);
 78 |   return index_subdir;
 79 | }
 80 | 
 81 | /*--------------------------------------------------------------------*/
 82 | 
 83 | /**
 84 |  * Determines at runtime whether our CPU supports BMI2 instructions.
 85 |  */
 86 | int supports_bmi2() {
 87 |   static int supports_bmi2_cache = -1;
 88 |   if (supports_bmi2_cache != -1) {
 89 |     return supports_bmi2_cache;
 90 |   }
 91 |   unsigned int level = 0;
 92 |   unsigned int eax = 1;
 93 |   unsigned int ebx, ecx, edx;
 94 |   __get_cpuid(level, &eax, &ebx, &ecx, &edx);
 95 |   supports_bmi2_cache = (ebx >> 8) & 1;
 96 |   return supports_bmi2_cache;
 97 | }
 98 | 
 99 | /*--------------------------------------------------------------------*/
100 | 
/**
 * Frees the data stored in the given array.
 *
 * Only arr.data is released; the struct itself is passed by value, so the
 * caller's copy keeps its (now dangling) data pointer.
 */
void free_intarray(struct intarray arr) {
  free(arr.data);
}
107 | 
108 | /**
109 |  * Frees the data stored by the given array array.
110 |  *
111 |  * Recursively frees all sub-arrays.
112 |  */
113 | void free_intarrayarray(struct intarrayarray arr) {
114 |   for (int i = 0; i < arr.num_rows; i++) {
115 |     free_intarray(arr.rows[i]);
116 |   }
117 |   free(arr.rows);
118 | }
119 | 
/**
 * Like perror, but uses a format string.
 *
 * Prints the formatted message, then ": ", then the description of the
 * errno value that was current when perrorf was called, to stderr.
 */
void perrorf(char *fmt, ...) {
  // The stdio calls below may modify errno, so capture it first.
  int saved_errno = errno;
  va_list args;
  va_start(args, fmt);
  vfprintf(stderr, fmt, args);
  va_end(args);
  fprintf(stderr, ": ");
  errno = saved_errno;
  perror("");
}
131 | 
/**
 * Returns the mtime of the file entry at the given path, or -1 if the path
 * cannot be stat'ed (errno is left as set by stat).
 */
int64_t get_mtime(char *path) {
  struct stat s;
  // Without this check a failed stat would return an uninitialized field.
  if (stat(path, &s) != 0) {
    return -1;
  }
  return s.st_mtime;
}
140 | 
/**
 * Returns whether path points to a directory or not.
 *
 * Symbolic links are followed. Returns 0 when the path cannot be stat'ed
 * (for example because it does not exist).
 */
int is_dir(char *path) {
  struct stat s;
  // Without this check a failed stat would leave s.st_mode uninitialized.
  if (stat(path, &s) != 0) {
    return 0;
  }
  return S_ISDIR(s.st_mode);
}
149 | 


--------------------------------------------------------------------------------
/bitmap/src/util.h:
--------------------------------------------------------------------------------
 1 | #ifndef UTIL_INCLUDED
 2 | #define UTIL_INCLUDED
 3 | 
 4 | #include <stdint.h>
 5 | 
 6 | /*--------------------------------------------------------------------*/
 7 | 
/* Number of characters per n-gram and number of bits kept per character. */
#define NGRAM_CHARS 5
#define NGRAM_CHAR_BITS 4
/* Number of distinct n-gram values: 2^(5*4) = 1048576. */
#define POSSIBLE_NGRAMS ((1u) << (NGRAM_CHARS * NGRAM_CHAR_BITS))
/* Bytes in a bitmap with one bit per possible n-gram: 131072. */
#define SIZEOF_BITMAP (POSSIBLE_NGRAMS / 8)

/* Size in bytes of general-purpose I/O buffers. */
#define BUFSIZE 2048
/* Mask selecting the low NGRAM_CHAR_BITS bits. */
#define CHAR_MASK ((1 << NGRAM_CHAR_BITS) - 1)
/* Mask selecting all bits of an n-gram value. */
#define NGRAM_MASK (POSSIBLE_NGRAMS - 1)
/* NGRAM_MASK with the lowest character's bits cleared. */
#define NGRAM_SHIFT_LEFT_MASK (NGRAM_MASK - CHAR_MASK)
/* Seed passed to XXH64 when hashing file names. */
#define HASH_SEED 0xfe5000 //purestorage color

/* NOTE(review): presumably a status code for a truncated gzip stream; its
 * use is not visible from this header -- confirm. */
#define GZ_TRUNCATED 1
20 | 
21 | /*--------------------------------------------------------------------*/
22 | 
/* Allocates a new string consisting of dir + '/' + filename. */
char *add_path_parts(char *dir, char *filename);

/* NOTE(review): util.c defines get_index_directory() but no function of
 * this name -- confirm whether this declaration is stale. */
char *get_bitmap_store_directory();

/* Returns whether the CPU supports BMI2 instructions (cached after the
 * first call). */
int supports_bmi2();

/* An array of ints together with its length. */
struct intarray {
  int length;
  int *data;
};

/* Frees arr.data. */
void free_intarray(struct intarray arr);

/* An array of num_rows int arrays. */
struct intarrayarray {
  int num_rows;
  struct intarray *rows;
};

/* Frees the data of every row, then the row table. */
void free_intarrayarray(struct intarrayarray arr);

/* Like perror, but the message is a printf-style format string. */
void perrorf(char *fmt, ...)
__attribute__((format (printf, 1, 2)));

/* Returns the mtime of the file entry at the given path. */
int64_t get_mtime(char *path);

/* NOTE(review): no definition of this function is visible in util.c --
 * confirm where it is implemented. */
char *get_lock_path(char *directory, char *filename);

/* Returns (and creates if needed) the "indexdir/YYYY_MM" subdirectory for
 * the given timestamp, as a newly allocated string. */
char *get_index_subdirectory(char *indexdir, int64_t timestamp);

/* Returns whether path points to a directory. */
int is_dir(char *path);
53 | 
54 | /*--------------------------------------------------------------------*/
55 | 
56 | #endif
57 | 


--------------------------------------------------------------------------------
/build_deb.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env sh
# Builds the Debian package inside a throw-away Docker image.
#
# The image is built from docker_build/, run with the current directory
# mounted at /build, and removed again afterwards. The script exits with the
# status of the failing step, or 0 if the build succeeded.

# Without an image there is nothing to run or clean up.
docker build -t 4grep-docker docker_build || exit 1

# "$(pwd)" is quoted so checkouts in paths containing spaces still mount.
docker run --rm -v "$(pwd)":/build \
	-e COMMIT_COUNT="$(git rev-list HEAD --count)" \
	-e COMMIT_HASH="$(git rev-parse HEAD)" \
	4grep-docker
status=$?

# Remove the image even if the package build failed.
docker rmi 4grep-docker
exit "$status"
8 | 


--------------------------------------------------------------------------------
/debian/4grep.links:
--------------------------------------------------------------------------------
1 | /usr/lib/4grep.so /usr/lib/lib4grep.so
2 | /usr/lib/lib4grep.so /usr/lib/lib4grep.so.0
3 | 


--------------------------------------------------------------------------------
/debian/changelog:
--------------------------------------------------------------------------------
1 | 4grep (0.0.1) trusty; urgency=low
2 | 
3 |   * Initial release.
4 | 
5 |  -- MAINTAINER <mpfeiffer@purestorage.com>  Wed, 09 Aug 2017 17:11:46 -0600
6 | 


--------------------------------------------------------------------------------
/debian/compat:
--------------------------------------------------------------------------------
1 | 9
2 | 


--------------------------------------------------------------------------------
/debian/control:
--------------------------------------------------------------------------------
 1 | Source: 4grep
 2 | Maintainer: Matthew Pfeiffer <mpfeiffer@purestorage.com>
 3 | Build-Depends: debhelper (>=8.0.0), gcc (>=4.9.0), liblockfile-dev, zlib1g-dev
 4 | Standards-Version: 3.9.7
 5 | Section: utils
 6 | 
 7 | Package: 4grep
 8 | Priority: extra
 9 | Architecture: any
10 | Depends: python, liblockfile1, zlib1g, ${shlibs:Depends}, ${misc:Depends}
11 | Description: like tgrep, but better
12 |  Greps over files with a persistent index and progress bar.
13 | 


--------------------------------------------------------------------------------
/debian/copyright:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/purestorage/4grep/1b721ea3ab1f284a4b41083b34f1540a90b76f6a/debian/copyright


--------------------------------------------------------------------------------
/debian/postinst:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | # postinst script for 4grep
 3 | #
 4 | # see: dh_installdeb(1)
 5 | 
 6 | set -e
 7 | 
 8 | # summary of how this script can be called:
 9 | #        * <postinst> `configure' <most-recently-configured-version>
10 | #        * <old-postinst> `abort-upgrade' <new version>
11 | #        * <conflictor's-postinst> `abort-remove' `in-favour' <package>
12 | #          <new-version>
13 | #        * <postinst> `abort-remove'
14 | #        * <deconfigured's-postinst> `abort-deconfigure' `in-favour'
15 | #          <failed-install-package> <version> `removing'
16 | #          <conflicting-package> <version>
17 | # for details, see https://www.debian.org/doc/debian-policy/ or
18 | # the debian-policy package
19 | 
20 | 
case "$1" in
    configure)
        # Refresh the dynamic linker cache once the package files are in
        # place (the package installs shared-library links under /usr/lib).
        ldconfig
    ;;

    abort-upgrade|abort-remove|abort-deconfigure)
        # Nothing to undo for the abort cases.
    ;;

    *)
        echo "postinst called with unknown argument \`$1'" >&2
        exit 1
    ;;
esac
34 | 
35 | # dh_installdeb will replace this with shell code automatically
36 | # generated by other debhelper scripts.
37 | 
38 | #DEBHELPER#
39 | 
40 | exit 0
41 | 


--------------------------------------------------------------------------------
/debian/rules:
--------------------------------------------------------------------------------
1 | #!/usr/bin/make -f
2 | %:
3 | 	dh $@
4 | 


--------------------------------------------------------------------------------
/debian/source/format:
--------------------------------------------------------------------------------
1 | 3.0 (native)
2 | 


--------------------------------------------------------------------------------
/debian/source/options:
--------------------------------------------------------------------------------
1 | tar-ignore = "4grepc"
2 | 


--------------------------------------------------------------------------------
/description:
--------------------------------------------------------------------------------
1 | 4grep


--------------------------------------------------------------------------------
/disp_bitmap.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | from __future__ import print_function
  3 | from itertools import izip
  4 | from Queue import Queue
  5 | from PIL import Image
  6 | 
  7 | import multiprocessing
  8 | import subprocess
  9 | import argparse
 10 | import sys
 11 | import os
 12 | 
 13 | HELP = '''
 14 | 	This is how you use it.
 15 | 	'''
 16 | 
 17 | def write_bitmaps():
 18 | 	for filename in os.listdir("/Users/user/Desktop/logs/upstart"):
 19 | 	    if filename.endswith(".gz"):
 20 | 	    	proc = subprocess.Popen(["bitmap/exec/generate_bitmap"], stdout=subprocess.PIPE, stdin=subprocess.PIPE)
 21 | 	    	ret, stderr = proc.communicate(filename)
 22 | 	    	file = open(filename[0:-3]+'.bin',"w")
 23 | 	    	file.write(ret)
 24 | 	    	file.close()
 25 | 
 26 | def ratio(im):
 27 | 	pixels = im.getdata()
 28 | 	threshold = 100
 29 | 	count = 0
 30 | 	for pixel in pixels:
 31 | 		if pixel > threshold:
 32 | 			count += 1
 33 | 	n = len(pixels)
 34 | 	print('Percentage:{:.2f} Black:{} Size:{}'.format(100.0*count/n, count, n))
 35 | 
 36 | 
 37 | class FinalArray(object):
 38 | 	def __init__(self):
 39 | 		self.bytelist = bytearray(131072)
 40 | 
 41 | def byte_or(bytearray_list, i):
 42 | 	a = 0
 43 | 	for bytearray in bytearray_list:
 44 | 		a = a | bytearray[i]
 45 | 	return (a,i)
 46 | 
 47 | def update_final(result, final):
 48 | 	a , i = result
 49 | 	print(i)
 50 | 	final.bytelist[i] = a
 51 | 
 52 | def combine_bitmaps(bytearray_list):
 53 | 	l = len(bytearray_list[0])
 54 | 	print(l)
 55 | 	final = FinalArray()
 56 | 	for i in range(l):
 57 | 		r = byte_or(bytearray_list, i)
 58 | 		update_final(r, final)
 59 | 	return final
 60 | 
 61 | def get_byte_list():
 62 | 	results = []
 63 | 	for filename in os.listdir("/Users/user/Desktop/logs/bitmaps"):
 64 | 		if filename.endswith(".bin"):
 65 | 			bin_file_tmp = open('../Desktop/logs/bitmaps/' + filename, 'rb')
 66 | 			results.append(bytearray(bin_file_tmp.read()))
 67 | 	return results
 68 | 
 69 | # a = get_byte_list()
 70 | # b = combine_bitmaps(a)
 71 | # im = Image.frombytes("1", (1024, 1024), str(b.bytelist))
 72 | # im.show()
 73 | 
 74 | class Progress(object):
 75 | 	def __init__(self):
 76 | 		self.init = 0
 77 | 		self.curr = 0
 78 | 
 79 | def print_progress(progress):
 80 | 	perc = 100-((progress.curr-1)*100.0/(progress.init-1))
 81 | 	print('>>{:.1f}%\033[K\033[F'.format(perc), file=sys.stderr)
 82 | 
 83 | def start():
 84 | 	bitmap_queue = Queue()
 85 | 	l = 131072
 86 | 	for filename in os.listdir("/Users/user/Desktop/logs/bitmaps"):
 87 | 		if filename.endswith(".bin"):
 88 | 			bin_file_tmp = open('../Desktop/logs/bitmaps/' + filename, 'rb')
 89 | 			ba = bytearray(bin_file_tmp.read())
 90 | 			bitmap_queue.put(ba)
 91 | 
 92 | 	prog = Progress()
 93 | 	prog.init = bitmap_queue.qsize()
 94 | 	prog.curr = bitmap_queue.qsize()
 95 | 
 96 | 	while prog.curr > 1:
 97 | 		print_progress(prog)
 98 | 		a = bitmap_queue.get()
 99 | 		b = bitmap_queue.get()
100 | 		c = bytearray(l)
101 | 
102 | 		for i in range(l):
103 | 			c[i] = a[i] | b[i]
104 | 		bitmap_queue.put(c)
105 | 		prog.curr += -1
106 | 
107 | 	final = bitmap_queue.get()
108 | 	im = Image.frombytes("1", (1024, 1024), str(final))
109 | 	im.show()
110 | 	ratio(im)
111 | 
class stdin_iter:
	"""Iterator over the non-blank lines of standard input.

	Each item is a line with surrounding whitespace stripped. Iteration ends
	at end of input; blank lines are skipped rather than treated as the end.
	"""

	def __init__(self):
		pass

	def __iter__(self):
		return self

	def next(self):
		"""Return the next non-blank stripped line, or raise StopIteration."""
		while True:
			line = sys.stdin.readline()
			# readline() returns '' only at end of input; a blank line
			# still carries its newline.
			if not line:
				raise StopIteration
			ret = line.strip()
			if ret:
				return ret

	# Python 3 spelling of the iterator protocol.
	__next__ = next
124 | 
def main():
	"""Parse the command line, then build and display the combined bitmap.

	NOTE(review): ``filelist`` (the positional FILE arguments, or a
	``stdin_iter`` when none are given) is computed but never used;
	``start()`` is called without it.
	"""
	parser = argparse.ArgumentParser("disp_bitmap", usage=HELP, add_help=False)
	parser.add_argument('files', metavar='FILE', type=str, nargs='*')
	# parse_known_args() tolerates unrecognised options; they land in
	# ``options`` and are ignored.
	args, options = parser.parse_known_args()
	filelist = args.files
	if not filelist:
		filelist = stdin_iter()
	start()
133 | 
if __name__ == "__main__":
	# errno is needed by the handler below and is not imported at file level.
	import errno
	try:
		main()
	except IOError as e:
		# A closed output pipe (e.g. piping into ``head``) is a normal way
		# to stop; any other I/O error is a real failure and must surface.
		if e.errno != errno.EPIPE:
			raise
	except KeyboardInterrupt:
		pass
142 | 


--------------------------------------------------------------------------------
/docker_build/Dockerfile:
--------------------------------------------------------------------------------
# Build environment for the 4grep Debian package.
FROM ubuntu:xenial

# Keep apt from prompting during the image build.
ENV DEBIAN_FRONTEND=noninteractive

# Directory the source checkout is expected at (see WORKDIR below).
RUN mkdir /build

# Compiler toolchain, library headers, git, Python and Debian packaging tools.
RUN apt-get update && apt-get install -y --force-yes \
    gcc-4.8 \
    libc6 \
    build-essential \
    liblockfile-dev \
    zlib1g-dev \
    git \
    python-pip \
    python-dev \
    devscripts \
    debhelper

WORKDIR "/build"
# On container start: fetch submodules, run the tests, then build the .deb.
CMD git submodule init && git submodule update && make test && sh docker_build/docker_build_deb.sh
21 | 


--------------------------------------------------------------------------------
/docker_build/docker_build_deb.sh:
--------------------------------------------------------------------------------
# Build the .deb from a scratch copy of the tree.
# Expects COMMIT_COUNT and COMMIT_HASH in the environment.
mkdir build
# NOTE: the glob also matches the freshly created build/ directory itself.
cp -r * build
# Stop if the scratch directory is unavailable; otherwise the packaging
# commands below would run in, and modify, the source tree itself.
cd build || exit 1
make
# Add a changelog entry versioned by commit count, with the hash as its text.
dch -v "1.0.0-$COMMIT_COUNT" "$COMMIT_HASH"
debuild -i -I -us -uc -b
cd ..
rm -rf build
9 | 


--------------------------------------------------------------------------------
/img/example.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/purestorage/4grep/1b721ea3ab1f284a4b41083b34f1540a90b76f6a/img/example.gif


--------------------------------------------------------------------------------
/img/zgrepvs4grep.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/purestorage/4grep/1b721ea3ab1f284a4b41083b34f1540a90b76f6a/img/zgrepvs4grep.png


--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function
  2 | 
  3 | import unittest
  4 | import tempfile
  5 | import os
  6 | import ctypes
  7 | import imp
  8 | import shutil
  9 | import subprocess
 10 | import sys
 11 | 
 12 | TGREP_DIR = os.path.dirname(os.path.realpath(__file__))
 13 | TGREP_FILE = os.path.join(TGREP_DIR, '4grep')
 14 | 
 15 | tgrep = imp.load_source('4grep', TGREP_FILE)
 16 | 
 17 | TRUNC = 0
 18 | MTCH = 1
 19 | NO_MTCH = 2
 20 | 
 21 | class TestFiltering(unittest.TestCase):
 22 | 	def setUp(self):
 23 | 		self.tempdir = tempfile.mkdtemp()
 24 | 		self.tempindex = tempfile.mkdtemp()
 25 | 
 26 | 	def tearDown(self):
 27 | 		shutil.rmtree(self.tempdir)
 28 | 		shutil.rmtree(self.tempindex)
 29 | 
 30 | 	def test_filter(self):
 31 | 		index = tgrep.StringIndex([[str(10 ** tgrep.NGRAM_CHARS)]])
 32 | 		c_index = index.get_index_struct()
 33 | 		for i in range(10):
 34 | 			name = os.path.join(self.tempdir, '{}.txt'.format(i))
 35 | 			f = open(name, 'w')
 36 | 			f.write(str(i * 10 ** tgrep.NGRAM_CHARS))
 37 | 			f.close()
 38 | 
 39 | 			c_filename = ctypes.c_char_p(name)
 40 | 			# first run through: no bitmaps, 2nd bit should be set
 41 | 			ret = tgrep.start_filter(c_index, c_filename, self.tempindex)
 42 | 			if i == 1:
 43 | 				self.assertEqual(ret, 3)
 44 | 			else:
 45 | 				self.assertEqual(ret, 4)
 46 | 			# 2nd run through: all bitmaps should be cached, 2nd bit unset
 47 | 			ret = tgrep.start_filter(c_index, c_filename, self.tempindex)
 48 | 			if i == 1:
 49 | 				self.assertEqual(ret, 1)
 50 | 			else:
 51 | 				self.assertEqual(ret, 2)
 52 | 
 53 | 	def test_filter_deletedfiles(self):
 54 | 		index = tgrep.StringIndex([[str(10 ** tgrep.NGRAM_CHARS)]])
 55 | 		c_index = index.get_index_struct()
 56 | 		for i in range(10):
 57 | 			name = os.path.join(self.tempdir, '{}.txt'.format(i))
 58 | 			c_filename = ctypes.c_char_p(name)
 59 | 			# no files exist, so should filter files out
 60 | 			ret = tgrep.start_filter(c_index, c_filename, self.tempindex)
 61 | 			self.assertEqual(ret, -1)
 62 | 
 63 | 	def test_filter_modifiedfiles(self):
 64 | 		index = tgrep.StringIndex([[str(10 ** tgrep.NGRAM_CHARS)]])
 65 | 		c_index = index.get_index_struct()
 66 | 		# write garbage to each file with an old modification time
 67 | 		for i in range(10):
 68 | 			name = os.path.join(self.tempdir, '{}.txt'.format(i))
 69 | 			f = open(name, 'w')
 70 | 			f.write(str(i * 9 ** tgrep.NGRAM_CHARS))
 71 | 			f.close()
 72 | 			os.utime(name, (100, 100))
 73 | 		# make sure nothing is found when we search
 74 | 		for i in range(10):
 75 | 			name = os.path.join(self.tempdir, '{}.txt'.format(i))
 76 | 			c_filename = ctypes.c_char_p(name)
 77 | 			ret = tgrep.start_filter(c_index, c_filename, self.tempindex)
 78 | 			self.assertEqual(ret, 4)
 79 | 		# modify each file with mtime=real, current time
 80 | 		for i in range(10):
 81 | 			name = os.path.join(self.tempdir, '{}.txt'.format(i))
 82 | 			f = open(name, 'w')
 83 | 			f.write(str(i * 10 ** tgrep.NGRAM_CHARS))
 84 | 			f.close()
 85 | 		# the query should now match file 1.
 86 | 		for i in range(10):
 87 | 			name = os.path.join(self.tempdir, '{}.txt'.format(i))
 88 | 			c_filename = ctypes.c_char_p(name)
 89 | 			ret = tgrep.start_filter(c_index, c_filename, self.tempindex)
 90 | 			if i == 1:
 91 | 				self.assertEqual(ret, 3)
 92 | 			else:
 93 | 				self.assertEqual(ret, 4)
 94 | 
class TestIndexAutodetection(unittest.TestCase):
	"""Check which index ``tgrep.get_index_from_regex`` derives from a regex."""

	def test_parsable_chars(self):
		# '.', '+' and '.*' between two literals each give one row holding
		# both literals.
		self.assertEqual(
			tgrep.get_index_from_regex('12345.54321'),
			tgrep.StringIndex([['12345', '54321']]))
		self.assertEqual(
			tgrep.get_index_from_regex('12345+54321'),
			tgrep.StringIndex([['12345', '54321']]))
		self.assertEqual(
			tgrep.get_index_from_regex('12345.*54321'),
			tgrep.StringIndex([['12345', '54321']]))

	def test_short(self):
		# Every literal run here is at most 4 characters long; all cases are
		# expected to produce the empty index.
		self.assertEqual(
			tgrep.get_index_from_regex('1234'),
			tgrep.empty_index())
		self.assertEqual(
			tgrep.get_index_from_regex(''),
			tgrep.empty_index())
		self.assertEqual(
			tgrep.get_index_from_regex('1234|4321'),
			tgrep.empty_index())
		self.assertEqual(
			tgrep.get_index_from_regex('1234|4321.*4321'),
			tgrep.empty_index())

	def test_regex_or(self):
		# Each '|' alternative becomes its own row of the index.
		self.assertEqual(
			tgrep.get_index_from_regex('one111|two22|three'),
			tgrep.StringIndex([['one111'], ['two22'], ['three']]))
		self.assertTrue(
			tgrep.get_index_from_regex('one***111|two22|three').empty())
		self.assertEqual(
			tgrep.get_index_from_regex('12345.54321|two22|three'),
			tgrep.StringIndex([['12345', '54321'], ['two22'], ['three']]))

	def test_literal(self):
		# A plain literal is indexed as a single string in a single row.
		self.assertEqual(
			tgrep.get_index_from_regex('qwertyuiop'),
			tgrep.StringIndex([['qwertyuiop']]))

	def test_regex_question_mark(self):
		# A trailing '?' is expected to leave nothing indexable.
		self.assertTrue(
			tgrep.get_index_from_regex('12345?').empty())

	def test_regex_star(self):
		# A trailing '*' is expected to leave nothing indexable.
		self.assertTrue(
			tgrep.get_index_from_regex('12345*').empty())

	def test_regex_curly_braces(self):
		# A trailing '{0,9}' repetition is expected to leave nothing indexable.
		self.assertTrue(
			tgrep.get_index_from_regex('12345{0,9}').empty())
147 | 
class TestStringIndex(unittest.TestCase):
	"""Check the struct that ``StringIndex.get_index_struct`` builds."""

	def test_get_index_struct(self):
		# Expected values are five 4-bit groups: 0001 repeated for 'aaaaa',
		# 0010 repeated for 'bbbbb'.
		# NOTE(review): consistent with packing the low 4 bits of each
		# character -- confirm against the packing code.
		si = tgrep.StringIndex([['aaaaa']])
		struct = si.get_index_struct()
		self.assertEqual(struct.num_rows, 1)
		self.assertEqual(struct.rows[0].length, 1)
		self.assertEqual(struct.rows[0].data[0], 0b00010001000100010001)

		# Two single-string rows.
		si = tgrep.StringIndex([['aaaaa'], ['bbbbb']])
		struct = si.get_index_struct()
		self.assertEqual(struct.num_rows, 2)
		self.assertEqual(struct.rows[0].length, 1)
		self.assertEqual(struct.rows[0].data[0], 0b00010001000100010001)
		self.assertEqual(struct.rows[1].length, 1)
		self.assertEqual(struct.rows[1].data[0], 0b00100010001000100010)

		# First row holds two strings; only that row's contents are checked.
		si = tgrep.StringIndex([['aaaaa', 'bbbbb'], ['bbbbb']])
		struct = si.get_index_struct()
		self.assertEqual(struct.num_rows, 2)
		self.assertEqual(struct.rows[0].length, 2)
		self.assertEqual(struct.rows[0].data[0], 0b00010001000100010001)
		self.assertEqual(struct.rows[0].data[1], 0b00100010001000100010)
170 | 
class TestTgrep(unittest.TestCase):
	"""End-to-end tests that run the 4grep script through the shell."""

	def setUp(self):
		self.tempdir = tempfile.mkdtemp()

	def tearDown(self):
		shutil.rmtree(self.tempdir)

	def _write_numbered_files(self):
		"""Create 0.txt .. 9.txt; file i holds str(i * 10 ** NGRAM_CHARS)."""
		for i in range(10):
			name = os.path.join(self.tempdir, '{}.txt'.format(i))
			# The context manager closes the file even if the write fails.
			with open(name, 'w') as f:
				f.write(str(i * 10 ** tgrep.NGRAM_CHARS))

	def test_tgrep(self):
		"""A plain search prints only the matching file and line."""
		self._write_numbered_files()
		search = str(10 ** tgrep.NGRAM_CHARS)
		command = "{} {} {}/*.txt".format(
			TGREP_FILE, search, self.tempdir)
		out = subprocess.check_output(command, shell=True)
		self.assertEqual(out.strip(), self.tempdir + '/1.txt:' + search)

	def test_tgrep_or_regex(self):
		"""With -E, an 'a|b' pattern matches the files holding either string."""
		str1 = str(10 ** tgrep.NGRAM_CHARS)
		str2 = str(2 * 10 ** tgrep.NGRAM_CHARS)
		search = "{}|{}".format(str1, str2)
		self._write_numbered_files()
		command = "{} -E '{}' {}/*.txt".format(
			TGREP_FILE, search, self.tempdir)
		out = subprocess.check_output(command, shell=True)
		self.assertEqual(out.strip(),
				self.tempdir + '/1.txt:' + str1 + "\n"
				+ self.tempdir + "/2.txt:" + str2)

	def test_tgrep_unindexed(self):
		"""An empty --filter still finds the matching file."""
		self._write_numbered_files()
		search = str(10 ** tgrep.NGRAM_CHARS)
		command = "{} --filter='' {} {}/*.txt".format(
			TGREP_FILE, search, self.tempdir)
		out = subprocess.check_output(command, shell=True)
		self.assertEqual(out.strip(), self.tempdir + '/1.txt:' + search)
220 | 
221 | if __name__ == '__main__':
222 | 	unittest.main()
223 | 


--------------------------------------------------------------------------------
/tune.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import subprocess
 3 | 
 4 | def set_params(n, b):
 5 | 	f = open("./bitmap/src/util.h")
 6 | 	lines = list(f.readlines())
 7 | 	n_set = b_set = False
 8 | 	for i, line in enumerate(lines):
 9 | 		if not n_set and 'NGRAM_CHARS' in line:
10 | 			lines[i] = "#define NGRAM_CHARS {}\n".format(n)
11 | 			n_set = True
12 | 			if b_set:
13 | 				break
14 | 		elif not b_set and 'NGRAM_CHAR_BITS' in line:
15 | 			lines[i] = '#define NGRAM_CHAR_BITS {}\n'.format(b)
16 | 			b_set = True
17 | 			if n_set:
18 | 				break
19 | 	f.close()
20 | 	with open('./bitmap/src/util.h', 'w') as f:
21 | 		f.writelines(lines)
22 | 
def test_params(n, b):
	"""Rebuild with the given n-gram parameters and time a fixed search.

	Steps: write the parameters into the header, run ``make``, delete the
	cache directory, run the same search twice and print one line of its
	stderr each time, then print the size of the resulting packfile.

	Args:
		n: Value for NGRAM_CHARS.
		b: Value for NGRAM_CHAR_BITS.
	"""
	set_params(n, b)
	subprocess.check_call('make')
	subprocess.check_call('rm -rf ~/.cache/4gram', shell=True)
	search = 'May 10 12:12:12'
	print('{} {}'.format(n, b))
	# Two runs; the cache was just deleted, so presumably the first builds
	# the index and the second reuses it.
	for i in range(2):
		p = subprocess.Popen('find /home/mpfeiffer/logs/remote_logs -name "*.gz" -type f | 4grep --index="{}" "{}" > /dev/null'.format(search, search), shell=True, stderr=subprocess.PIPE, universal_newlines=True)
		# universal_newlines gives text on Python 2 and 3, so split('\n') works.
		output = p.communicate()[1]
		lines = output.split('\n')
		# The third line from the end is the one reported; fall back to the
		# whole output when stderr is shorter than that.
		lastline = lines[-3] if len(lines) >= 3 else output.strip()
		print(lastline)
	# '~' is expanded by the shell, so this command must run through one.
	print(subprocess.check_output('du -h ~/.cache/4gram/packfile', shell=True, universal_newlines=True))
36 | 
# Sweep every (n, b) pair with 2 <= n <= 10 and 1 <= b <= 8 whose product
# n * b does not exceed 31.
for n in range(2, 10 + 1):
	# Floor division keeps the bound an int on Python 3 as well; "/" would
	# produce a float there and range() would reject it.
	for b in range(1, min(31 // n + 1, 8 + 1)):
		test_params(n, b)
40 | 


--------------------------------------------------------------------------------