├── .gitignore ├── .gitmodules ├── 4grep ├── LICENSE ├── Makefile ├── README.md ├── bitmap ├── .gitignore ├── Makefile ├── exec │ └── .gitignore ├── lib │ ├── minunit.h │ └── portable_endian.h ├── main │ ├── generate_bitmap.c │ └── test.c └── src │ ├── bitmap.c │ ├── bitmap.h │ ├── filter.c │ ├── filter.h │ ├── packfile.c │ ├── packfile.h │ ├── util.c │ └── util.h ├── build_deb.sh ├── debian ├── 4grep.links ├── changelog ├── compat ├── control ├── copyright ├── postinst ├── rules └── source │ ├── format │ └── options ├── description ├── disp_bitmap.py ├── docker_build ├── Dockerfile └── docker_build_deb.sh ├── img ├── example.gif └── zgrepvs4grep.png ├── test.py └── tune.py /.gitignore: -------------------------------------------------------------------------------- 1 | .*.swp 2 | .*.swo 3 | *.orig 4 | *.DS_Store 5 | **/*~ 6 | **/*.pyc 7 | 4grepc 8 | debian/4grep 9 | debian/4grep.substvars 10 | debian/4grep.debhelper.log 11 | debian/files 12 | debian/debhelper-build-stamp 13 | compile_commands.json 14 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "zstd"] 2 | path = bitmap/lib/zstd 3 | url = https://github.com/facebook/zstd 4 | [submodule "bitmap/lib/xxHash"] 5 | path = bitmap/lib/xxhash 6 | url = https://github.com/Cyan4973/xxHash 7 | ignore = dirty 8 | [submodule "bitmap/lib/zstd"] 9 | path = bitmap/lib/zstd 10 | url = https://github.com/facebook/zstd.git 11 | -------------------------------------------------------------------------------- /4grep: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | from distutils.spawn import find_executable 4 | from multiprocessing.pool import ThreadPool 5 | from ctypes.util import find_library 6 | from contextlib import contextmanager 7 | from collections import deque 8 | from subprocess import PIPE 9 | from Queue import Empty 10 | 11 | import multiprocessing as mp 12 | import subprocess 13 | import threading 14 | import itertools 15 | import argparse 16 | import tempfile 17 | import getpass 18 | import shutil 19 | import signal 20 | import ctypes as ct 21 | import errno 22 | import math 23 | import time 24 | import sys 25 | import os 26 | import re 27 | 28 | NGRAM_CHARS = 5 29 | TGREP_DIR = os.path.dirname(os.path.realpath(__file__)) 30 | MODULE_PATHS = [os.path.join(TGREP_DIR, module_name) 31 | for module_name in ("bitmap/4grep.so", "4grep.so")] 32 | REGEX_METACHARACTERS = r".^$*+?{}[]\|()" 33 | ESCAPED_REGEX_METACHARACTERS = re.escape(REGEX_METACHARACTERS) 34 | 35 | try: 36 | module_path = next(m for m in MODULE_PATHS if os.path.isfile(m)) 37 | except StopIteration: 38 | module_path = find_library("4grep") 39 | if module_path is None: 40 | print("4grep: Error: 4grep.so not found") 41 | sys.exit(-1) 42 | 43 | class intarray(ct.Structure): 44 | _fields_ = [("length", ct.c_int), ("data", ct.POINTER(ct.c_int))] 45 | 46 | class intarrayarray(ct.Structure): 47 | _fields_ = [("num_rows", ct.c_int), ("rows", ct.POINTER(intarray))] 48 | 49 | mymod = ct.cdll.LoadLibrary(module_path) 50 | 51 | strings_to_sorted_indices = mymod.strings_to_sorted_indices 52 | strings_to_sorted_indices.argtypes = [ct.POINTER(ct.c_char_p), ct.c_int] 53 | strings_to_sorted_indices.restype = intarray 54 | 55 | start_filter = mymod.start_filter 56 | start_filter.argtypes = [intarrayarray, ct.c_char_p, ct.c_char_p] 57 | start_filter.restype = ct.c_int 58 | 59 | pack = mymod.pack_loose_files 60 | pack.argtypes = [ct.c_char_p] 61 | 62 | get_index_directory = mymod.get_index_directory 63 | get_index_directory.restype = ct.c_char_p 64 | 65 | HELP = '''\033[1m4grep\033[0m: fast grep using multiple cpus and 4gram filter 66 | 67 | \033[1mSIMPLE USAGE\033[0m 68 | 4grep 69 | find | 4grep 70 | 71 | \033[1mADVANCED USAGE\033[0m 72 | 4grep --filter 73 | 4grep --filter --filter 74 | 4grep --cores N --indexdir path/to/index 75 | 76 | \033[1mOPTIONAL ARGUMENTS\033[0m 77 | --filter specify a filter string 78 | --cores limit number of cores used 79 | --excludes exclude files and directories by regex 80 | --indexdir specify directory to store index 81 | 82 | \033[1mDESCRIPTION\033[0m 83 | For standard use, 4grep takes in two parameters: a non-regex string 84 | and the list of files. The string is first used to filter out files that 85 | have no instances of the string anywhere in the file. It will the grep using 86 | the string to find the lines where the string exists. 87 | 88 | The more advanced uses breaks down the standard case. You can specify the 89 | filter string and regex seperately. The filter string will be used to 90 | filter any files that have no instances of the string anywhere in the file. 91 | The regex is then passed onto grep for these subset of files and will give 92 | you the lines which contain the regex. 93 | 94 | Hence an advanced usage of 4grep may be that the list of files are first 95 | filtered so that only files that have a certain string remain. Then the 96 | regex will grep for lines that contain something else. 97 | 98 | [--cores] was added to limit the number of cores that 4grep uses. If not 99 | specified, or too large, the program will use the maximum number of cores -1. 100 | 101 | \033[1mEXAMPLES\033[0m 102 | $ 4grep WARNING foo/bar/log.gz 103 | This will search for WARNING in the file 'log.gz', first filtering then grep 104 | 105 | $ 4grep --filter WARNING [0-9] foo/bar/log.gz 106 | This is more advanced use. First the list of files will be filtered so that 107 | only files with the string 'WARNING' remain. Then the regex '[0-9]' 108 | will be used to grep and so any line that contains a number, in this subset 109 | of files, will be printed. 110 | 111 | $ 4grep --exclude='bar|fizz' 'STACKTRACE' foo///.log 112 | This will search all of the files matching the regex 113 | .*foo.*/.*/.*/.*.log.* and exclude all directories named 'bar' or 114 | 'fizz'. 115 | 116 | \033[1mNOTES\033[0m 117 | - Filter strings are auto-detected from the regex in simple cases. 118 | - Filter strings must be at least 5 characters 119 | - Filter strings do not support regex yet and so are parsed as a 120 | literal string 121 | ''' 122 | 123 | def run_pack_process(*args): 124 | ignore_sigint() 125 | pack(*args) 126 | 127 | 128 | # adapted from answers at https://stackoverflow.com/questions/5081657/ 129 | @contextmanager 130 | def redirect(from_file, to_file): 131 | """ 132 | Redirects output to from_file's file descriptor into to_file's 133 | descriptor. 134 | 135 | For example, the following redirects stdout to stderr: 136 | 137 | import sys, os 138 | 139 | with redirect(sys.stdout, sys.stderr): 140 | print('Hello, world!') # outputs to stderr 141 | os.system("echo hello world") # also outputs to stderr 142 | 143 | from_file is flushed when entering and to_file is flushed when exiting 144 | the context to make sure userspace buffers don't write to the wrong 145 | place. 146 | 147 | You probably don't want to write to both from_file and to_file under 148 | this context. 149 | """ 150 | # backup from_file fd by dup'ing it 151 | from_fd = from_file.fileno() 152 | dup_from_fd = os.dup(from_fd) 153 | 154 | from_file.flush() 155 | 156 | # replace from_file fd with to_file dup fd 157 | os.dup2(to_file.fileno(), from_fd) 158 | try: 159 | yield 160 | finally: 161 | # restore original from_file fd 162 | os.dup2(dup_from_fd, from_fd) 163 | # get rid of backup from_file fd 164 | os.close(dup_from_fd) 165 | 166 | to_file.flush() 167 | 168 | def filter_and_grep_worker_func(in_queue, out_queue, options, regex, index, 169 | index_dir, quit_flag): 170 | ignore_sigint() 171 | tp = ThreadPool(1) 172 | while not quit_flag.value: 173 | try: 174 | item = in_queue.get(timeout=1) 175 | if item is None: 176 | return 177 | (i, f) = item 178 | result = tp.apply_async( 179 | do_filter_and_grep, (i, options, regex, 180 | f, index, index_dir)) 181 | while not result.ready(): 182 | result.wait(1.0) 183 | if quit_flag.value: 184 | tp.terminate() 185 | return 186 | out_queue.put(result.get()) 187 | except Empty: 188 | pass 189 | 190 | def do_filter_and_grep(i, options, regex, f, index=None, index_dir=None): 191 | BTMP_MTCH = 1 192 | BTMP_NOMTCH = 2 193 | NOBTMP_MTCH = 3 #never gets used since default 194 | NOBTMP_NOMTCH = 4 195 | bitmapped = filtered = False 196 | err = output = "" 197 | 198 | if index and not index.empty(): 199 | assert index_dir is not None 200 | index_dir_char_p = ct.c_char_p(index_dir) 201 | c_filename = ct.c_char_p(f) 202 | filter_struct = index.get_index_struct() 203 | with tempfile.TemporaryFile() as temp: 204 | with redirect(sys.stderr, temp): 205 | ret = start_filter( 206 | filter_struct, c_filename, 207 | index_dir_char_p) 208 | temp.seek(0) 209 | err = temp.read() 210 | 211 | bitmapped = ret == BTMP_MTCH or ret == BTMP_NOMTCH 212 | filtered = ret == NOBTMP_NOMTCH or ret == BTMP_NOMTCH 213 | 214 | if not filtered: 215 | output, grep_err = do_grep(options, regex, f) 216 | err += grep_err 217 | return (i, output, err, (bitmapped, filtered)) 218 | 219 | def do_grep(options, regex, f): 220 | grep = ["zgrep"] + options + ["--"] + [regex, f] 221 | # see https://blog.nelhage.com/2010/02/a-very-subtle-bug/ 222 | # or http://bugs.python.org/issue1652 for why we need to handle SIGPIPE 223 | default_sigpipe = lambda: signal.signal(signal.SIGPIPE, signal.SIG_DFL) 224 | p = subprocess.Popen(grep, stdout=subprocess.PIPE, stderr=subprocess.PIPE, 225 | preexec_fn=default_sigpipe) 226 | output, err = p.communicate() 227 | return (output, err) 228 | 229 | def print_progress_bar(progress, done, tracelog): 230 | total_files = progress.total_files 231 | count = progress.count 232 | elapsed = time.time() - progress.init_time 233 | mins, secs = divmod(elapsed, 60) 234 | if total_files == 0: 235 | return 236 | 237 | if count == 0: 238 | print('>{bold}Done:{end}{:5.1f}% of {}{}{end} ' 239 | '{bold}Elapsed:{end}{:g}m{:>04.1f}s{}'.format( 240 | 0, progress.color, total_files, 241 | mins, secs, Color.CLEAR_END+Color.UP, bold=Color.BOLD, 242 | end=Color.END), file=sys.stderr) 243 | return 244 | 245 | remain = (total_files-count)*elapsed/count 246 | mins2, secs2 = divmod(remain, 60) 247 | bitmapped_p = progress.bitmapped*100.0/count 248 | filtered_p = progress.filtered*100.0/count 249 | 250 | if not done: 251 | done_p = math.floor(count*1000.0/total_files)/10 252 | print('>{bold}Done:{end}{:5.1f}% of {}{}{end} ' 253 | '{bold}Elapsed:{end}{:g}m{:>04.1f}s ' 254 | '{bold}Bitmapped:{end}{:5.1f}% ' 255 | '{bold}Filtered:{end}{:5.1f}% ' 256 | '{bold}ETA:{end}{:g}m{:>04.1f}s{}'.format( 257 | done_p, progress.color, total_files, mins, secs, 258 | bitmapped_p, filtered_p, mins2, secs2, Color.CLEAR_END+Color.UP, 259 | bold=Color.BOLD, end=Color.END), file=sys.stderr) 260 | 261 | if done: 262 | print('>{bold}{}Finished:{end}{} files ' 263 | '{bold}Elapsed:{end}{:g}m{:>04.1f}s ' 264 | '{bold}Bitmapped:{end}{:5.1f}% ' 265 | '{bold}Filtered:{end}{:5.1f}%{}'.format( 266 | Color.GREEN, total_files, mins, secs, bitmapped_p, 267 | filtered_p, Color.CLEAR_END, bold=Color.BOLD, end=Color.END), file=sys.stderr) 268 | tracelog.bitmapped = bitmapped_p 269 | tracelog.filtered = filtered_p 270 | tracelog.total_files = total_files 271 | tracelog.elapsed = elapsed 272 | print_to_log(tracelog) 273 | 274 | 275 | def print_accumulated(progress): 276 | if progress.printed in progress.gout: 277 | print(Color.CLEAR_LINE, end='', file=sys.stderr) 278 | while progress.printed in progress.gout: 279 | output = progress.gout[progress.printed] 280 | print(output, end='') 281 | progress.printed += 1 282 | 283 | def start_pack_process(progress, bitmap_store_dir_char_p): 284 | if progress.pack_process: 285 | progress.pack_process.join() 286 | progress.pack_process = mp.Process( 287 | target=run_pack_process, 288 | args=(bitmap_store_dir_char_p,)) 289 | progress.pack_process.start() 290 | 291 | def handle_results(result_queue, progress, index_dir): 292 | while not result_queue.empty(): 293 | update_progress(result_queue.get(timeout=1), progress, index_dir) 294 | output_progress(progress) 295 | 296 | def update_progress(result, progress, bitmap_store_dir_char_p): 297 | i, output, err, b = result 298 | if err: 299 | progress.error_queue.append(err) 300 | progress.bitmapped += b[0] 301 | progress.filtered += b[1] 302 | progress.gout[i] = output 303 | progress.count += 1 304 | if ((progress.count % 1000 == 0) or (progress.count == \ 305 | progress.total_files)) and not (progress.pack_process.is_alive()): 306 | start_pack_process(progress, bitmap_store_dir_char_p) 307 | 308 | 309 | def output_progress(progress): 310 | while len(progress.error_queue) != 0: 311 | err = progress.error_queue.popleft() 312 | print(Color.CLEAR_LINE, end='', file=sys.stderr) 313 | print(err, end='', file=sys.stderr) 314 | print_accumulated(progress) 315 | print_progress_bar(progress, False, None) 316 | 317 | def queue_generator(queue, generator): 318 | for item in generator: 319 | queue.append(item) 320 | queue.append(None) 321 | 322 | class Color: 323 | GREEN = '\033[92m' 324 | RED = '\033[91m' 325 | BOLD = '\033[1m' 326 | END = '\033[0m' 327 | UP = '\033[F' 328 | CLEAR_END = '\033[K' 329 | CLEAR_LINE = '\x1b[2K' 330 | 331 | class SearchProgress(object): 332 | def __init__(self): 333 | self.init_time = 0 334 | self.count = 0 335 | self.bitmapped = 0 336 | self.filtered = 0 337 | self.printed = 0 338 | self.total_files = 0 339 | self.gout = {} 340 | self.color = Color.RED + Color.BOLD 341 | self.pack_process = None 342 | self.error_queue = deque() 343 | 344 | 345 | def ignore_sigint(): 346 | signal.signal(signal.SIGINT, signal.SIG_IGN) 347 | 348 | def smp_loop(options, files, index, tracelog): 349 | index_dir = tracelog.indexdir_abs 350 | progress = SearchProgress() 351 | progress.init_time = tracelog.init_time 352 | file_queue = deque() 353 | file_queueing_thread = threading.Thread(target=queue_generator, 354 | args=(file_queue, files)) 355 | file_queueing_thread.daemon = True 356 | file_queueing_thread.start() 357 | cores = min(mp.cpu_count()-1, tracelog.cores) if tracelog.cores \ 358 | else mp.cpu_count() - 1 359 | print('{bold}using {} cores{end}\n'.format(cores, bold=Color.BOLD, 360 | end=Color.END), file=sys.stderr) 361 | filter_and_grep_work_input_queue = mp.Queue() 362 | output_queue = mp.Queue() 363 | quit_flag = mp.Value("i", 0) 364 | processes = [mp.Process( 365 | target=filter_and_grep_worker_func, 366 | args=(filter_and_grep_work_input_queue, output_queue, options, 367 | tracelog.regex, index, index_dir, quit_flag)) 368 | for i in range(cores)] 369 | for p in processes: 370 | p.daemon = True 371 | p.start() 372 | try: 373 | start_pack_process(progress, index_dir) 374 | work_queued = 0 375 | while True: 376 | while len(file_queue) == 0: 377 | handle_results(output_queue, progress, index_dir) 378 | time.sleep(0.1) 379 | f = file_queue.popleft() 380 | if f == None: 381 | break 382 | if not os.path.exists(f): 383 | print(Color.CLEAR_LINE + '4grep: {}: No such file or directory'.format(f), 384 | file=sys.stderr) 385 | continue 386 | if os.path.isdir(f): 387 | print(Color.CLEAR_LINE + '4grep: {}: Is a directory'.format(f), 388 | file=sys.stderr) 389 | continue 390 | progress.total_files = len(file_queue) + work_queued 391 | filter_and_grep_work_input_queue.put((work_queued, f)) 392 | handle_results(output_queue, progress, index_dir) 393 | work_queued += 1 394 | for _ in range(cores): 395 | filter_and_grep_work_input_queue.put(None) 396 | progress.total_files = work_queued 397 | progress.color = Color.GREEN + Color.BOLD 398 | for p in processes: 399 | while p.is_alive(): 400 | handle_results(output_queue, progress, 401 | index_dir) 402 | time.sleep(0.1) 403 | handle_results(output_queue, progress, index_dir) 404 | if progress.count != 0: 405 | print_progress_bar(progress, True, tracelog) 406 | else: 407 | print(Color.CLEAR_LINE + '4grep: no files found', file=sys.stderr) 408 | except KeyboardInterrupt: 409 | ignore_sigint() # prevent interruption of interrupt handling 410 | print(file=sys.stderr) 411 | print(Color.END + "Aborting 4grep...", file=sys.stderr) 412 | quit_flag.value = 1 413 | empty_queue(filter_and_grep_work_input_queue) 414 | for p in processes: 415 | p.join() 416 | sys.exit(1) 417 | 418 | def empty_queue(queue): 419 | while not queue.empty(): 420 | try: 421 | queue.get_nowait() 422 | except Empty: 423 | pass 424 | 425 | class stdin_iter: 426 | def __init__(self): 427 | pass 428 | 429 | def __iter__(self): 430 | return self 431 | 432 | def next(self): 433 | ret = sys.stdin.readline().strip() 434 | if not ret: 435 | raise StopIteration 436 | return ret 437 | 438 | class regex_iter: 439 | def __init__(self, regex, excludes): 440 | self.regex = [".*"+r+".*" for r in regex.split("/")] 441 | self.level = 0 442 | self.height = len(self.regex) - 1 443 | self.ls = ['' for r in self.regex] 444 | self.ls[0] = sorted([f for f in os.listdir('.') 445 | if re.match(self.regex[0], f)]) 446 | self.excludes = excludes 447 | 448 | def __iter__(self): 449 | return self 450 | 451 | def walkup(self): 452 | while self.level and not self.ls[self.level]: 453 | self.level -= 1 454 | if self.level == 0 and not self.ls[0]: 455 | raise StopIteration 456 | 457 | def next(self): 458 | def dirlist(l): 459 | return [f for f in l if os.path.isdir(f)] 460 | 461 | def filelist(l): 462 | return [f for f in l if os.path.isfile(f)] 463 | 464 | self.walkup() 465 | while self.level < self.height: 466 | dir = self.ls[self.level].pop(0) 467 | self.level += 1 468 | self.ls[self.level] = sorted([dir+'/'+ f for f in os.listdir(dir) 469 | if re.match(self.regex[self.level], 470 | f)]) 471 | if self.excludes: 472 | self.ls[self.level] = [f for f in self.ls[self.level] 473 | if not re.match(self.excludes, f)] 474 | 475 | if self.level < self.height: 476 | self.ls[self.level] = dirlist(self.ls[self.level]) 477 | else: 478 | self.ls[self.level] = filelist(self.ls[self.level]) 479 | self.walkup() 480 | return self.ls[self.level].pop(0) 481 | 482 | def intersect(a, b): 483 | return list(set(a) & set(b)) 484 | 485 | def get_index_from_regex(regex): 486 | """ 487 | Parsing a regex is hard. 488 | But there are some low-hanging fruits that can satisfy most usecases. 489 | """ 490 | non_regex_metachar = "[^{}]".format(ESCAPED_REGEX_METACHARACTERS) 491 | parsable_metachar = r"(\.\*?|\+)" 492 | # 'parsable': things we can parse an ANDed index from 493 | parsable = "({}|{})".format(non_regex_metachar, parsable_metachar) 494 | # first low-hanging fruit: does grabbing all non-metachars work? 495 | if re.match("^{}+$".format(parsable), regex): 496 | literals = re.split("{}+".format(parsable_metachar), regex) 497 | long_enough = (l for l in literals if len(l) >= NGRAM_CHARS) 498 | return StringIndex([long_enough]) 499 | # second: is it a series of the above |'d together? 500 | elif re.match(r"^{0}+(\|{0}+)+$".format(parsable), regex): 501 | subexprs = regex.split("|") 502 | sub_indices = tuple(get_index_from_regex(r) for r in subexprs) 503 | if any(s.empty() for s in sub_indices): 504 | return empty_index() 505 | string_sets = tuple(ind.strings[0] for ind in sub_indices) 506 | return StringIndex(string_sets) 507 | # third: is it safe to just grab string literals from the start/end? 508 | if any(x in regex for x in ('|', '?', '*', '{,', '{0')): 509 | return empty_index() 510 | start = re.match("^{}+".format(non_regex_metachar), regex) 511 | end = re.search("{}+$".format(non_regex_metachar), regex) 512 | if start and end: 513 | start = start.group() 514 | end = end.group() 515 | return StringIndex([[start]] if start == end else [[start, end]]) 516 | elif start: 517 | return StringIndex([[start.group()]]) 518 | elif end: 519 | return StringIndex([[end.group()]]) 520 | return empty_index() 521 | 522 | class StringIndex(object): 523 | """ Represents a search index. 524 | 525 | strings: a list of lists of strings. 526 | The inner lists of strings will be ANDed together, and the outer lists 527 | will all be ORed together. 528 | 529 | For example, 530 | StringIndex([["one", "two"], "three"]) 531 | represents the index matching ("one" AND "two") OR "three" 532 | """ 533 | def __init__(self, strings): 534 | strings = tuple(strings) 535 | self.strings = tuple(tuple( 536 | s for s in sublist if len(s) >= NGRAM_CHARS) 537 | for sublist in strings) 538 | if any(len(s) == 0 for s in self.strings): 539 | self.strings = () 540 | 541 | def empty(self): 542 | return len(self.strings) == 0 543 | 544 | def get_index_struct(self): 545 | """ Returns a struct suitable for passing to our C code as an 546 | index. 547 | """ 548 | intarrays = [] 549 | assert not self.empty() 550 | for ss in self.strings: 551 | assert len(ss) != 0 552 | assert not any(len(s) < NGRAM_CHARS for s in ss) 553 | char_p_p = (ct.c_char_p * len(ss)) (*ss) 554 | intarrays.append(strings_to_sorted_indices(char_p_p, len(ss))) 555 | iaa = intarrayarray() 556 | iaa.num_rows = len(intarrays) 557 | iaa.rows = (intarray * len(intarrays)) (*intarrays) 558 | return iaa 559 | 560 | def __str__(self): 561 | return ' OR '.join( 562 | ' AND '.join(s for s in ss) 563 | for ss in self.strings) 564 | 565 | def __eq__(self, other): 566 | return type(self) is type(other) \ 567 | and self.strings == other.strings 568 | 569 | def __ne__(self, other): 570 | return not self.__eq__(other) 571 | 572 | def __repr__(self): 573 | return "StringIndex({})".format(str(self)) 574 | 575 | def empty_index(): 576 | return StringIndex([]) 577 | 578 | def get_index(args): 579 | """ Returns a StringIndex parsed from the args. 580 | 581 | If --filter was specified, it uses args.filter, else it uses 582 | args.regex. 583 | """ 584 | if args.filter is not None: 585 | indices = [s for s in args.filter if len(s) >= NGRAM_CHARS] 586 | if len(indices) == 0: 587 | print("{bold}4grep: cannot filter on {} (too short) {end} " 588 | .format(args.filter, bold=Color.BOLD, end=Color.END), file=sys.stderr, end='') 589 | return empty_index() 590 | else: 591 | return StringIndex([indices]) 592 | else: 593 | index = get_index_from_regex(args.regex) 594 | if index.empty(): 595 | print("{bold}4grep: cannot detect filter for '{}' {end} " 596 | .format(args.regex, bold=Color.BOLD, end=Color.END), file=sys.stderr, end='') 597 | 598 | if not index.empty(): 599 | print('{bold}4grep filtering on {} {end}'.format(index, bold=Color.BOLD, 600 | end=Color.END), file=sys.stderr, end='') 601 | return index 602 | 603 | class TraceLog(object): 604 | def __init__(self): 605 | self.user = getpass.getuser() 606 | self.init_time = time.time() 607 | self.end_time = 0 608 | self.filtered = 0 609 | self.bitmapped = 0 610 | self.total_files = 0 611 | self.elapsed = 0 612 | self.regex = None 613 | self.exclude = None 614 | self.cores = None 615 | self.filter = None 616 | self.indexdir = None 617 | self.indexdir_abs = None 618 | 619 | def print_to_log(tracelog): 620 | # Keep .4grep.log hidden or will be packed 621 | file_name = tracelog.indexdir_abs + "/.4grep.log" 622 | exists = os.path.exists(file_name) 623 | datetime = time.gmtime(tracelog.init_time) 624 | elapsed = tracelog.elapsed 625 | with open(file_name, 'a') as f: 626 | if not exists: 627 | f.write("t_end\tuser\tt_elapsed\ttotal_files\t%_filtered\t%_bitmapped" 628 | "\tregex\t--exclude\t--cores\t--filter\t--indexdir\n") 629 | elapsed = tracelog.elapsed 630 | f.write("{}/{}/{} {}:{}:{}\t{}\t{:.2f}\t{:.0f}\t{:.2f}\t{:.2f}\t{}\t{}\t{}\t{}\t{}\n".format( 631 | datetime[1], datetime[2], datetime[0], datetime[3], datetime[4], datetime[5], 632 | tracelog.user, elapsed, tracelog.total_files, tracelog.filtered, tracelog.bitmapped, 633 | tracelog.regex, tracelog.exclude, tracelog.cores, tracelog.filter, tracelog.indexdir)) 634 | if not exists: 635 | os.chmod(file_name, 0o666) 636 | 637 | 638 | def main(): 639 | tracelog = TraceLog() 640 | 641 | parser = argparse.ArgumentParser("4grep", usage=HELP, add_help=False) 642 | parser.add_argument('regex', metavar='REGEX', type=str) 643 | parser.add_argument('files', metavar='FILE', type=str, nargs='*') 644 | parser.add_argument('--exclude', type=str) 645 | parser.add_argument('--cores', type=int) 646 | parser.add_argument('--filter', action='append', type=str) 647 | parser.add_argument('--indexdir', type=str) 648 | parser.add_argument('--help', action="help") 649 | args, options = parser.parse_known_args() 650 | 651 | tracelog.regex = args.regex 652 | tracelog.exclude = args.exclude 653 | tracelog.cores = args.cores 654 | tracelog.filter = args.filter 655 | tracelog.indexdir = args.indexdir 656 | 657 | filelist = args.files 658 | # hack to handle mixed flags and filenames, because argparse doesn't 659 | filelist.extend(opt for opt in options if opt[0] != '-') 660 | options = [opt for opt in options if opt[0] == '-'] 661 | index = get_index(args) 662 | tracelog.indexdir_abs = os.path.abspath(os.path.expanduser(os.path.expandvars( 663 | args.indexdir if args.indexdir is not None 664 | else get_index_directory()))) 665 | 666 | # smart default for -h vs. -H 667 | if intersect(("-h", "-H", "--with-filename", "--no-filename"), options): 668 | # explicitly set by caller 669 | pass 670 | elif len(filelist) == 1 and os.path.isfile(filelist[0]): 671 | options.append("-h") 672 | else: 673 | options.append("-H") 674 | 675 | if not filelist: 676 | # read filelist from stdin instead 677 | filelist = stdin_iter() 678 | elif (len(filelist) == 1) and not os.path.isfile(filelist[0]): 679 | filelist = regex_iter(filelist[0], args.exclude) 680 | 681 | smp_loop(options, filelist, index, tracelog) 682 | 683 | if __name__ == "__main__": 684 | try: 685 | main() 686 | except IOError as e: 687 | if e.errno == errno.EPIPE: 688 | pass 689 | except KeyboardInterrupt: 690 | pass 691 | 692 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | Copyright 2017 Pure Storage 179 | 180 | Licensed under the Apache License, Version 2.0 (the "License"); 181 | you may not use this file except in compliance with the License. 182 | You may obtain a copy of the License at 183 | 184 | http://www.apache.org/licenses/LICENSE-2.0 185 | 186 | Unless required by applicable law or agreed to in writing, software 187 | distributed under the License is distributed on an "AS IS" BASIS, 188 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 189 | See the License for the specific language governing permissions and 190 | limitations under the License. 191 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | @$(MAKE) -C bitmap 3 | 4 | install: all 5 | install -D -m 0755 4grep -t $(DESTDIR)/usr/bin 6 | install -D -m 0644 bitmap/4grep.so -t $(DESTDIR)/usr/lib 7 | 8 | test: 9 | @$(MAKE) -C bitmap 10 | @./bitmap/exec/test 11 | @python ./test.py 12 | 13 | clean: 14 | @$(MAKE) -C bitmap clean 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 4grep 2 | 3 | ![alt tag](https://github.com/purestorage/4grep/blob/master/img/example.gif) 4 | 5 | 4grep is a tool developed by interns [Viveque Ramji](https://github.com/vivequeramji) and [Matthew Pfeiffer](https://github.com/Spferical) during Summer 2017 at Pure Storage to extend the functionality of zgrep. It makes searching log files faster by having a stored 5gram index file and conducting a 'pre-search'. 4grep works by looking at this stored index file (creates one if it doesn’t exist) and skips the grep process altogether if the search string definitely doesn’t exist. It uses multiple processors to run zgrep concurrently. 6 | 7 | It also has a *really* fancy progress bar. 8 | 9 | 4grep also allows you to use the same flags that grep does. The 4grep program passes these on to grep internally if/when a search occurs. 10 | 11 | ## Contents 12 | * [How the Indexing Works](#how-the-indexing-works) 13 | * [More Nuance](more-nuance) 14 | * [How to Get it](#how-to-get-it) 15 | * [Usage](#usage) 16 | * [Simple](#simple) 17 | * [Advanced Options](#advanced-options) 18 | * [Progress Bar](#progress-bar) 19 | * [Limitations](#limitations) 20 | * [Where is the Index Saved?](#where-is-the-index-saved) 21 | * [4grep Log](#4grep-log) 22 | * [Other Tools Used by 4grep](#other-tools-used-by-4grep) 23 | * [Zstandard](#zstandard) 24 | * [xxHash](#xxhash) 25 | * [License](#license) 26 | 27 | ![alt tag](https://github.com/purestorage/4grep/blob/master/img/zgrepvs4grep.png) 28 | 29 | ## How the Indexing Works 30 | 31 | A file is indexed whenever it is first encountered. The index is stored based on its full, expanded, de-symlinked path, and once generated, it will never again be re-indexed. The index stores the existence of all 5-grams in a file (sequences of 5 characters). 32 | 33 | When searching, 4grep will first parse 5-grams from the regex parameter. If filter strings are given via `--filter`, 5-grams will be generated from them instead. Then, 4grep filters out files that, based on the index, do not contain all of the 5-grams from the parameters. A "normal" search is performed on the files that pass this 5-gram filtering step. 34 | 35 | ### More Nuance 36 | 37 | For every character in a 5-gram, 4grep will apply a 4-bit mask. This drastically reduces the number of possible 5-grams from 2^40 to 2^20, making the index much smaller. It also means that there are collisions. For example, the 5-grams "AAAAA" and "aaaaa" are considered the same. There is a balance between filtering files out more effectively and filtering files out faster, and 5-grams with 4 bits-per-gram happens to be very effective on our log files. 38 | 39 | 40 | ## How to Get It 41 | 42 | *Insert instructions here* 43 | 44 | 45 | ## Usage 46 | 47 | ### Simple 48 | ```bash 49 | $ 4grep 50 | $ find | 4grep 51 | ``` 52 | 53 | #### Example 54 | 55 | ```bash 56 | # searches for files that contain 'STACK' 57 | $ find ~/Desktop/logs/* | ./4grep STACK 58 | 4grep filtering on 'STACK' 59 | ``` 60 | 61 | ### Advanced Options 62 | **--cores** 63 | ```bash 64 | $ 4grep --cores N 65 | ``` 66 | --cores was added to limit the number of cores that 4grep uses. If not specified, or too large, the program will use the maximum number of cores - 1. 67 | 68 | **--indexdir** 69 | ```bash 70 | $ 4grep --indexdir= 71 | ``` 72 | This option specifies where 4grep stores its index. See [Where is the Index Saved?](#where-is-the-index-saved) for the default index locations. 73 | 74 | **--filter** 75 | 76 | 4grep tries to parse string literals from the provided regex. In the pre-filtering step, it uses its index files to filter out files that don't contain all of these string literals. For example, the regex "Overslept by [0-9]{3}" can only match in files that contain the string literal "Overslept by ". So, 4grep will detect "Overslept by" as a filter string and filter out files that don't contain it in the pre-filtering step. 77 | ```bash 78 | $ 4grep STACKTRACE logs/core.log 79 | 4grep filtering on 'STACKTRACE' 80 | ``` 81 | ```bash 82 | $ 4grep "STACK.*TRACE" logs/core.log 83 | 4grep filtering on ['STACK', 'TRACE'] 84 | ``` 85 | ```bash 86 | $ 4grep "(STACK|TRACE)" logs/core.log 87 | 4grep: cannot detect filter for '(STACK|TRACE)' 88 | ``` 89 | This auto-detection works for regexes that have their literals at their start and/or end. However, 4grep's only does really basic regex parsing, and in some cases it may help to manually specify string literals to index with. We will call these string literals "filter strings." This can be done with the `--filter` option: 90 | ```bash 91 | $ 4grep --filter 92 | ``` 93 | You can specify the filter string and regex separately. The filter string will be used to filter any files that have no instances of the string anywhere in the file. The regex is then passed onto grep for these subset of files and will give you the lines which contain the regex. 94 | ```bash 95 | $ 4grep --filter --filter 96 | ``` 97 | 98 | * **You can also specify multiple filter strings that can be used to create a smaller subset of filtered files 99 | Filter strings must be at least 5 characters** 100 | 101 | 102 | * **The filter string must be a literal string, not a regex** 103 | 104 | 105 | #### Advanced Examples 106 | ```bash 107 | # Filters files that contain 'WARNING' anywhere then prints out lines that contain a number 108 | $ find ~/Desktop/logs/* | ./4grep --filter WARNING [0-9] 109 | 4grep filtering on 'WARNING' 110 | ``` 111 | ```bash 112 | # Filters files that contain 'WARNING' and 'STACK' anywhere then prints out lines that contain a number 113 | $ find ~/Desktop/logs/* | ./4grep --filter WARNING --filter STACK [0-9] 114 | 4grep filtering on ['WARNING','STACK'] 115 | ``` 116 | ```bash 117 | # searches for files that contain 'STACK' with at most 10 cores 118 | $ find ~/Desktop/logs/* | ./4grep STACK --cores 10 119 | 4grep filtering on 'STACK' 120 | ``` 121 | ```bash 122 | # searches for files that contain 'STACK' whilst storing index files in ~/.4grep 123 | $ find ~/Desktop/logs/* | ./4grep STACK --indexdir=~/.4grep 124 | 4grep filtering on 'STACK' 125 | 126 | ``` 127 | 128 | 129 | 130 | ## Progress Bar 131 | 132 | Secondly, 4grep includes a progress bar. This is a feature that has become super useful. Here is an example of the progress that is printed to stderr as you run 4grep: 133 | ```bash 134 | >Done: 15.8% of 54752 Elapsed:0m02.4s Bitmapped:100.0% Filtered: 99.9% ETA:0m12.5s 135 | ``` 136 | | Output | Meaning | 137 | | ------------- | ------------- | 138 | | Done | Indicates the percentage of files that have been searched from the number of files found.| 139 | | File Count | The number of files found has two colours. Green if all of the files are found, and red if files are still being piped into stdin.| 140 | | Elapsed | The time since the program began.| 141 | | Bitmapped | Indicates the proportion of files that had already been indexed. If this is the first time 4grep has seen any of the files (and so no index files exist) this will be 0%. The higher this is, the fewer files 4grep will have to index, and the faster 4grep will be.| 142 | | Filtered | Indicates the percentage of files that have skipped the grepping process because of the filter. The higher this number, the faster 4grep should be than tgrep since less files will be searched.| 143 | | ETA | Gives an estimate on how long the program will take to finish. This is calculated from the files already searched and so is only an estimate.| 144 | 145 | 146 | ## Limitations 147 | 4grep does not handle file modification. When it filters files out of the search with its filter string, 4grep will consider the state of the file as it was when it was first indexed. If a file is modified to contain a string that is then used as a search index, 4grep may wrongly filter the file out of the search and not report matches within the file. There is not currently any way to re-index a file or directory. 148 | 149 | 4grep can be bottlenecked by the speed of the filesystem that the index file is stored on, say, NFS. 150 | The smaller the files being searched over, the more significant 4grep's overhead becomes, and the less of a performance improvement it will give. 151 | 152 | As described above, 4grep does not parse regular expressions. It only autodetects a filter string for the easy case where string literals are on the left and/or right of the regex. 153 | 154 | Strings less than 5 characters cannot be indexed on. Longer strings are best for filtering. The longer the filter string(s), the higher percentage of files should be filtered, and the faster 4grep will go. 155 | 4grep wants to go as fast as possible. One process per core going through files as fast as it can may bring some machines to their knees. We've had one report of 4grep freezing up a machine searching through a checkout of purity with 40 cores. 156 | 157 | 158 | ## Where is the Index Saved? 159 | For minimal overhead, we want a global index file shared by everyone. Ideally, log files can be automatically indexed as they are added to fuse, but we do not do this (yet). For now, 4grep has a ranking of directories it would like to store the index in. This ranking goes: 160 | 1. `/4gram` (to be used when we get a proper distribution method) 161 | 1. `~/.cache/4gram` (should fall back to when running most other places) 162 | 163 | The index is designed to be persistent, multi-process, multi-user, and multi-machine-on-NFS safe. Though you should just be able to `rm -r` it if the index is just stored in your home directory and 4grep is not running. 164 | The index location may be overridden with the `--indexdir` option. 165 | 166 | 167 | ## 4grep Log 168 | When 4grep finishes its search, certain statistics will be recorded in 4grep's own log file. This will help to spot patterns between searches and hopefully optimize for the 90% case in the future. This logfile is stored in /path/to/index/.4grep.log. 169 | 170 | 171 | ## Other Tools Used by 4grep 172 | 173 | ### Zstandard 174 | When storing the index files, Zstandard was chosen as the compression algorithm. Zstandard outperformed gzip significantly for compression ratios and decompression speeds on our index files. We also kept the compression level down at 8 (current max = 22) since we found that for our data, which is small data with mostly 0's, this performed best. More info at: [https://github.com/facebook/zstd](https://github.com/facebook/zstd). 175 | 176 | ### xxHash 177 | To store the index file, we decided to hash its original name into something more uniform. xxHash, developed by the same author of Zstd (Yann Collet), seemed to be the fastest and easiest to use for our program. More info at: [https://github.com/Cyan4973/xxHash](https://github.com/Cyan4973/xxHash) 178 | 179 | 180 | ## License 181 | 182 | The full license for the project can be found [here](https://github.com/purestorage/4grep/blob/master/LICENSE). 183 | 184 | This project is licensed under the terms of the Apache-2.0 license. 185 | -------------------------------------------------------------------------------- /bitmap/.gitignore: -------------------------------------------------------------------------------- 1 | packfile 2 | test 3 | *.o 4 | *.so 5 | -------------------------------------------------------------------------------- /bitmap/Makefile: -------------------------------------------------------------------------------- 1 | CC=gcc 2 | CFLAGS=-Wall -std=gnu11 -O3 -fPIC 3 | LIBS=-lz ./lib/zstd/lib/libzstd.a -llockfile -lpthread 4 | INCLUDES = -I./src -I./lib -I./lib/xxhash -I./lib/zstd/lib 5 | HEADERS := $(shell find ./src -name "*.h") 6 | 7 | SRCS_FILES := $(shell find ./src -name "*.c") 8 | SRCS_FILES += ./lib/xxhash/xxhash.c 9 | EXEDIR=exec 10 | MAINDIR=main 11 | UNAME_S := $(shell uname -s) 12 | LIBFLAGS= 13 | ifeq ($(UNAME_S),Linux) 14 | LIBFLAGS += -Wl,-soname,4grep.so 15 | endif 16 | 17 | ZSTD_STATIC=./lib/zstd/lib/libzstd.a 18 | 19 | all: $(EXEDIR)/test $(EXEDIR)/generate_bitmap 4grep.so 20 | 21 | SRCS_OBJECTS := $(patsubst %.c, %.o, $(SRCS_FILES)) 22 | 23 | %.o: %.c $(HEADERS) 24 | @$(CC) $(CFLAGS) $(INCLUDES) -c $< -o $@ 25 | 26 | $(ZSTD_STATIC): 27 | @$(MAKE) -s -C ./lib/zstd/lib CFLAGS="-fPIC -O3" libzstd.a 28 | 29 | 4grep.so: $(SRCS_OBJECTS) $(ZSTD_STATIC) 30 | @$(CC) $(CFLAGS) $(LIBFLAGS) -o 4grep.so $(SRCS_OBJECTS) -shared $(LIBS) 31 | 32 | $(EXEDIR)/generate_bitmap: $(MAINDIR)/generate_bitmap.o $(SRCS_OBJECTS) $(ZSTD_STATIC) 33 | @$(CC) $(CFLAGS) $(SRCS_OBJECTS) $(MAINDIR)/generate_bitmap.o -o \ 34 | $(EXEDIR)/generate_bitmap $(LIBS) 35 | 36 | $(EXEDIR)/test: $(MAINDIR)/test.o $(SRCS_OBJECTS) $(ZSTD_STATIC) 37 | @$(CC) $(CFLAGS) $(SRCS_OBJECTS) $(MAINDIR)/test.o -o $(EXEDIR)/test $(LIBS) 38 | 39 | clean: 40 | @$(RM) $(EXEDIR)/generate_bitmap $(EXEDIR)/4gram_filter $(EXEDIR)/test */*.o 4grep.so $(ZSTD_STATIC) ./lib/xxhash/*.o 41 | @$(MAKE) -C ./lib/zstd clean 42 | 43 | .PHONY: all clean 44 | -------------------------------------------------------------------------------- /bitmap/exec/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /bitmap/lib/minunit.h: -------------------------------------------------------------------------------- 1 | /** MinUnit 2 | * As taken from http://www.jera.com/techinfo/jtns/jtn002.html 3 | * License: You may use the code in this tech note for any purpose, with 4 | * the understanding that it comes with NO WARRANTY. */ 5 | #define mu_assert(message, test) do { if (!(test)) return message; } while (0) 6 | #define mu_run_test(test) do { char *message = test(); tests_run++; \ 7 | if (message) return message; } while (0) 8 | extern int tests_run; 9 | -------------------------------------------------------------------------------- /bitmap/lib/portable_endian.h: -------------------------------------------------------------------------------- 1 | // From https://gist.github.com/panzi/6856583 2 | // "License": Public Domain 3 | // I, Mathias Panzenböck, place this file hereby into the public domain. Use it at your own risk for whatever you like. 4 | // In case there are jurisdictions that don't support putting things in the public domain you can also consider it to 5 | // be "dual licensed" under the BSD, MIT and Apache licenses, if you want to. This code is trivial anyway. Consider it 6 | // an example on how to get the endian conversion functions on different platforms. 7 | 8 | #ifndef PORTABLE_ENDIAN_H__ 9 | #define PORTABLE_ENDIAN_H__ 10 | 11 | #if (defined(_WIN16) || defined(_WIN32) || defined(_WIN64)) && !defined(__WINDOWS__) 12 | 13 | # define __WINDOWS__ 14 | 15 | #endif 16 | 17 | #if defined(__linux__) || defined(__CYGWIN__) 18 | 19 | # include 20 | 21 | #elif defined(__APPLE__) 22 | 23 | # include 24 | 25 | # define htobe16(x) OSSwapHostToBigInt16(x) 26 | # define htole16(x) OSSwapHostToLittleInt16(x) 27 | # define be16toh(x) OSSwapBigToHostInt16(x) 28 | # define le16toh(x) OSSwapLittleToHostInt16(x) 29 | 30 | # define htobe32(x) OSSwapHostToBigInt32(x) 31 | # define htole32(x) OSSwapHostToLittleInt32(x) 32 | # define be32toh(x) OSSwapBigToHostInt32(x) 33 | # define le32toh(x) OSSwapLittleToHostInt32(x) 34 | 35 | # define htobe64(x) OSSwapHostToBigInt64(x) 36 | # define htole64(x) OSSwapHostToLittleInt64(x) 37 | # define be64toh(x) OSSwapBigToHostInt64(x) 38 | # define le64toh(x) OSSwapLittleToHostInt64(x) 39 | 40 | # define __BYTE_ORDER BYTE_ORDER 41 | # define __BIG_ENDIAN BIG_ENDIAN 42 | # define __LITTLE_ENDIAN LITTLE_ENDIAN 43 | # define __PDP_ENDIAN PDP_ENDIAN 44 | 45 | #elif defined(__OpenBSD__) 46 | 47 | # include 48 | 49 | #elif defined(__NetBSD__) || defined(__FreeBSD__) || defined(__DragonFly__) 50 | 51 | # include 52 | 53 | # define be16toh(x) betoh16(x) 54 | # define le16toh(x) letoh16(x) 55 | 56 | # define be32toh(x) betoh32(x) 57 | # define le32toh(x) letoh32(x) 58 | 59 | # define be64toh(x) betoh64(x) 60 | # define le64toh(x) letoh64(x) 61 | 62 | #elif defined(__WINDOWS__) 63 | 64 | # include 65 | # include 66 | 67 | # if BYTE_ORDER == LITTLE_ENDIAN 68 | 69 | # define htobe16(x) htons(x) 70 | # define htole16(x) (x) 71 | # define be16toh(x) ntohs(x) 72 | # define le16toh(x) (x) 73 | 74 | # define htobe32(x) htonl(x) 75 | # define htole32(x) (x) 76 | # define be32toh(x) ntohl(x) 77 | # define le32toh(x) (x) 78 | 79 | # define htobe64(x) htonll(x) 80 | # define htole64(x) (x) 81 | # define be64toh(x) ntohll(x) 82 | # define le64toh(x) (x) 83 | 84 | # elif BYTE_ORDER == BIG_ENDIAN 85 | 86 | /* that would be xbox 360 */ 87 | # define htobe16(x) (x) 88 | # define htole16(x) __builtin_bswap16(x) 89 | # define be16toh(x) (x) 90 | # define le16toh(x) __builtin_bswap16(x) 91 | 92 | # define htobe32(x) (x) 93 | # define htole32(x) __builtin_bswap32(x) 94 | # define be32toh(x) (x) 95 | # define le32toh(x) __builtin_bswap32(x) 96 | 97 | # define htobe64(x) (x) 98 | # define htole64(x) __builtin_bswap64(x) 99 | # define be64toh(x) (x) 100 | # define le64toh(x) __builtin_bswap64(x) 101 | 102 | # else 103 | 104 | # error byte order not supported 105 | 106 | # endif 107 | 108 | # define __BYTE_ORDER BYTE_ORDER 109 | # define __BIG_ENDIAN BIG_ENDIAN 110 | # define __LITTLE_ENDIAN LITTLE_ENDIAN 111 | # define __PDP_ENDIAN PDP_ENDIAN 112 | 113 | #else 114 | 115 | # error platform not supported 116 | 117 | #endif 118 | 119 | #endif 120 | -------------------------------------------------------------------------------- /bitmap/main/generate_bitmap.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "../src/bitmap.h" 4 | #include "../src/util.h" 5 | 6 | int main(int argc, char **argv){ 7 | FILE *f; 8 | 9 | if (argc == 2) { 10 | f = fopen(argv[1], "r"); 11 | } else if (argc < 2 && !isatty(fileno(stdin))) { 12 | f = stdin; 13 | } else { 14 | printf("Usage: \n" 15 | " %s \n" 16 | " echo | %s\n", argv[0], argv[0]); 17 | return 1; 18 | } 19 | 20 | if(f == NULL) { 21 | perror("Error: File not opened"); 22 | return(-1); 23 | } 24 | 25 | uint8_t *bitmap = init_bitmap(); 26 | int ret = apply_file_to_bitmap(bitmap, f); 27 | if (ret == GZ_TRUNCATED) { 28 | fprintf(stderr, "gzip stream truncated\n"); 29 | return GZ_TRUNCATED; 30 | } 31 | 32 | write_bitmap(bitmap, stdout); 33 | 34 | return 0; 35 | } 36 | -------------------------------------------------------------------------------- /bitmap/main/test.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "../lib/minunit.h" 12 | #include "../src/filter.h" 13 | #include "../src/bitmap.h" 14 | #include "../src/util.h" 15 | #include "../src/packfile.h" 16 | #include "portable_endian.h" 17 | 18 | /*--------------------------------------------------------------------*/ 19 | 20 | int tests_run = 0; 21 | 22 | /*--------------------------------------------------------------------*/ 23 | 24 | /* 25 | * Returns a file descriptor pointing towards a pipe containing solely the 26 | * passed-in string. 27 | * 28 | * Works by forking the current process and writing the string to a pipe in the 29 | * child process. 30 | */ 31 | FILE *get_pipe(char *string) { 32 | int p[2]; 33 | if (pipe(p) != 0) { 34 | perror("Error: pipe failed"); 35 | exit(-1); 36 | } 37 | if (!fork()) { 38 | int len = strlen(string); 39 | for (int written = 0; len > written;) { 40 | int result = write(p[1], string + written, len - written); 41 | if (result < 0) { 42 | perror("Error converting string to pipe\n"); 43 | exit(1); 44 | } 45 | written += result; 46 | } 47 | exit(0); 48 | } 49 | 50 | close(p[1]); 51 | FILE *f = fdopen(p[0], "r"); 52 | return f; 53 | } 54 | 55 | /*--------------------------------------------------------------------*/ 56 | /** 57 | * Sets the bits in the bitmap for all the 4grams stored in the string. 58 | */ 59 | void apply_string_to_bitmap(uint8_t *bitmap, char *string) { 60 | FILE *file = get_pipe(string); 61 | apply_file_to_bitmap(bitmap, file); 62 | fclose(file); 63 | } 64 | 65 | int bitmaps_are_the_same(uint8_t *bitmap1, uint8_t *bitmap2) { 66 | for (int i = 0; i < SIZEOF_BITMAP; i++) { 67 | if (bitmap1[i] != bitmap2[i]) { 68 | return 0; 69 | } 70 | } 71 | return 1; 72 | } 73 | 74 | /*--------------------------------------------------------------------*/ 75 | 76 | static char *test_init_bitmap() { 77 | uint8_t *bitmap = init_bitmap(); 78 | for (size_t i = 0; i < SIZEOF_BITMAP; i++) { 79 | mu_assert("Initialized bitmap has nonzero byte", bitmap[i] == 0); 80 | } 81 | free(bitmap); 82 | return 0; 83 | } 84 | 85 | /*--------------------------------------------------------------------*/ 86 | 87 | static char *test_set_bit() { 88 | uint8_t *bitmap = init_bitmap(); 89 | set_bit(bitmap, 0); 90 | mu_assert("Bitmap bit set failed", bitmap[0] == 1); 91 | set_bit(bitmap, 1); 92 | mu_assert("Bitmap bit set failed", bitmap[0] == 0b11); 93 | set_bit(bitmap, 15); 94 | mu_assert("Bitmap bit set failed", bitmap[1] == 0b10000000); 95 | set_bit(bitmap, 8 * SIZEOF_BITMAP - 1); 96 | mu_assert("Bitmap bit set failed", bitmap[SIZEOF_BITMAP - 1] == 0b10000000); 97 | free(bitmap); 98 | return 0; 99 | } 100 | 101 | static char *test_string_to_bitmap_empty() { 102 | uint8_t *bitmap = init_bitmap(); 103 | apply_string_to_bitmap(bitmap, ""); 104 | for (size_t i = 0; i < SIZEOF_BITMAP; i++) { 105 | mu_assert("Extra bits added in string to bitmap", bitmap[i] == 0); 106 | } 107 | free(bitmap); 108 | return 0; 109 | } 110 | 111 | static char *test_string_to_bitmap_tiny() { 112 | uint8_t *bitmap = init_bitmap(); 113 | apply_string_to_bitmap(bitmap, "as"); 114 | for (size_t i = 0; i < SIZEOF_BITMAP; i++) { 115 | mu_assert("Extra bits added in string to bitmap", bitmap[i] == 0); 116 | } 117 | free(bitmap); 118 | return 0; 119 | } 120 | 121 | static char *test_string_to_bitmap_nchars() { 122 | uint8_t *bitmap = init_bitmap(); 123 | char str[NGRAM_CHARS + 1]; 124 | int n = 0; 125 | for (int i = 0; i < NGRAM_CHARS; i++) { 126 | str[i] = 'a'; 127 | n = (n << NGRAM_CHAR_BITS) + ('a' & CHAR_MASK); 128 | } 129 | str[NGRAM_CHARS] = '\0'; 130 | apply_string_to_bitmap(bitmap, str); 131 | mu_assert("test_string_to_bitmap_nchars: bit unset", bitmap[n / 8] == 1 << (n % 8)); 132 | for (size_t i = 0; i < SIZEOF_BITMAP; i++) { 133 | if (i != n / 8) { 134 | mu_assert("test_string_to_bitmap_nchars: extra bit set", bitmap[i] == 0); 135 | } 136 | } 137 | free(bitmap); 138 | return 0; 139 | } 140 | 141 | static char *test_string_to_bitmap_long() { 142 | uint8_t *bitmap = init_bitmap(); 143 | apply_string_to_bitmap(bitmap, "aaaaaaaaaaaaaaaaaaaz"); 144 | 145 | int n = 0; 146 | for (int i = 0; i < NGRAM_CHARS; i++) { 147 | n = (n << NGRAM_CHAR_BITS) + ('a' & CHAR_MASK); 148 | } 149 | int m = 0; 150 | for (int i = 0; i < NGRAM_CHARS - 1; i++) { 151 | m = (m << NGRAM_CHAR_BITS) + ('a' & CHAR_MASK); 152 | } 153 | m = (m << NGRAM_CHAR_BITS) + ('z' & CHAR_MASK); 154 | 155 | mu_assert("test_string_to_bitmap_long: n unset", bitmap[n / 8] == 1 << (n % 8)); 156 | mu_assert("test_string_to_bitmap_long: m unset", bitmap[m / 8] == 1 << (m % 8)); 157 | for (size_t i = 0; i < SIZEOF_BITMAP; i++) { 158 | if (i != n / 8 && i != m / 8) { 159 | mu_assert("test_string_to_bitmap_long: extra bit set", bitmap[i] == 0); 160 | } 161 | } 162 | free(bitmap); 163 | return 0; 164 | } 165 | 166 | static char *test_string_to_bitmap() { 167 | mu_run_test(test_string_to_bitmap_empty); 168 | mu_run_test(test_string_to_bitmap_tiny); 169 | mu_run_test(test_string_to_bitmap_nchars); 170 | mu_run_test(test_string_to_bitmap_long); 171 | return 0; 172 | } 173 | 174 | static char *test_compress_bitmap() { 175 | // create a simple bitmap 176 | uint8_t *bitmap = init_bitmap(); 177 | set_bit(bitmap, 0); 178 | set_bit(bitmap, 8); 179 | 180 | // pick a file path to pretend we're compressing 181 | char *fake_tmpfile_path = "/tmp/asdf"; 182 | int64_t fake_mtime = 0; 183 | 184 | char bitmap_tmpfile_path[PATH_MAX] = "/tmp/4gramtmpfile.XXXXXX"; 185 | FILE *bitmap_file = fdopen(mkstemp(bitmap_tmpfile_path), "w"); 186 | // compress to a file in the bitmap store 187 | mu_assert("Error writing to tmpfile", bitmap_file != NULL); 188 | compress_to_fp(bitmap, bitmap_file, fake_tmpfile_path, fake_mtime); 189 | fclose(bitmap_file); 190 | 191 | // try decompressing it 192 | uint8_t *decompressed = init_bitmap(); 193 | int decompress_ret = decompress_file(decompressed, bitmap_tmpfile_path); 194 | mu_assert("Decompress error", decompress_ret == 0); 195 | 196 | // check that it decompressed correctly 197 | for (int i = 0; i < SIZEOF_BITMAP; i++) { 198 | // printf("%d %d\n", bitmap[i], decompressed[i]); 199 | mu_assert("Decompressed bitmap not the same", bitmap[i] == decompressed[i]); 200 | } 201 | 202 | free(bitmap); 203 | free(decompressed); 204 | return 0; 205 | } 206 | 207 | static char *test_compress_to_file_no_collision() { 208 | uint8_t *bitmap = init_bitmap(); 209 | char *file_path = "/tmp/nonexistent"; 210 | char template[] = "/tmp/4gramtmpdir.XXXXXX"; 211 | char *store = mkdtemp(template); 212 | mu_assert("Could not create tmpdir", store != NULL); 213 | 214 | int ret = compress_to_file(bitmap, file_path, 0, store); 215 | mu_assert("Compress to file failed", ret == 0); 216 | 217 | char hashed_filename[21]; 218 | get_hash(file_path, strlen(file_path), hashed_filename); 219 | strcat(hashed_filename, "_000"); 220 | char *path_to_bitmap_file = add_path_parts(store, hashed_filename); 221 | 222 | mu_assert("Compressed bitmap file doesn't exist", 223 | access(path_to_bitmap_file, F_OK) == 0); 224 | uint8_t *decompressed = init_bitmap(); 225 | ret = decompress_file(decompressed, path_to_bitmap_file); 226 | mu_assert("Error occurred in decompression", ret == 0); 227 | for (int i = 0; i < SIZEOF_BITMAP; i++) { 228 | mu_assert("Decompressed bitmap not the same", bitmap[i] == decompressed[i]); 229 | } 230 | free(bitmap); 231 | free(decompressed); 232 | free(path_to_bitmap_file); 233 | return 0; 234 | } 235 | 236 | static char *test_compress_to_file_with_collision() { 237 | uint8_t *bitmap = init_bitmap(); 238 | char *file_path = "/tmp/nonexistent"; 239 | char template[] = "/tmp/4gramtmpdir.XXXXXX"; 240 | char *store = mkdtemp(template); 241 | mu_assert("Could not create tmpdir", store != NULL); 242 | int num_files = 3; 243 | 244 | for (int i = 0; i < num_files; i++) { 245 | int ret = compress_to_file(bitmap, file_path, 0, store); 246 | mu_assert("Compress to file failed", ret == 0); 247 | 248 | char cache_file_name[21]; 249 | char num_extension[5]; 250 | sprintf(num_extension, "_%.3d", i); 251 | get_hash(file_path, strlen(file_path), cache_file_name); 252 | strcat(cache_file_name, num_extension); 253 | char *path_to_bitmap_file = add_path_parts(store, cache_file_name); 254 | 255 | mu_assert("Compressed bitmap file doesn't exist", 256 | access(path_to_bitmap_file, F_OK) == 0); 257 | uint8_t *decompressed = init_bitmap(); 258 | ret = decompress_file(decompressed, path_to_bitmap_file); 259 | mu_assert("Error occurred in decompression", ret == 0); 260 | for (int i = 0; i < SIZEOF_BITMAP; i++) { 261 | mu_assert("Decompressed bitmap not the same", bitmap[i] == decompressed[i]); 262 | } 263 | free(decompressed); 264 | free(path_to_bitmap_file); 265 | } 266 | free(bitmap); 267 | return 0; 268 | } 269 | 270 | static char *test_compress_to_file() { 271 | mu_run_test(test_compress_to_file_no_collision); 272 | mu_run_test(test_compress_to_file_with_collision); 273 | return 0; 274 | } 275 | 276 | static char *test_file_packing_single_file() { 277 | char template[] = "/tmp/4gramtmpdir.XXXXXX"; 278 | char template2[] = "/tmp/4gramtmpdir.XXXXXX"; 279 | char *store = mkdtemp(template); 280 | char *tmpfile_dir = mkdtemp(template2); 281 | mu_assert("Could not create tmpdir", store != NULL); 282 | char *tmpfile_path = add_path_parts(tmpfile_dir, "1.txt"); 283 | 284 | // write a small temporary file 285 | FILE *tmpfile = fopen(tmpfile_path, "w"); 286 | mu_assert("Could not create tmpfile", tmpfile != NULL); 287 | fputs("asdf", tmpfile); 288 | fclose(tmpfile); 289 | 290 | // create a bitmap for the file 291 | uint8_t *bitmap = init_bitmap(); 292 | tmpfile = fopen(tmpfile_path, "r"); 293 | apply_file_to_bitmap(bitmap, tmpfile); 294 | fclose(tmpfile); 295 | int64_t mtime = get_mtime(tmpfile_path); 296 | 297 | // figure out the hash of the file 298 | char loose_file_name[PATH_MAX]; 299 | get_hash(tmpfile_path, strlen(tmpfile_path), loose_file_name); 300 | 301 | // compress the bitmap to a file in the store 302 | int ret = compress_to_file(bitmap, tmpfile_path, mtime, store); 303 | mu_assert("Error compressing", ret == 0); 304 | 305 | // make sure it exists 306 | mu_assert("Compressed bitmap file doesn't exist", 307 | access(loose_file_name, F_OK)); 308 | 309 | pack_loose_files_in_subdir(store); 310 | 311 | uint8_t *read_bitmap = read_from_packfile(tmpfile_path, mtime, store); 312 | mu_assert("Could not find bitmap in packfile", read_bitmap != NULL); 313 | for (size_t i = 0; i < SIZEOF_BITMAP; i++) { 314 | mu_assert("Wrong bitmap returned", (bitmap[i] == read_bitmap[i])); 315 | } 316 | 317 | // make sure loose file was removed 318 | DIR *dir = opendir(store); 319 | mu_assert("Error opening bitmap store directory", dir != NULL); 320 | struct dirent *entry; 321 | while ((entry = readdir(dir))) { 322 | if (strcmp(entry->d_name, PACKFILE_NAME) == 0 323 | || strcmp(entry->d_name, PACKFILE_INDEX_NAME) == 0 324 | || strcmp(entry->d_name, ".") == 0 325 | || strcmp(entry->d_name, "..") == 0) { 326 | continue; 327 | } 328 | mu_assert("Loose file still in bitmap store directory", 0); 329 | } 330 | closedir(dir); 331 | 332 | free(tmpfile_path); 333 | free(bitmap); 334 | free(read_bitmap); 335 | return 0; 336 | } 337 | 338 | static char *test_file_packing_multiple_files() { 339 | char template[] = "/tmp/4gramtmpdir.XXXXXX"; 340 | char template2[] = "/tmp/4gramtmpdir.XXXXXX"; 341 | char *store = mkdtemp(template); 342 | char *tmpfile_dir = mkdtemp(template2); 343 | mu_assert("Could not create tmpdir", store != NULL); 344 | mu_assert("Could not create tmpdir", tmpfile_dir != NULL); 345 | int num_files = 10; 346 | uint8_t *bitmaps[num_files]; 347 | char *tmpfile_paths[num_files]; 348 | for (int i = 0; i < num_files; i++) { 349 | char name[PATH_MAX]; 350 | sprintf(name, "%d.txt", i); 351 | tmpfile_paths[i] = add_path_parts(tmpfile_dir, name); 352 | FILE *tmpfile = fopen(tmpfile_paths[i], "w"); 353 | mu_assert("Could not create tmpfile", tmpfile != NULL); 354 | fprintf(tmpfile, "%d", i * 1000); 355 | fclose(tmpfile); 356 | tmpfile = fopen(tmpfile_paths[i], "r"); 357 | bitmaps[i] = init_bitmap(); 358 | apply_file_to_bitmap(bitmaps[i], tmpfile); 359 | fclose(tmpfile); 360 | char loose_file_name[PATH_MAX]; 361 | get_hash(tmpfile_paths[i], strlen(tmpfile_paths[i]), loose_file_name); 362 | int64_t mtime = get_mtime(tmpfile_paths[i]); 363 | int ret = compress_to_file(bitmaps[i], tmpfile_paths[i], mtime, store); 364 | mu_assert("Error compressing", ret == 0); 365 | } 366 | pack_loose_files_in_subdir(store); 367 | for (int i = 0; i < num_files; i++) { 368 | int64_t mtime = get_mtime(tmpfile_paths[i]); 369 | uint8_t *read_bitmap = read_from_packfile(tmpfile_paths[i], mtime, store); 370 | mu_assert("Could not find bitmap in packfile", read_bitmap != NULL); 371 | for (size_t j = 0; j < SIZEOF_BITMAP; j++) { 372 | mu_assert("Wrong bitmap returned", bitmaps[i][j] == read_bitmap[j]); 373 | } 374 | free(read_bitmap); 375 | free(bitmaps[i]); 376 | free(tmpfile_paths[i]); 377 | } 378 | 379 | // make sure loose file was removed 380 | DIR *dir = opendir(store); 381 | mu_assert("Error opening bitmap store directory", dir != NULL); 382 | struct dirent *entry; 383 | while ((entry = readdir(dir))) { 384 | if (strcmp(entry->d_name, PACKFILE_NAME) == 0 385 | || strcmp(entry->d_name, PACKFILE_INDEX_NAME) == 0 386 | || strcmp(entry->d_name, ".") == 0 387 | || strcmp(entry->d_name, "..") == 0) { 388 | continue; 389 | } 390 | mu_assert("Loose file still in bitmap store directory", 0); 391 | } 392 | closedir(dir); 393 | 394 | return 0; 395 | } 396 | 397 | static char *test_file_packing_existing_packfile() { 398 | char template[] = "/tmp/4gramtmpdir.XXXXXX"; 399 | char template2[] = "/tmp/4gramtmpdir.XXXXXX"; 400 | char *store = mkdtemp(template); 401 | char *tmpfile_dir = mkdtemp(template2); 402 | mu_assert("Could not create tmpdir", store != NULL); 403 | mu_assert("Could not create tmpdir", tmpfile_dir != NULL); 404 | int num_files = 20; 405 | uint8_t *bitmaps[num_files]; 406 | char *tmpfile_paths[num_files]; 407 | for (int i = 0; i < num_files; i++) { 408 | char name[PATH_MAX]; 409 | sprintf(name, "%d.txt", i); 410 | tmpfile_paths[i] = add_path_parts(tmpfile_dir, name); 411 | FILE *tmpfile = fopen(tmpfile_paths[i], "w"); 412 | mu_assert("Could not create tmpfile", tmpfile != NULL); 413 | fprintf(tmpfile, "%d", i * 1000); 414 | fclose(tmpfile); 415 | tmpfile = fopen(tmpfile_paths[i], "r"); 416 | bitmaps[i] = init_bitmap(); 417 | apply_file_to_bitmap(bitmaps[i], tmpfile); 418 | fclose(tmpfile); 419 | char loose_file_name[PATH_MAX]; 420 | get_hash(tmpfile_paths[i], strlen(tmpfile_paths[i]), loose_file_name); 421 | int64_t mtime = get_mtime(tmpfile_paths[i]); 422 | int ret = compress_to_file(bitmaps[i], tmpfile_paths[i], mtime, store); 423 | mu_assert("Error compressing", ret == 0); 424 | if (i == num_files / 2) { 425 | pack_loose_files_in_subdir(store); 426 | } 427 | } 428 | pack_loose_files_in_subdir(store); 429 | for (int i = 0; i < num_files; i++) { 430 | int64_t mtime = get_mtime(tmpfile_paths[i]); 431 | uint8_t *read_bitmap = read_from_packfile(tmpfile_paths[i], mtime, store); 432 | mu_assert("Could not find bitmap in packfile", read_bitmap != NULL); 433 | for (size_t j = 0; j < SIZEOF_BITMAP; j++) { 434 | mu_assert("Wrong bitmap returned", bitmaps[i][j] == read_bitmap[j]); 435 | } 436 | free(read_bitmap); 437 | free(bitmaps[i]); 438 | free(tmpfile_paths[i]); 439 | } 440 | 441 | // make sure loose file was removed 442 | DIR *dir = opendir(store); 443 | mu_assert("Error opening bitmap store directory", dir != NULL); 444 | struct dirent *entry; 445 | while ((entry = readdir(dir))) { 446 | if (strcmp(entry->d_name, PACKFILE_NAME) == 0 447 | || strcmp(entry->d_name, PACKFILE_INDEX_NAME) == 0 448 | || strcmp(entry->d_name, ".") == 0 449 | || strcmp(entry->d_name, "..") == 0) { 450 | continue; 451 | } 452 | mu_assert("Loose file still in bitmap store directory", 0); 453 | } 454 | closedir(dir); 455 | 456 | return 0; 457 | } 458 | 459 | static char *test_file_packing() { 460 | mu_run_test(test_file_packing_single_file); 461 | mu_run_test(test_file_packing_multiple_files); 462 | mu_run_test(test_file_packing_existing_packfile); 463 | return 0; 464 | } 465 | 466 | static char *test_filter_checks_emptydir() { 467 | char template[] = "/tmp/4gramtmpdir.XXXXXX"; 468 | char template2[] = "/tmp/4gramtmpdir.XXXXXX"; 469 | char *store = mkdtemp(template); 470 | char *tmpfile_dir = mkdtemp(template2); 471 | mu_assert("Could not create tmpdir", store != NULL); 472 | mu_assert("Could not create tmpdir", tmpfile_dir != NULL); 473 | 474 | // write a small temporary file 475 | char *tmpfile_path = add_path_parts(tmpfile_dir, "1.txt"); 476 | FILE *tmpfile = fopen(tmpfile_path, "w"); 477 | mu_assert("Could not create tmpfile", tmpfile != NULL); 478 | fputs("asdf", tmpfile); 479 | fclose(tmpfile); 480 | int64_t mtime = get_mtime(tmpfile_path); 481 | 482 | uint8_t *bitmap = init_bitmap(); 483 | mu_assert("Should not detect loose file", 484 | check_loose_files(tmpfile_path, mtime, bitmap, store) != 0); 485 | mu_assert("Should not detect entry in pack file", 486 | check_pack_files(tmpfile_path, mtime, bitmap, store) != 0); 487 | free(tmpfile_path); 488 | free(bitmap); 489 | return 0; 490 | } 491 | 492 | static char *test_filter_checks_loose_file() { 493 | char template[] = "/tmp/4gramtmpdir.XXXXXX"; 494 | char template2[] = "/tmp/4gramtmpdir.XXXXXX"; 495 | char *store = mkdtemp(template); 496 | char *tmpfile_dir = mkdtemp(template2); 497 | mu_assert("Could not create 4gramtmpdir", store != NULL); 498 | mu_assert("Could not create 4gramtmpdir", tmpfile_dir != NULL); 499 | 500 | // write a small temporary file 501 | char *tmpfile_path = add_path_parts(tmpfile_dir, "1.txt"); 502 | FILE *tmpfile = fopen(tmpfile_path, "w"); 503 | mu_assert("Could not create tmpfile", tmpfile != NULL); 504 | fputs("asdf", tmpfile); 505 | fclose(tmpfile); 506 | int64_t mtime = get_mtime(tmpfile_path); 507 | 508 | // compress it to loose file 509 | uint8_t *bitmap = init_bitmap(); 510 | tmpfile = fopen(tmpfile_path, "r"); 511 | mu_assert("Could not open tmpfile", tmpfile != NULL); 512 | apply_file_to_bitmap(bitmap, tmpfile); 513 | fclose(tmpfile); 514 | compress_to_file(bitmap, tmpfile_path, mtime, store); 515 | 516 | uint8_t *read_bitmap = init_bitmap(); 517 | mu_assert("Should detect loose file", 518 | check_loose_files(tmpfile_path, mtime, bitmap, store) == 0); 519 | mu_assert("Should not detect entry in pack file", 520 | check_pack_files(tmpfile_path, mtime, bitmap, store) != 0); 521 | free(bitmap); 522 | free(read_bitmap); 523 | free(tmpfile_path); 524 | return 0; 525 | } 526 | 527 | static char *test_filter_checks_packfile() { 528 | char template[] = "/tmp/4gramtmpdir.XXXXXX"; 529 | char template2[] = "/tmp/4gramtmpdir.XXXXXX"; 530 | char *store = mkdtemp(template); 531 | char *tmpfile_dir = mkdtemp(template2); 532 | mu_assert("Could not create 4gramtmpdir", store != NULL); 533 | mu_assert("Could not create 4gramtmpdir", tmpfile_dir != NULL); 534 | 535 | // write a small temporary file 536 | char *tmpfile_path = add_path_parts(tmpfile_dir, "1.txt"); 537 | FILE *tmpfile = fopen(tmpfile_path, "w"); 538 | mu_assert("Could not create tmpfile", tmpfile != NULL); 539 | fputs("asdf", tmpfile); 540 | fclose(tmpfile); 541 | int64_t mtime = get_mtime(tmpfile_path); 542 | 543 | // compress it to loose file 544 | uint8_t *bitmap = init_bitmap(); 545 | tmpfile = fopen(tmpfile_path, "r"); 546 | mu_assert("Could not open tmpfile", tmpfile != NULL); 547 | apply_file_to_bitmap(bitmap, tmpfile); 548 | fclose(tmpfile); 549 | compress_to_file(bitmap, tmpfile_path, mtime, store); 550 | 551 | pack_loose_files_in_subdir(store); 552 | 553 | uint8_t *read_bitmap = init_bitmap(); 554 | mu_assert("Should not detect loose file", 555 | check_loose_files(tmpfile_path, mtime, bitmap, store) != 0); 556 | mu_assert("Should detect entry in pack file", 557 | check_pack_files(tmpfile_path, mtime, bitmap, store) == 0); 558 | free(bitmap); 559 | free(read_bitmap); 560 | free(tmpfile_path); 561 | return 0; 562 | } 563 | 564 | static char *test_filter_checks() { 565 | mu_run_test(test_filter_checks_emptydir); 566 | mu_run_test(test_filter_checks_loose_file); 567 | mu_run_test(test_filter_checks_packfile); 568 | return 0; 569 | } 570 | 571 | static char *test_packfile_locking() { 572 | uint8_t *bitmap = init_bitmap(); 573 | char *file_path = "/tmp/nonexistent"; 574 | char template[] = "/tmp/4gramtmpdir.XXXXXX"; 575 | char *store = mkdtemp(template); 576 | mu_assert("Could not create tmpdir", store != NULL); 577 | 578 | int ret = compress_to_file(bitmap, file_path, 0, store); 579 | mu_assert("Compress to file failed", ret == 0); 580 | 581 | char hashed_filename[21]; 582 | get_hash(file_path, strlen(file_path), hashed_filename); 583 | strcat(hashed_filename, "_000"); 584 | char *path_to_bitmap_file = add_path_parts(store, hashed_filename); 585 | mu_assert("Loose file not created\n", access(path_to_bitmap_file, F_OK) == 0); 586 | 587 | char *packfile_path = add_path_parts(store, PACKFILE_NAME); 588 | 589 | mu_assert("Could not lock packfile", lockfile_create(packfile_path, 0, 0) == 0); 590 | 591 | if (fork() == 0) { 592 | pack_loose_files_in_subdir(store); 593 | exit(0); 594 | } 595 | int wait_status; 596 | wait(&wait_status); 597 | 598 | mu_assert("Loose files were packed despite lock", 599 | access(path_to_bitmap_file, F_OK) == 0); 600 | lockfile_remove(packfile_path); 601 | free(path_to_bitmap_file); 602 | free(packfile_path); 603 | free(bitmap); 604 | return 0; 605 | } 606 | 607 | static char *test_get_4gram_indices() { 608 | char *strings[] = { 609 | "qwertyuiop", 610 | "asdfghjkl", 611 | "zxcvbnm!@#$%^&*()", 612 | }; 613 | for (int i = 0; i < 3; i++) { 614 | uint8_t *bitmap = init_bitmap(); 615 | apply_string_to_bitmap(bitmap, strings[i]); 616 | int *indices = get_4gram_indices(strings[i]); 617 | int len = strlen(strings[i]) - NGRAM_CHARS + 1; 618 | for (int j = 0; j < len; j++) { 619 | int k = indices[j]; 620 | mu_assert("Invalid 4gram indices", get_bit(bitmap, k)); 621 | } 622 | free(bitmap); 623 | free(indices); 624 | } 625 | return 0; 626 | } 627 | 628 | static char *test_corruption_size() { 629 | uint8_t *bitmap = init_bitmap(); 630 | apply_string_to_bitmap(bitmap, "hello"); 631 | char *orig_filename = "should be 12"; 632 | FILE *temp = fopen("tmp.txt", "w"); 633 | 634 | uint16_t len = strlen(orig_filename); 635 | void* compressed = malloc(131616); 636 | uint32_t compressed_size = ZSTD_compress(compressed, 131616, 637 | bitmap, SIZEOF_BITMAP, 3); 638 | 639 | uint16_t len_be = htobe16(len); 640 | uint32_t compressed_size_be = htobe32(compressed_size); 641 | fwrite(&len_be, sizeof(uint16_t), 1, temp); 642 | fwrite(orig_filename, len, 1, temp); 643 | fwrite(&compressed_size_be, sizeof(uint32_t), 1, temp); 644 | fwrite(compressed, compressed_size, 1, temp); 645 | fclose(temp); 646 | 647 | FILE *temp2 = fopen("tmp.txt", "r"); 648 | fseek(temp2, 0, SEEK_END); 649 | unsigned long written_size = ftell(temp2); 650 | rewind(temp2); 651 | fclose(temp2); 652 | remove("tmp.txt"); 653 | mu_assert("Size of file not same as written size", 654 | (len + compressed_size + 6 == written_size)); 655 | free(compressed); 656 | free(bitmap); 657 | return 0; 658 | } 659 | 660 | static char *test_loose_file_locking() { 661 | uint8_t *bitmap = init_bitmap(); 662 | char *filename = "/tmp/nonexistent"; 663 | char template[] = "/tmp/4gramtmpdir.XXXXXX"; 664 | char *store = mkdtemp(template); 665 | mu_assert("Could not create tmpdir", store != NULL); 666 | compress_to_file(bitmap, filename, 0, store); 667 | 668 | char hash[21]; 669 | uint16_t len = strlen(filename); 670 | get_hash(filename, len, hash); 671 | char loose_file_name[PATH_MAX]; 672 | strcpy(loose_file_name, hash); 673 | strcat(loose_file_name, "_000"); 674 | char *lockfile_path = get_lock_path(store, loose_file_name); 675 | mu_assert("Could not lock file", lockfile_create(lockfile_path, 0, 0) == 0); 676 | 677 | if (fork() == 0) { 678 | pack_loose_files_in_subdir(store); 679 | exit(0); 680 | } 681 | int wait_status; 682 | wait(&wait_status); 683 | 684 | char *loose_file_path = add_path_parts(store, loose_file_name); 685 | mu_assert("Loose file was packed despite lock", 686 | access(loose_file_path, F_OK) == 0); 687 | free(loose_file_path); 688 | lockfile_remove(lockfile_path); 689 | free(lockfile_path); 690 | free(bitmap); 691 | 692 | return 0; 693 | } 694 | 695 | static char *test_strings_to_sorted_indices() { 696 | char *strings[] = { 697 | "qwertyuiop", 698 | "asdfghjkl", 699 | "zxcvbnm!@#$%^&*()", 700 | }; 701 | uint8_t *bitmap = init_bitmap(); 702 | for (int i = 0; i < 3; i++) { 703 | apply_string_to_bitmap(bitmap, strings[i]); 704 | } 705 | struct intarray indices; 706 | indices = strings_to_sorted_indices(strings, 3); 707 | for (int i = 0; i < POSSIBLE_NGRAMS; i++) { 708 | if (get_bit(bitmap, i)) { 709 | int contained = 0; 710 | for (int j = 0; j < indices.length; j++) { 711 | if (indices.data[j] == i) { 712 | contained = 1; 713 | } 714 | } 715 | mu_assert("strings_to_sorted_indices: Index not found", contained); 716 | } 717 | } 718 | for (int i = 1; i < indices.length; i++) { 719 | mu_assert("strings_to_sorted_indices: unsorted", 720 | indices.data[i-1] <= indices.data[i]); 721 | } 722 | free(bitmap); 723 | free_intarray(indices); 724 | return 0; 725 | } 726 | 727 | static char *test_mtime() { 728 | char template[] = "/tmp/4gramtmpdir.XXXXXX"; 729 | char template2[] = "/tmp/4gramtmpdir.XXXXXX"; 730 | char *store = mkdtemp(template); 731 | char *tmpfile_dir = mkdtemp(template2); 732 | mu_assert("Could not create tmpdir", store != NULL); 733 | char *tmpfile_path = add_path_parts(tmpfile_dir, "1.txt"); 734 | 735 | // write a small temporary file 736 | FILE *tmpfile = fopen(tmpfile_path, "w"); 737 | mu_assert("Could not create tmpfile", tmpfile != NULL); 738 | fputs("qwertyuiop", tmpfile); 739 | fclose(tmpfile); 740 | int64_t mtime = 0; 741 | 742 | // create a bitmap for the file 743 | uint8_t *bitmap = init_bitmap(); 744 | tmpfile = fopen(tmpfile_path, "r"); 745 | apply_file_to_bitmap(bitmap, tmpfile); 746 | fclose(tmpfile); 747 | 748 | // figure out the hash of the file 749 | char loose_file_name[PATH_MAX]; 750 | get_hash(tmpfile_path, strlen(tmpfile_path), loose_file_name); 751 | 752 | // compress the bitmap to a file in the store 753 | int ret = compress_to_file(bitmap, tmpfile_path, mtime, store); 754 | mu_assert("Error compressing", ret == 0); 755 | 756 | // make sure it exists 757 | mu_assert("Compressed bitmap file doesn't exist", 758 | access(loose_file_name, F_OK)); 759 | 760 | uint8_t *bitmap1 = init_bitmap(); 761 | // make sure we can access the file with our mtime 762 | mu_assert("Could not access loose_file with mtime 0", 763 | check_loose_files(tmpfile_path, mtime, bitmap1, store) == 0); 764 | mu_assert("Didn't get same bitmap back", 765 | bitmaps_are_the_same(bitmap, bitmap1)); 766 | free(bitmap1); 767 | bitmap1 = init_bitmap(); 768 | mu_assert("Got bitmap with invalid mtime", 769 | check_loose_files(tmpfile_path, 123, bitmap1, store) != 0); 770 | free(bitmap1); 771 | 772 | pack_loose_files_in_subdir(store); 773 | 774 | uint8_t *read_bitmap = read_from_packfile(tmpfile_path, mtime, store); 775 | mu_assert("Could not find bitmap in packfile", read_bitmap != NULL); 776 | mu_assert("Didn't get same bitmap back", 777 | bitmaps_are_the_same(bitmap, read_bitmap)); 778 | mu_assert("Got bitmap with invalid mtime", 779 | read_from_packfile(tmpfile_path, 1, store) == NULL); 780 | 781 | free(tmpfile_path); 782 | free(bitmap); 783 | free(read_bitmap); 784 | return 0; 785 | } 786 | 787 | static char *test_get_index_subdirectory() { 788 | char *indexdir = "/4gram"; 789 | char *subdir = get_index_subdirectory(indexdir, 0); 790 | mu_assert("get_index_subdirectory epoch failed", 791 | strcmp(subdir, "/4gram/1970_01") == 0); 792 | free(subdir); 793 | subdir = get_index_subdirectory(indexdir, -1); 794 | mu_assert("get_index_subdirectory negative failed", 795 | strcmp(subdir, "/4gram/1969_12") == 0); 796 | free(subdir); 797 | subdir = get_index_subdirectory(indexdir, 1502920742); 798 | mu_assert("get_index_subdirectory normal date failed", 799 | strcmp(subdir, "/4gram/2017_08") == 0); 800 | free(subdir); 801 | subdir = get_index_subdirectory(indexdir, 1L << 31); 802 | mu_assert("get_index_subdirectory overflow test failed", 803 | strcmp(subdir, "/4gram/2038_01") == 0); 804 | free(subdir); 805 | return 0; 806 | } 807 | 808 | 809 | static char *run_tests() { 810 | mu_run_test(test_init_bitmap); 811 | mu_run_test(test_set_bit); 812 | mu_run_test(test_string_to_bitmap); 813 | mu_run_test(test_compress_to_file); 814 | mu_run_test(test_compress_bitmap); 815 | mu_run_test(test_file_packing); 816 | mu_run_test(test_filter_checks); 817 | mu_run_test(test_packfile_locking); 818 | mu_run_test(test_get_4gram_indices); 819 | mu_run_test(test_corruption_size); 820 | mu_run_test(test_loose_file_locking); 821 | mu_run_test(test_strings_to_sorted_indices); 822 | mu_run_test(test_mtime); 823 | mu_run_test(test_get_index_subdirectory); 824 | return 0; 825 | } 826 | 827 | int main() { 828 | char *result = run_tests(); 829 | if (result != 0) { 830 | printf("%s\n", result); 831 | } else { 832 | printf("All tests passed!\n"); 833 | } 834 | if (system("rm -rf /tmp/4gramtmpdir.*") != 0) { 835 | printf("Warning: error cleaning temporary directories."); 836 | } 837 | if (system("rm -rf /tmp/4gramtmpfile.*") != 0) { 838 | printf("Warning: error cleaning temporary files."); 839 | } 840 | printf("Tests run: %d\n", tests_run); 841 | return result != 0; 842 | } 843 | -------------------------------------------------------------------------------- /bitmap/src/bitmap.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include "bitmap.h" 15 | #include "xxhash.h" 16 | #include "util.h" 17 | #include "portable_endian.h" 18 | 19 | /*--------------------------------------------------------------------*/ 20 | 21 | #define ESTIMATED_ZSTD_SIZE (ZSTD_compressBound(SIZEOF_BITMAP)) 22 | 23 | /*--------------------------------------------------------------------*/ 24 | 25 | /** 26 | * Initalizes memory for bitmap 27 | */ 28 | uint8_t *init_bitmap(){ 29 | uint8_t *bitmap = calloc(SIZEOF_BITMAP, 1); 30 | if (bitmap == NULL){ 31 | perror("Error: Bitmap not initialized"); 32 | return(NULL); 33 | } 34 | return bitmap; 35 | } 36 | 37 | /*--------------------------------------------------------------------*/ 38 | 39 | void set_bit(uint8_t *bitmap, int bit_index){ 40 | bitmap[bit_index / 8] |= (1 << bit_index % 8); 41 | } 42 | 43 | /*--------------------------------------------------------------------*/ 44 | 45 | uint8_t get_bit(uint8_t *bitmap, int bit_index) { 46 | return (bitmap[bit_index / 8] >> (bit_index % 8)) & 1; 47 | } 48 | 49 | /*--------------------------------------------------------------------*/ 50 | 51 | void write_bitmap(uint8_t *bitmap, FILE *file){ 52 | fwrite(bitmap, 1, SIZEOF_BITMAP, file); 53 | } 54 | 55 | /*--------------------------------------------------------------------*/ 56 | 57 | char *get_lock_path(char *directory, char *filename) { 58 | char lock_filename[PATH_MAX]; 59 | sprintf(lock_filename, ".%s.lock", filename); 60 | return add_path_parts(directory, lock_filename); 61 | } 62 | 63 | /*--------------------------------------------------------------------*/ 64 | 65 | /** 66 | * Finds the first path of the form "directory/filename_XXX" that doesn't 67 | * already exist, counting up from 000 to 999. 68 | * 69 | * When found, it creates a new file with mode 0666 and returns a file 70 | * descriptor. 71 | */ 72 | int available_name(char *filename, char *directory){ 73 | char tmp[21]; 74 | int i = 0; 75 | //Max hash collision will be _999 76 | while(i < 1000){ 77 | sprintf(tmp, "%s_%.3d", filename, i); 78 | char *full_path = add_path_parts(directory, tmp); 79 | int fd = open(full_path, O_WRONLY | O_CREAT | O_EXCL, 0666); 80 | free(full_path); 81 | if(fd != -1) {//exists 82 | char *lock_path = get_lock_path(directory, tmp); 83 | int a = lockfile_create(lock_path, 0, 0); 84 | free(lock_path); 85 | if(a != 0){ 86 | i++; 87 | continue; 88 | } 89 | strcpy(filename, tmp); 90 | return fd; 91 | } 92 | i++; 93 | } 94 | return(-1); 95 | } 96 | 97 | /*--------------------------------------------------------------------*/ 98 | 99 | /** 100 | * Fucnction gets the xxhash of the filename and stores it 101 | * in hash_hex_str 102 | */ 103 | int get_hash(char *filename, size_t len, char *hash_hex_str){ 104 | XXH64_canonical_t* dst = malloc(sizeof(XXH64_canonical_t)); 105 | if (dst == NULL){ 106 | perror("Error: Memory not allocated"); 107 | return(-1); 108 | } 109 | uint64_t hashed = XXH64(filename, len, HASH_SEED); 110 | XXH64_canonicalFromHash(dst, hashed); 111 | for(int i = 0; i < 8; i++){ 112 | sprintf((hash_hex_str+2*i), "%02X", dst->digest[i]); 113 | } 114 | free(dst); 115 | return 0; 116 | } 117 | 118 | /*--------------------------------------------------------------------*/ 119 | 120 | /** 121 | * Function will write the bitmap that has been compressed in filename to 122 | * decompressed. 123 | * Saved data comprises of length of filename, filename, compressed data, 124 | * decompressed size. 125 | */ 126 | int decompress_file(uint8_t *decompressed, char *full_path){ 127 | uint16_t len; 128 | uint32_t compressed_size; 129 | int ret_val = -1; 130 | FILE *f = fopen(full_path, "r"); 131 | if(f == NULL) { 132 | if (errno != ENOENT) { 133 | perrorf("Error: File not opened: %s", full_path); 134 | } 135 | return ret_val; 136 | } 137 | if (fread(&len, sizeof(uint16_t), 1, f) != 1) { 138 | perrorf("Error in reading file size: %s", full_path); 139 | fclose(f); 140 | return ret_val; 141 | } 142 | len = be16toh(len); 143 | char orig_filename[len+1]; 144 | if (fread(orig_filename, len, 1, f) != 1){ 145 | perrorf("Error in reading filename: %s", full_path); 146 | fclose(f); 147 | return ret_val; 148 | } 149 | int64_t mtime; 150 | if (fread(&mtime, sizeof(int64_t), 1, f) != 1) { 151 | perrorf("Error in reading mtime: %s", full_path); 152 | fclose(f); 153 | return ret_val; 154 | } 155 | mtime = be64toh(mtime); 156 | if (fread(&compressed_size, sizeof(uint32_t), 1, f) != 1){ 157 | perrorf("Error in reading decompressed size: %s", full_path); 158 | fclose(f); 159 | return ret_val; 160 | } 161 | compressed_size = be32toh(compressed_size); 162 | char stream[compressed_size+1]; 163 | if (fread(&stream, compressed_size, 1, f) != 1){ 164 | perrorf("Error in reading decompressed file: %s", full_path); 165 | goto OUT1; 166 | } 167 | size_t decompressed_size = ZSTD_decompress(decompressed, SIZEOF_BITMAP, 168 | stream,compressed_size); 169 | if(ZSTD_isError(decompressed_size) == 1){ 170 | perrorf("Error in decompression of %s: %s", 171 | full_path, ZSTD_getErrorName(decompressed_size)); 172 | goto OUT1; 173 | } 174 | ret_val = 0; 175 | goto OUT1; 176 | 177 | OUT1: 178 | fclose(f); 179 | return ret_val; 180 | } 181 | 182 | /*--------------------------------------------------------------------*/ 183 | 184 | /** 185 | * Compresses the bitmap into the file described by fp using ZSTD 186 | * The original filename's length is stored followed by the filename, followed 187 | * by the compressed size, followed by the actual compressed data. 188 | */ 189 | int compress_to_fp(uint8_t *bitmap, FILE *fp, char *orig_filename, 190 | int64_t mtime) { 191 | uint16_t len = strlen(orig_filename); 192 | void* compressed = malloc(ESTIMATED_ZSTD_SIZE); 193 | int ret_val = -1; 194 | if (compressed == NULL){ 195 | perror("Error: Memory not allocated"); 196 | return ret_val; 197 | } 198 | 199 | uint32_t compressed_size = ZSTD_compress(compressed, ESTIMATED_ZSTD_SIZE, 200 | bitmap, SIZEOF_BITMAP, 8); 201 | 202 | if(ZSTD_isError(compressed_size) == 1) { 203 | perror("Error in compression"); 204 | goto OUT2; 205 | } 206 | uint16_t len_be = htobe16(len); 207 | if (fwrite(&len_be, sizeof(uint16_t), 1, fp) != 1){ 208 | goto OUT2; 209 | } 210 | if (fwrite(orig_filename, len, 1, fp) != 1){ 211 | perror("Error: Filename not written"); 212 | goto OUT2; 213 | } 214 | int64_t mtime_be = htobe64(mtime); 215 | if (fwrite(&mtime_be, sizeof(int64_t), 1, fp) != 1){ 216 | perror("Error: mtime not written"); 217 | goto OUT2; 218 | } 219 | uint32_t compressed_size_be = htobe32(compressed_size); 220 | if (fwrite(&compressed_size_be, sizeof(uint32_t), 1, fp) != 1){ 221 | perror("Error: Compressed size not written"); 222 | goto OUT2; 223 | } 224 | if (fwrite(compressed, compressed_size, 1, fp) != 1){ 225 | perror("Error: Compressed file not written"); 226 | goto OUT2; 227 | } 228 | ret_val = 0; 229 | goto OUT2; 230 | 231 | OUT2: 232 | free(compressed); 233 | return ret_val; 234 | } 235 | 236 | /*--------------------------------------------------------------------*/ 237 | 238 | /** 239 | * Function will compress the bitmap into a loosefile which is 240 | * named after the filename's hash and number of occurences. 241 | */ 242 | int compress_to_file(uint8_t *bitmap, char *filename, int64_t mtime, 243 | char *indexdir) { 244 | char hashed_filename[21], lock[27]; 245 | uint16_t len = strlen(filename); 246 | get_hash(filename, len, hashed_filename); 247 | int fd = available_name(hashed_filename, indexdir); 248 | FILE *fp = fdopen(fd, "wb"); 249 | if(fp == NULL) { 250 | perrorf("Error: File not opened: %s", hashed_filename); 251 | return(-1); 252 | } 253 | int ret = compress_to_fp(bitmap, fp, filename, mtime); 254 | fflush(fp); 255 | fsync(fd); 256 | fclose(fp); 257 | sprintf(lock, ".%s.lock", hashed_filename); 258 | char *lock_path = add_path_parts(indexdir, lock); 259 | lockfile_remove(lock_path); 260 | free(lock_path); 261 | return ret; 262 | } 263 | 264 | /*--------------------------------------------------------------------*/ 265 | 266 | __attribute__ ((target("bmi2"))) 267 | int init_4gram_state_bmi2(char *text) { 268 | int n = 0; 269 | for (int i = 0; i < NGRAM_CHARS; i++){ 270 | int tmp = text[i] & CHAR_MASK; 271 | n = _pdep_u32(n, NGRAM_SHIFT_LEFT_MASK) + tmp; 272 | } 273 | return n; 274 | } 275 | 276 | /*--------------------------------------------------------------------*/ 277 | 278 | int init_4gram_state_slow(char *text) { 279 | int n = 0; 280 | for (int i = 0; i < NGRAM_CHARS; i++){ 281 | int tmp = text[i] & CHAR_MASK; 282 | n = ((n << NGRAM_CHAR_BITS) & NGRAM_MASK) + tmp; 283 | } 284 | return n; 285 | } 286 | 287 | /*--------------------------------------------------------------------*/ 288 | 289 | /** 290 | * Returns the ngram index of the first ngram in text. 291 | */ 292 | int init_4gram_state(char *text) { 293 | if (supports_bmi2()) { 294 | return init_4gram_state_bmi2(text); 295 | } else { 296 | return init_4gram_state_slow(text); 297 | } 298 | } 299 | 300 | /*--------------------------------------------------------------------*/ 301 | 302 | __attribute__ ((target("bmi2"))) 303 | int apply_to_bitmap_bmi2(uint8_t *bitmap, char *buf, int len, int n) { 304 | for (int i = 0; i < len / sizeof(char); i++) { 305 | int tmp = buf[i] & CHAR_MASK; 306 | n = _pdep_u32(n, NGRAM_SHIFT_LEFT_MASK) + tmp; 307 | set_bit(bitmap, n); 308 | } 309 | return n; 310 | } 311 | 312 | /*--------------------------------------------------------------------*/ 313 | 314 | int apply_to_bitmap_slow(uint8_t *bitmap, char *buf, int len, int n) { 315 | for (int i = 0; i < len / sizeof(char); i++) { 316 | int tmp = buf[i] & CHAR_MASK; 317 | n = ((n << NGRAM_CHAR_BITS) & NGRAM_MASK) + tmp; 318 | set_bit(bitmap, n); 319 | } 320 | return n; 321 | } 322 | 323 | /*--------------------------------------------------------------------*/ 324 | /** 325 | * Applies all of the ngrams in buf to bitmap. 326 | * 327 | * Checks to see if system supports bmi2 instructions and calls relevant 328 | * functions 329 | */ 330 | int apply_to_bitmap(uint8_t *bitmap, char *buf, int len, int n) { 331 | if (supports_bmi2()) { 332 | return apply_to_bitmap_bmi2(bitmap, buf, len, n); 333 | } else { 334 | return apply_to_bitmap_slow(bitmap, buf, len, n); 335 | } 336 | } 337 | 338 | /*--------------------------------------------------------------------*/ 339 | /** 340 | * Scans the file at filename and writes bits for its 4grams to bitmap. 341 | * Decompresses the file to read it if the file is gzip-compressed. 342 | * Returns GZ_TRUNCATED if the given file was gzip-compressed and the 343 | * last read ended in the middle of the gzip stream. 344 | */ 345 | int apply_file_to_bitmap(uint8_t *bitmap, FILE *f){ 346 | int n = 0; 347 | int ret_val = -1; 348 | int fd = fileno(f); 349 | char buf[BUFSIZE]; 350 | int read_amount; 351 | 352 | // open file with a dup fd so closing gzf doesn't close the file descriptor 353 | int dup_fd = dup(fd); 354 | if (dup_fd < 0) { 355 | perror("Error duplicating fd"); 356 | return ret_val; 357 | } 358 | gzFile gzf = gzdopen(dup_fd, "r"); 359 | if (gzf == NULL) { 360 | perror("Error opening gzip stream"); 361 | goto OUT1; 362 | } 363 | 364 | // read first four characters to initialize 4gram 365 | read_amount = gzread(gzf, buf, NGRAM_CHARS * sizeof(char)); 366 | if (read_amount < 0) { 367 | fprintf(stderr, "gzread error: %s\n", gzerror(gzf, &read_amount)); 368 | gzclose(gzf); 369 | goto OUT1; 370 | } 371 | if (read_amount == NGRAM_CHARS) { 372 | n = init_4gram_state(buf); 373 | set_bit(bitmap, n); 374 | } 375 | 376 | // read rest of file 377 | do { 378 | read_amount = gzread(gzf, buf, BUFSIZE); 379 | if (read_amount < 0) { 380 | fprintf(stderr, "gzread error: %s\n", gzerror(gzf, &read_amount)); 381 | gzclose(gzf); 382 | goto OUT1; 383 | } 384 | n = apply_to_bitmap(bitmap, buf, read_amount, n); 385 | } while (read_amount > 0 || (read_amount < 0 && errno == EINTR)); 386 | 387 | int gzclose_ret = gzclose(gzf); 388 | if (gzclose_ret == Z_BUF_ERROR) { 389 | return GZ_TRUNCATED; 390 | } else if (gzclose_ret != Z_OK) { 391 | perror("Error closing .gz file"); 392 | goto OUT1; 393 | } 394 | ret_val = 0; 395 | goto OUT1; 396 | 397 | OUT1: 398 | close(dup_fd); 399 | return ret_val; 400 | } 401 | 402 | /*--------------------------------------------------------------------*/ 403 | 404 | uint8_t *b_or_b(uint8_t *bitmap1, uint8_t *bitmap2){ 405 | uint8_t *b1_or_b2 = init_bitmap(); 406 | for (int i = 0 ; i < SIZEOF_BITMAP; i++) { 407 | uint8_t b1 = get_bit(bitmap1, i); 408 | uint8_t b2 = get_bit(bitmap2, i); 409 | set_bit(b1_or_b2, (b1 | b2)); 410 | } 411 | return b1_or_b2; 412 | } 413 | 414 | /*--------------------------------------------------------------------*/ 415 | 416 | 417 | 418 | 419 | 420 | -------------------------------------------------------------------------------- /bitmap/src/bitmap.h: -------------------------------------------------------------------------------- 1 | #ifndef BITMAP_INCLUDED 2 | #define BITMAP_INCLUDED 3 | 4 | /*--------------------------------------------------------------------*/ 5 | 6 | #include 7 | #include 8 | 9 | /*--------------------------------------------------------------------*/ 10 | 11 | uint8_t *init_bitmap(); 12 | 13 | void set_bit(uint8_t *bitmap, int bit_index); 14 | 15 | uint8_t get_bit(uint8_t *bitmap, int bit_index); 16 | 17 | void write_bitmap(uint8_t *bitmap, FILE *file); 18 | 19 | int get_hash(char *filename, size_t len, char *hash_hex_str); 20 | 21 | int decompress_file(uint8_t *decompressed, char *full_path); 22 | 23 | int compress_to_fp(uint8_t *bitmap, FILE *fp, char *orig_filename, int64_t mtime); 24 | 25 | int compress_to_file(uint8_t *bitmap, char *filename, int64_t mtime, char *indexdir); 26 | 27 | int apply_file_to_bitmap(uint8_t *bitmap, FILE *f); 28 | 29 | uint8_t *b_or_b(uint8_t *bitmap1, uint8_t *bitmap2); 30 | 31 | /*--------------------------------------------------------------------*/ 32 | 33 | #endif 34 | -------------------------------------------------------------------------------- /bitmap/src/filter.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #include "bitmap.h" 16 | #include "filter.h" 17 | #include "packfile.h" 18 | #include "util.h" 19 | #include "xxhash.h" 20 | #include "portable_endian.h" 21 | 22 | /*--------------------------------------------------------------------*/ 23 | 24 | #define BITMAP_CREATED 2 25 | 26 | /*--------------------------------------------------------------------*/ 27 | 28 | /** 29 | * Checks packfiles for filename. 30 | * 31 | * If an entry with the given filename and mtime is found in a packfile, it is 32 | * applied to the given bitmap. 33 | */ 34 | int check_pack_files(char *filename, int64_t mtime, uint8_t *bitmap, 35 | char *dir){ 36 | errno = 0; 37 | uint8_t *read_bitmap = read_from_packfile(filename, mtime, dir); 38 | if(read_bitmap == NULL) { 39 | if (errno == ESTALE) { 40 | // retry once on stale NFS file handle 41 | read_bitmap = read_from_packfile(filename, mtime, dir); 42 | if (read_bitmap == NULL) { 43 | if (errno == ESTALE) { 44 | perrorf("Error checking packfile for %s", filename); 45 | } 46 | return(-1); 47 | } 48 | } 49 | return(-1); 50 | } 51 | memcpy(bitmap, read_bitmap, SIZEOF_BITMAP); 52 | free(read_bitmap); 53 | return 0; 54 | } 55 | 56 | /*--------------------------------------------------------------------*/ 57 | 58 | /** 59 | * Checks the loosefiles in the directory to see if the bitmap exists. 60 | * 61 | * If an entry with the given filename and mtime is found in a loose file, it 62 | * is applied to the given bitmap. 63 | */ 64 | int check_loose_files(char *filename, int64_t mtime, uint8_t *bitmap, char *directory){ 65 | int ret_val = -1; 66 | uint16_t orig_len; 67 | char hashed_filename[21]; 68 | char tmp[27]; 69 | DIR *dir; 70 | FILE *possible; 71 | 72 | uint16_t len = strlen(filename); 73 | get_hash(filename, len, hashed_filename); 74 | 75 | if ((dir = opendir(directory)) == NULL) { 76 | perrorf("Error opening directory: %s", directory); 77 | return ret_val; 78 | } 79 | 80 | int i = 0; 81 | char *tmp_real_path; 82 | 83 | while(i < 1000){ 84 | sprintf(tmp, "%s_%.3d", hashed_filename, i); 85 | tmp_real_path = add_path_parts(directory, tmp); 86 | 87 | possible = fopen(tmp_real_path, "r"); 88 | if (possible == NULL) { 89 | free(tmp_real_path); 90 | break; 91 | } 92 | 93 | char *lock_path = get_lock_path(directory, tmp); 94 | int ret = lockfile_check(lock_path, 0); 95 | free(lock_path); 96 | if(ret == 0){ 97 | free(tmp_real_path); 98 | break; 99 | } 100 | 101 | if(remove_if_corrupted(possible, tmp_real_path)) { 102 | i++; 103 | free(tmp_real_path); 104 | fclose(possible); 105 | continue; 106 | } 107 | 108 | if (fread(&orig_len, 2, 1, possible) != 1) { 109 | perrorf("Error in reading file size: %s", tmp_real_path); 110 | goto OUT1; 111 | } 112 | orig_len = be16toh(orig_len); 113 | char orig_filename[orig_len]; 114 | if (fread(orig_filename, orig_len, 1, possible) != 1){ 115 | perrorf("Error in reading filename: %s", tmp_real_path); 116 | goto OUT1; 117 | 118 | } 119 | if(strncmp(orig_filename, filename, len) != 0){ 120 | goto OUT1; 121 | } 122 | 123 | int64_t loose_mtime; 124 | if (fread(&loose_mtime, sizeof(int64_t), 1, possible) != 1) { 125 | perrorf("Error in reading mtime: %s", tmp_real_path); 126 | goto OUT1; 127 | } 128 | loose_mtime = be64toh(loose_mtime); 129 | if (loose_mtime != mtime) { 130 | goto OUT1; 131 | } 132 | 133 | if(decompress_file(bitmap, tmp_real_path) == 0){ 134 | ret_val = 0; 135 | goto OUT1; 136 | } 137 | 138 | fclose(possible); 139 | free(tmp_real_path); 140 | i++; 141 | } 142 | closedir(dir); 143 | return ret_val; 144 | 145 | OUT1: 146 | closedir(dir); 147 | free(tmp_real_path); 148 | fclose(possible); 149 | return ret_val; 150 | 151 | } 152 | 153 | /*--------------------------------------------------------------------*/ 154 | 155 | /** 156 | * Scans the file at filename and writes bits for its 4grams to bitmap. 157 | * Decompresses the file to read it if the file is gzip-compressed. 158 | * 159 | * If the bitmap is cached in the index directory, the bitmap is read from the 160 | * cache and the file at filename is ignored. 161 | * 162 | * Returns 0 upon success. 163 | * Returns GZ_TRUNCATED if the given file was gzip-compressed and the 164 | * last read ended in the middle of the gzip stream. 165 | * Returns 3 if the given file does not exist. 166 | */ 167 | int get_bitmap_for_file(uint8_t *bitmap, char *filename, char *indexdir) { 168 | char *real_path = realpath(filename, NULL); 169 | if (real_path == NULL) { 170 | return 3; 171 | } 172 | int64_t mtime = get_mtime(real_path); 173 | char *index_subdir = get_index_subdirectory(indexdir, mtime); 174 | int ret_val = 0; 175 | //check loosefiles 176 | if (check_loose_files(real_path, mtime, bitmap, index_subdir) == 0){ 177 | goto OUT2; 178 | } 179 | // not in loosefiles so check packfiles 180 | if(check_pack_files(real_path, mtime, bitmap, index_subdir) == 0){ 181 | goto OUT2; 182 | } 183 | 184 | FILE *file = fopen(real_path, "r"); 185 | if (file == NULL) { 186 | perrorf("Could not open file %s", real_path); 187 | ret_val = 1; 188 | goto OUT2; 189 | } 190 | 191 | int ret = apply_file_to_bitmap(bitmap, file); 192 | fclose(file); 193 | if (ret != 0) { 194 | ret_val = ret; 195 | goto OUT2; 196 | } 197 | compress_to_file(bitmap, real_path, mtime, index_subdir); 198 | return BITMAP_CREATED; 199 | 200 | OUT2: 201 | free(real_path); 202 | return ret_val; 203 | } 204 | 205 | /*--------------------------------------------------------------------*/ 206 | 207 | int *get_4gram_indices_slow(char *string) { 208 | int len = strlen(string); 209 | int n = 0; 210 | if (len <= 0) 211 | return NULL; 212 | 213 | if (len < NGRAM_CHARS) { 214 | int *indices = malloc(sizeof(int)); 215 | for (int i = 0; i < len; i++){ 216 | int tmp = string[i] & CHAR_MASK; 217 | n = ((n << NGRAM_CHAR_BITS) & NGRAM_MASK) + tmp; 218 | } 219 | indices[0] = n; 220 | return indices; 221 | } 222 | int *indices = malloc((strlen(string) - NGRAM_CHARS + 1) * sizeof(int)); 223 | for (int i = 0; i < NGRAM_CHARS - 1; i++){ 224 | int tmp = string[i] & CHAR_MASK; 225 | n = ((n << NGRAM_CHAR_BITS) & NGRAM_MASK) + tmp; 226 | } 227 | for (int i = NGRAM_CHARS - 1; i < len; i++) { 228 | int tmp = string[i] & CHAR_MASK; 229 | n = ((n << NGRAM_CHAR_BITS) & NGRAM_MASK) + tmp; 230 | indices[i - NGRAM_CHARS + 1] = n; 231 | } 232 | return indices; 233 | } 234 | 235 | /*--------------------------------------------------------------------*/ 236 | 237 | __attribute__ ((target("bmi2"))) 238 | int *get_4gram_indices_bmi2(char *string) { 239 | int len = strlen(string); 240 | int n = 0; 241 | if (len <= 0) 242 | return NULL; 243 | if (len < NGRAM_CHARS) { 244 | int *indices = malloc(sizeof(int)); 245 | for (int i = 0; i < len; i++){ 246 | int tmp = string[i] & CHAR_MASK; 247 | n = ((n << NGRAM_CHAR_BITS) & NGRAM_MASK) + tmp; 248 | } 249 | indices[0] = n; 250 | return indices; 251 | } 252 | int *indices = malloc((strlen(string) + 1 - NGRAM_CHARS) * sizeof(int)); 253 | for (int i = 0; i < NGRAM_CHARS - 1; i++){ 254 | int tmp = string[i] & CHAR_MASK; 255 | n = _pdep_u32(n, NGRAM_SHIFT_LEFT_MASK) + tmp; 256 | } 257 | for (int i = NGRAM_CHARS - 1; i < len; i++) { 258 | int tmp = string[i] & CHAR_MASK; 259 | n = _pdep_u32(n, NGRAM_SHIFT_LEFT_MASK) + tmp; 260 | indices[i - NGRAM_CHARS + 1] = n; 261 | } 262 | return indices; 263 | } 264 | 265 | /*--------------------------------------------------------------------*/ 266 | 267 | /** 268 | * Returns an array of ngram indices found in the provided string. 269 | */ 270 | int *get_4gram_indices(char *string) { 271 | if (supports_bmi2()) { 272 | return get_4gram_indices_bmi2(string); 273 | } else { 274 | return get_4gram_indices_slow(string); 275 | } 276 | } 277 | 278 | /*--------------------------------------------------------------------*/ 279 | 280 | int compare_ints(const void *a, const void *b) { 281 | const int *ia = (const int *) a; 282 | const int *ib = (const int *) b; 283 | return (*ia > *ib) - (*ia < *ib); 284 | } 285 | 286 | /*--------------------------------------------------------------------*/ 287 | 288 | /** 289 | * Merges the two sorted arrays of ints arr1 and arr2 and stores the result in 290 | * result. 291 | */ 292 | void two_finger_merge_int(int *arr1, int arr1size, 293 | int *arr2, int arr2size, 294 | int *result) { 295 | int i1 = 0; 296 | int i2 = 0; 297 | for (int r = 0; r < arr1size + arr2size; r++) { 298 | if (i1 < arr1size 299 | && (i2 >= arr2size || arr1[i1] < arr2[i2])) { 300 | result[r] = arr1[i1]; 301 | i1++; 302 | } else { 303 | result[r] = arr2[i2]; 304 | i2++; 305 | } 306 | } 307 | } 308 | 309 | /*--------------------------------------------------------------------*/ 310 | 311 | /** 312 | * Gets the indices of the grams in the given index string, puts them in an 313 | * array, sorts them, and returns them. 314 | */ 315 | struct intarray string_to_sorted_indices(char *index_string){ 316 | int len_index_string = strlen(index_string); 317 | int *index_string_4gram_indices = get_4gram_indices(index_string); 318 | struct intarray arr = { 319 | .length = len_index_string - NGRAM_CHARS + 1, 320 | .data = index_string_4gram_indices, 321 | }; 322 | qsort(arr.data, arr.length, sizeof(int), compare_ints); 323 | return arr; 324 | } 325 | 326 | /*--------------------------------------------------------------------*/ 327 | /** 328 | * Returns a sorted list of all ngram indices found in the index strings. 329 | */ 330 | struct intarray strings_to_sorted_indices(char **index_strings, 331 | int num_index_strings) { 332 | 333 | struct intarray indices; 334 | indices = string_to_sorted_indices(index_strings[0]); 335 | for (int i = 1; i < num_index_strings; i++) { 336 | struct intarray old_indices = indices; 337 | struct intarray new_indices; 338 | new_indices = string_to_sorted_indices(index_strings[i]); 339 | indices.length = old_indices.length + new_indices.length; 340 | indices.data = malloc(indices.length * sizeof(int)); 341 | two_finger_merge_int(old_indices.data, old_indices.length, 342 | new_indices.data, new_indices.length, indices.data); 343 | free_intarray(old_indices); 344 | free_intarray(new_indices); 345 | } 346 | return indices; 347 | } 348 | 349 | /** 350 | * Returns 1 if file_bitmap does not match filter. 351 | * 352 | * filter is a 'sum of products' array of arrays of ngram indices. The indices 353 | * in each subarray are anded together, and each subarray is orred together. 354 | * Put another way, we filter out files that don't contain all the ngrams in at 355 | * least one subarray. 356 | * 357 | */ 358 | int should_filter_out_file(uint8_t *file_bitmap, struct intarrayarray filter) { 359 | int contained = 0; 360 | for (int i = 0; i < filter.num_rows; i++) { 361 | int ngrams_in_subarray_all_present = 1; 362 | for (int j = 0; j < filter.rows[i].length; j++) { 363 | if (!get_bit(file_bitmap, filter.rows[i].data[j])) { 364 | ngrams_in_subarray_all_present = 0; 365 | break; 366 | } 367 | } 368 | if (ngrams_in_subarray_all_present) { 369 | contained = 1; 370 | break; 371 | } 372 | } 373 | return !contained; 374 | } 375 | 376 | /*--------------------------------------------------------------------*/ 377 | /** 378 | * Function that is called by 4grep to start filtering using search strings 379 | * 380 | * See should_filter_out_file for details on ngram_filter. 381 | * 382 | * Returns -1 upon failure, 1 if bitmap is found and indices 383 | * match, 2 if bitmap found but does not match, 3 if no bitmap found and 384 | * matches, 4 if did not have bitmap and has no match. 385 | */ 386 | int start_filter(struct intarrayarray ngram_filter, 387 | char *filename, char *indexdir){ 388 | 389 | int ret = -1, MTCH = 1, NO_MTCH = 2; 390 | mode_t old_umask = umask(0); 391 | 392 | // now start filtering files 393 | uint8_t *file_bitmap = init_bitmap(); 394 | 395 | int bitmap_ret = get_bitmap_for_file(file_bitmap, filename, 396 | indexdir); 397 | if (bitmap_ret != 0 && bitmap_ret != 2) { 398 | goto OUT1; 399 | } 400 | 401 | int filtered = should_filter_out_file(file_bitmap, ngram_filter); 402 | 403 | if (!filtered) 404 | ret = MTCH; 405 | else 406 | ret = NO_MTCH; 407 | 408 | if (bitmap_ret == BITMAP_CREATED) 409 | ret += BITMAP_CREATED; 410 | 411 | OUT1: 412 | free(file_bitmap); 413 | umask(old_umask); 414 | return ret; 415 | } 416 | -------------------------------------------------------------------------------- /bitmap/src/filter.h: -------------------------------------------------------------------------------- 1 | #ifndef FILTER_INCLUDED 2 | #define FILTER_INCLUDED 3 | 4 | /*--------------------------------------------------------------------*/ 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | /*--------------------------------------------------------------------*/ 11 | 12 | int check_pack_files(char *filename, int64_t mtime, uint8_t *bitmap, char *dir); 13 | 14 | int check_loose_files(char *filename, int64_t mtime, uint8_t *bitmap, char *directory); 15 | 16 | int *get_4gram_indices(char *string); 17 | 18 | struct intarray strings_to_sorted_indices(char **index_strings, 19 | int num_index_strings); 20 | 21 | struct intarrayarray strings_to_filter_anded(char **index_strings, 22 | int num_index_strings); 23 | 24 | struct intarrayarray strings_to_filter_orred(char **index_strings, 25 | int num_index_strings); 26 | 27 | int should_filter_out_file(uint8_t *file_bitmap, struct intarrayarray filter); 28 | /*--------------------------------------------------------------------*/ 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /bitmap/src/packfile.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #include "bitmap.h" 19 | #include "util.h" 20 | #include "xxhash.h" 21 | #include "packfile.h" 22 | #include "portable_endian.h" 23 | 24 | /*--------------------------------------------------------------------*/ 25 | 26 | /** An entry in the index. 27 | * Note: in order to allow mmaping the index file, this struct stores 28 | * packfile_offset as big-endian! 29 | */ 30 | struct index_entry { 31 | uint64_t hash; 32 | uint64_t packfile_offset; 33 | }; 34 | 35 | /*--------------------------------------------------------------------*/ 36 | 37 | /** 38 | * Checks to see if loosefile was interrupted when writing by checking 39 | * actual size vs expected size 40 | * Returns 0 if no error, returns EMPTY_FILE if loose file is empty, 41 | * returns -1 if other error or file corrupted. 42 | */ 43 | int is_corrupted(FILE* loosefile) { 44 | uint16_t len; 45 | uint32_t compressed_size; 46 | int64_t mtime; 47 | 48 | struct stat loosefile_stat; 49 | fstat(fileno(loosefile), &loosefile_stat); 50 | off_t loosefile_size = loosefile_stat.st_size; 51 | 52 | if (loosefile_size == 0) { 53 | return EMPTY_FILE; 54 | } 55 | 56 | if (fread(&len, sizeof(uint16_t), 1, loosefile) != 1) { 57 | perror("Error in reading filename size"); 58 | return(-1); 59 | } 60 | len = be16toh(len); 61 | if (fseek(loosefile, len, SEEK_CUR) != 0) { 62 | perror("Error reading loose file"); 63 | return(-1); 64 | } 65 | if (fread(&mtime, sizeof(int64_t), 1, loosefile) != 1) { 66 | perror("Error in reading mtime"); 67 | return(-1); 68 | } 69 | mtime = be64toh(mtime); 70 | if (fread(&compressed_size, sizeof(uint32_t), 1, loosefile) != 1){ 71 | perror("Error in reading decompressed size"); 72 | return(-1); 73 | } 74 | compressed_size = be32toh(compressed_size); 75 | 76 | if((len + compressed_size + sizeof(uint16_t) + sizeof(uint32_t) + 77 | sizeof(int64_t)) != loosefile_size){ 78 | fprintf(stderr, "Corrupted file: l:%u, cs:%u, filesize:%lld\n", 79 | len, compressed_size, (long long) loosefile_size); 80 | return(-1); 81 | } 82 | 83 | rewind(loosefile); 84 | return 0; 85 | } 86 | 87 | /*--------------------------------------------------------------------*/ 88 | 89 | /** 90 | * Returns 1 if file was detected as corrupted, deleted and should be skipped. 91 | * Returns 2 if file was empty and should be skipped, but wasn't deleted. 92 | * Returns 0 otherwise. 93 | */ 94 | int remove_if_corrupted(FILE *file, char *file_path) { 95 | int corrupt_status = is_corrupted(file); 96 | if(corrupt_status != 0){ 97 | // ignore empty files because they could mean we acquired a read lock 98 | // on the file before the writing process could acquire a write lock 99 | // TODO: remove empty files that are very old 100 | if (corrupt_status != EMPTY_FILE) { 101 | remove(file_path); 102 | return 1; 103 | } 104 | return 2; 105 | } 106 | return 0; 107 | } 108 | 109 | /*--------------------------------------------------------------------*/ 110 | 111 | /** 112 | * Returns the index into the packfile index entries where the first entry with 113 | * the given hash is located, or -1 if the hash does not exist in the index. 114 | */ 115 | size_t find_hash_in_index(struct index_entry *index, 116 | size_t num_entries, uint64_t hash) { 117 | 118 | size_t left = 0; 119 | size_t right = num_entries - 1; 120 | while (left != right) { 121 | size_t middle = (right + left) / 2; 122 | if (index[middle].hash < hash) { 123 | left = middle + 1; 124 | } else { 125 | right = middle; 126 | } 127 | } 128 | if (index[left].hash != hash) { 129 | return -1; 130 | } else { 131 | return left; 132 | } 133 | } 134 | 135 | /*--------------------------------------------------------------------*/ 136 | 137 | /** 138 | * Calculates the number of existing entries in the packfile index based on the 139 | * size of the packfile index file. 140 | * 141 | * Returns -1 on error. 142 | */ 143 | size_t get_num_index_entries(FILE *packfile_index) { 144 | if (fseek(packfile_index, 0, SEEK_END) != 0) { 145 | return(-1); 146 | } 147 | long index_size = ftell(packfile_index); 148 | rewind(packfile_index); 149 | return index_size / sizeof(struct index_entry); 150 | } 151 | 152 | /*--------------------------------------------------------------------*/ 153 | 154 | /** 155 | * Reads the data stored in the packfile with the given name. 156 | * Assumes the data is (the size of a) bitmap when decompressed. 157 | * 158 | * filename: name of file to search for in the packfile 159 | * mtime: mtime of file to search for in the packfile 160 | * indexdir: index directory 161 | */ 162 | uint8_t *read_from_packfile(char *filename, int64_t mtime, char *indexdir) { 163 | 164 | char *packfile_path = add_path_parts(indexdir, PACKFILE_NAME); 165 | FILE *packfile = fopen(packfile_path, "r"); 166 | uint8_t *packed_file = NULL; 167 | free(packfile_path); 168 | if(packfile == NULL) { 169 | if (errno != ENOENT) { 170 | perror("Error: could not open packfile"); 171 | } 172 | return(NULL); 173 | } 174 | char *packfile_index_path = add_path_parts(indexdir, PACKFILE_INDEX_NAME); 175 | FILE *packfile_index = fopen(packfile_index_path, "r"); 176 | free(packfile_index_path); 177 | if(packfile_index == NULL) { 178 | fclose(packfile); 179 | if (errno != ENOENT) 180 | perror("Error: could not open packfile index"); 181 | return(NULL); 182 | } 183 | 184 | uint64_t hashed = XXH64(filename, strlen(filename), HASH_SEED); 185 | size_t num_index_entries = get_num_index_entries(packfile_index); 186 | if (num_index_entries <= 0) 187 | goto OUT2; 188 | size_t index_filesize = num_index_entries * sizeof(struct index_entry); 189 | struct index_entry *index = mmap(NULL, index_filesize, PROT_READ, 190 | MAP_PRIVATE, fileno(packfile_index), 0); 191 | if(index == MAP_FAILED){ 192 | perror("Error: could not mmap packfile index"); 193 | goto OUT1; 194 | } 195 | size_t first_identical_hash_loc = find_hash_in_index( 196 | index, num_index_entries, hashed); 197 | if (first_identical_hash_loc == -1) { 198 | goto OUT1; 199 | } 200 | // now to see if any of the identical hashes map to the same filename 201 | // we need to read the packfile for this 202 | for (size_t i = first_identical_hash_loc; index[i].hash == hashed; i++) { 203 | size_t offset = be64toh(index[i].packfile_offset); 204 | uint16_t name_len; 205 | fseek(packfile, offset, SEEK_SET); 206 | if (fread(&name_len, sizeof(uint16_t), 1, packfile) != 1) { 207 | perror("Error in packfile fread"); 208 | goto OUT1; 209 | } 210 | name_len = be16toh(name_len); 211 | char packed_filename[name_len]; 212 | if (fread(packed_filename, name_len, 1, packfile) != 1) { 213 | perror("Error in packfile fread"); 214 | goto OUT1; 215 | } 216 | if (strncmp(packed_filename, filename, name_len) != 0) { 217 | continue; 218 | } 219 | int64_t packed_mtime; 220 | if (fread(&packed_mtime, sizeof(int64_t), 1, packfile) != 1) { 221 | perror("Error in packfile fread"); 222 | goto OUT1; 223 | } 224 | packed_mtime = be64toh(packed_mtime); 225 | if (packed_mtime != mtime) { 226 | continue; 227 | } 228 | uint32_t packed_file_len; 229 | // we found an entry with the same filename! 230 | // now we may read the file 231 | if (fread(&packed_file_len, sizeof(uint32_t), 1, packfile) != 1) { 232 | perror("Error in packfile fread"); 233 | goto OUT1; 234 | } 235 | packed_file_len = be32toh(packed_file_len); 236 | uint8_t compressed_file[packed_file_len]; 237 | if (fread(&compressed_file, packed_file_len, 1, packfile) != 1) { 238 | perror("Error in packfile fread"); 239 | goto OUT1; 240 | } 241 | // now decompress it 242 | packed_file = malloc(SIZEOF_BITMAP); 243 | if (packed_file == NULL){ 244 | perror("Error: Memory not allocated"); 245 | goto OUT1; 246 | } 247 | 248 | size_t s = ZSTD_decompress(packed_file, SIZEOF_BITMAP, 249 | compressed_file, packed_file_len); 250 | if(ZSTD_isError(s) == 1){ 251 | fprintf(stderr, "Error in packfile decompression: %s\n", 252 | ZSTD_getErrorName(s)); 253 | free(packed_file); 254 | packed_file = NULL; 255 | goto OUT1; 256 | } 257 | } 258 | 259 | OUT1: 260 | if (munmap(index, index_filesize) == -1) { 261 | perror("Error in packfile index munmap"); 262 | } 263 | OUT2: 264 | fclose(packfile); 265 | fclose(packfile_index); 266 | return packed_file; 267 | 268 | } 269 | 270 | /*--------------------------------------------------------------------*/ 271 | 272 | /** 273 | * If the file at the given path does not exist, it is created with permissions 274 | * 0666. 275 | */ 276 | int create_file_if_nonexistent(char *path) { 277 | int fd = open(path, O_CREAT, 0666); 278 | if (fd == -1) { 279 | perrorf("Error creating file: %s", path); 280 | return(-1); 281 | } 282 | close(fd); 283 | return 0; 284 | } 285 | 286 | /*--------------------------------------------------------------------*/ 287 | 288 | /** 289 | * Appends the given data to the end of the packfile, returning the offset at 290 | * which it is added. 291 | */ 292 | long write_data_to_packfile(void *data, size_t size, FILE *packfile) { 293 | long packfile_offset = ftell(packfile); 294 | int write_amount = fwrite(data, size, 1, packfile); 295 | if (write_amount != 1) { 296 | perror("Error writing to packfile"); 297 | return -1; 298 | } 299 | return packfile_offset; 300 | } 301 | 302 | /** 303 | * Adds the file at the given path to the packfile opened in append-mode. 304 | * Returns the offset into the packfile at which the new file is written. 305 | */ 306 | long add_file_to_packfile(char *filename, char *indexdir, FILE *packfile) { 307 | char tmp[27]; 308 | int ret_val = -1; 309 | sprintf(tmp, ".%s.lock", filename); 310 | char *lock_path = add_path_parts(indexdir, tmp); 311 | int ret = lockfile_check(lock_path, 0); 312 | free(lock_path); 313 | if(ret == 0){ 314 | return ret_val; 315 | } 316 | 317 | char *file_path = add_path_parts(indexdir, filename); 318 | FILE *f = fopen(file_path, "r"); 319 | if (f == NULL) { 320 | perrorf("Could not open %s", file_path); 321 | goto OUT1; 322 | } 323 | 324 | if (remove_if_corrupted(f, file_path)) { 325 | fclose(f); 326 | goto OUT1; 327 | } 328 | char buf[BUFSIZE]; 329 | long packfile_offset = ftell(packfile); 330 | 331 | int read_amount; 332 | while ((read_amount = fread(buf, 1, BUFSIZE * sizeof(char), f)) > 0) { 333 | int write_amount = fwrite(buf, 1, read_amount, packfile); 334 | if (write_amount != read_amount) { 335 | perror("Error writing to packfile"); 336 | } 337 | } 338 | fclose(f); 339 | if (read_amount < 0) { 340 | perrorf("Error reading from %s", file_path); 341 | goto OUT1; 342 | } 343 | ret_val = packfile_offset; 344 | goto OUT1; 345 | 346 | OUT1: 347 | free(file_path); 348 | return ret_val; 349 | 350 | } 351 | 352 | /*--------------------------------------------------------------------*/ 353 | 354 | /** 355 | * Comparison function for sorting index entries. 356 | */ 357 | int compare_index_entries(const void *a, const void *b) { 358 | const struct index_entry *iea = (const struct index_entry *)a; 359 | const struct index_entry *ieb = (const struct index_entry *)b; 360 | return (iea->hash > ieb->hash) - (iea->hash < ieb->hash); 361 | } 362 | 363 | /*--------------------------------------------------------------------*/ 364 | 365 | /** 366 | * Counts files in the directory to be added to packfile 367 | */ 368 | int count_loose_files(char *dir_path) { 369 | DIR *dir = opendir(dir_path); 370 | if (dir == NULL){ 371 | perrorf("Error in opening directory: %s", dir_path); 372 | return(-1); 373 | } 374 | int num_loose = 0; 375 | struct dirent *entry; 376 | while ((entry = readdir(dir))) { 377 | if (strcmp(entry->d_name, PACKFILE_NAME) == 0 378 | || strcmp(entry->d_name, PACKFILE_INDEX_NAME) == 0 379 | || entry->d_name[0] == '.' ) { 380 | continue; 381 | } 382 | num_loose++; 383 | } 384 | closedir(dir); 385 | return num_loose; 386 | } 387 | 388 | /*--------------------------------------------------------------------*/ 389 | 390 | /** 391 | * Writes new_index to a temporary file, then renames it to the index file, 392 | * replacing the old one atomically. 393 | */ 394 | int write_new_index(struct index_entry *new_index, 395 | int new_index_length, char *file_path) { 396 | // write the new index to a tmp file 397 | if(create_file_if_nonexistent(file_path) == -1) { 398 | return -1; 399 | } 400 | FILE *file = fopen(file_path, "w"); 401 | if (file == NULL) { 402 | perror("Error creating tempfile"); 403 | return(-1); 404 | } 405 | int write_amount = fwrite(new_index, new_index_length, 406 | sizeof(struct index_entry), file); 407 | fclose(file); 408 | if (write_amount < 0) { 409 | perror("Error writing tempfile"); 410 | return(-1); 411 | } 412 | return 0; 413 | } 414 | 415 | /*--------------------------------------------------------------------*/ 416 | 417 | /** 418 | * Gets hash from the saved string 419 | */ 420 | uint64_t string_to_hash(char *filename){ 421 | XXH64_canonical_t *canonical = malloc(sizeof(XXH64_canonical_t)); 422 | char *hash_str = filename; 423 | for(int i = 0; i < 8; i++){ 424 | sscanf((hash_str+2*i), "%02hhx", &(canonical->digest[i])); 425 | } 426 | uint64_t ret_hash = XXH64_hashFromCanonical(canonical); 427 | free(canonical); 428 | return ret_hash; 429 | } 430 | 431 | /*--------------------------------------------------------------------*/ 432 | 433 | struct read_file_result { 434 | int error; 435 | void *data; 436 | size_t length; 437 | }; 438 | 439 | struct read_file_args { 440 | char *filename; 441 | char *indexdir; 442 | }; 443 | 444 | void *read_file(void *args) { 445 | struct read_file_args *real_args = args; 446 | char *filename = real_args->filename; 447 | char *indexdir = real_args->indexdir; 448 | free(args); 449 | struct read_file_result *result = malloc(sizeof(struct read_file_result)); 450 | result->error = 0; 451 | 452 | char *path = add_path_parts(indexdir, filename); 453 | char tmp[27]; 454 | sprintf(tmp, ".%s.lock", filename); 455 | char *lock_path = add_path_parts(indexdir, tmp); 456 | int ret = lockfile_check(lock_path, 0); 457 | free(lock_path); 458 | if(ret == 0){ 459 | goto OUT4; 460 | } 461 | 462 | FILE *file = fopen(path, "r"); 463 | if (file == NULL) { 464 | result->error = errno; 465 | goto OUT4; 466 | } 467 | int corrupt = remove_if_corrupted(file, path); 468 | if (corrupt) { 469 | result->error = -corrupt; 470 | goto OUT3; 471 | } 472 | if (fseek(file, 0, SEEK_END) != 0) { 473 | result->error = errno; 474 | goto OUT3; 475 | } 476 | long size = ftell(file); 477 | if (size < 0) { 478 | result->error = errno; 479 | goto OUT3; 480 | } 481 | rewind(file); 482 | void *buff = malloc(size); 483 | if (buff == NULL) { 484 | result->error = errno; 485 | goto OUT3; 486 | } 487 | int read_amount = fread(buff, size, 1, file); 488 | if (read_amount != 1) { 489 | result->error = errno; 490 | free(buff); 491 | goto OUT3; 492 | } else { 493 | result->data = buff; 494 | result->length = size; 495 | } 496 | fclose(file); 497 | free(path); 498 | return result; 499 | 500 | OUT3: 501 | fclose(file); 502 | OUT4: 503 | free(path); 504 | result->data = NULL; 505 | result->length = 0; 506 | return result; 507 | } 508 | 509 | /** 510 | * Reads many files in parallel, starting a separate thread per file. 511 | */ 512 | struct read_file_result *read_files_in_parallel(char **filenames, int num, 513 | char *indexdir) { 514 | struct read_file_result *results = 515 | malloc(num * sizeof(struct read_file_result)); 516 | pthread_t threads[num]; 517 | int threads_created = 0; 518 | for (int i = 0; i < num; i++) { 519 | struct read_file_args *args = malloc(sizeof(*args)); 520 | args->filename = filenames[i]; 521 | args->indexdir = indexdir; 522 | if (pthread_create(&threads[i], NULL, read_file, args) != 0) { 523 | perror("Could not create thread"); 524 | break; 525 | } 526 | threads_created++; 527 | } 528 | for (int t = 0; t < threads_created; t++) { 529 | struct read_file_result *result; 530 | pthread_join(threads[t], (void **)&result); 531 | results[t] = *result; 532 | free(result); 533 | } 534 | if (threads_created < num) { 535 | for (int i = 0; i < num; i++) { 536 | free(results[i].data); 537 | } 538 | free(results); 539 | return NULL; 540 | } 541 | return results; 542 | } 543 | 544 | /** 545 | * Appends all file data from results to the packfile, writing the new index 546 | * entries to new_entries. 547 | */ 548 | int write_to_packfile( 549 | struct read_file_result *results, 550 | int num_results, 551 | struct index_entry *new_entries, 552 | char *added_file_paths[], 553 | FILE *packfile, 554 | char *indexdir, 555 | char *filenames[]) { 556 | int files_added = 0; 557 | for (int i = 0; i < num_results; i++) { 558 | struct read_file_result result = results[i]; 559 | if (result.length > 0) { 560 | long offset = write_data_to_packfile( 561 | result.data, result.length, packfile); 562 | if (offset < 0) { 563 | continue; 564 | } 565 | new_entries[files_added].hash = string_to_hash(filenames[i]); 566 | new_entries[files_added].packfile_offset = htobe64(offset); 567 | added_file_paths[files_added] = add_path_parts(indexdir, filenames[i]); 568 | files_added++; 569 | } else { 570 | if (result.error > 0 && result.error != EACCES) { 571 | fprintf(stderr, "Error reading file %s: %s\n", filenames[i], 572 | strerror(result.error)); 573 | } else if (result.error == -1) { 574 | char *path = add_path_parts(indexdir, filenames[i]); 575 | fprintf(stderr, "File was corrupted and removed: %s", path); 576 | free(path); 577 | } 578 | } 579 | } 580 | return files_added; 581 | } 582 | 583 | /** 584 | * Adds num_loose loose files to the packfile. 585 | * Returns a pointer to index entries for the now-packed files. 586 | */ 587 | struct index_entry *add_loose_files_to_packfile( 588 | int *num_loose, char *indexdir, char *file_paths[], 589 | FILE *packfile, char* lock_path) { 590 | static const int parallel_reads = 50; 591 | 592 | struct index_entry *new_entries = malloc( 593 | sizeof(struct index_entry) * *num_loose); 594 | if (new_entries == NULL){ 595 | perror("Error: Memory not allocated"); 596 | return(NULL); 597 | } 598 | time_t last_lockfile_touch = time(NULL); 599 | 600 | DIR *dir = opendir(indexdir); 601 | if (dir == NULL){ 602 | perror("Error in opening directory"); 603 | free(new_entries); 604 | return NULL; 605 | } 606 | 607 | struct dirent *entry; 608 | int files_added = 0; 609 | char *filenames_buffer[parallel_reads]; 610 | int buffer_size = 0; 611 | while (1) { 612 | entry = readdir(dir); 613 | if (entry != NULL) { 614 | if (strcmp(entry->d_name, PACKFILE_NAME) == 0 615 | || strcmp(entry->d_name, PACKFILE_INDEX_NAME) == 0 616 | || entry->d_name[0] == '.') { 617 | continue; 618 | } 619 | 620 | filenames_buffer[buffer_size] = malloc(strlen(entry->d_name) + 1); 621 | strcpy(filenames_buffer[buffer_size], entry->d_name); 622 | buffer_size++; 623 | } 624 | time_t curr_time = time(NULL); 625 | if (curr_time > last_lockfile_touch + 60) { 626 | lockfile_touch(lock_path); 627 | last_lockfile_touch = curr_time; 628 | } 629 | int buffer_full = buffer_size == parallel_reads; 630 | int enough_files = entry == NULL || files_added + buffer_size == *num_loose; 631 | if (buffer_full || enough_files) { 632 | // read some files 633 | struct read_file_result *results = read_files_in_parallel( 634 | filenames_buffer, buffer_size, indexdir); 635 | files_added += write_to_packfile( 636 | results, buffer_size, new_entries + files_added, file_paths + 637 | files_added, packfile, indexdir, filenames_buffer); 638 | for (int i = 0; i < buffer_size; i++) { 639 | free(filenames_buffer[i]); 640 | free(results[i].data); 641 | } 642 | free(results); 643 | buffer_size = 0; 644 | } 645 | if (enough_files) { 646 | break; 647 | } 648 | } 649 | *num_loose = files_added; 650 | closedir(dir); 651 | return new_entries; 652 | } 653 | 654 | /*--------------------------------------------------------------------*/ 655 | 656 | /** 657 | * Merges the two sorted arrays arr1 and arr2 and stores the result in result. 658 | */ 659 | void two_finger_merge(struct index_entry *arr1, int arr1size, 660 | struct index_entry *arr2, int arr2size, 661 | struct index_entry *result) { 662 | int i1 = 0; 663 | int i2 = 0; 664 | for (int r = 0; r < arr1size + arr2size; r++) { 665 | if (i1 < arr1size 666 | && (i2 >= arr2size || arr1[i1].hash < arr2[i2].hash)) { 667 | result[r] = arr1[i1]; 668 | i1++; 669 | } else { 670 | result[r] = arr2[i2]; 671 | i2++; 672 | } 673 | } 674 | } 675 | 676 | /*--------------------------------------------------------------------*/ 677 | 678 | /** 679 | * Adds all of the new index entries to the packfile index. 680 | * Re-sorts as needed. 681 | */ 682 | int add_entries_to_index(struct index_entry *new_entries, 683 | int num_new_entries, char *indexdir) { 684 | // read old index into new index buffer 685 | int ret_val = -1; 686 | char *packfile_index_path = add_path_parts( 687 | indexdir, PACKFILE_INDEX_NAME); 688 | create_file_if_nonexistent(packfile_index_path); 689 | FILE *packfile_index = fopen(packfile_index_path, "r"); 690 | if (packfile_index == NULL) { 691 | perror("Error opening packfile index"); 692 | free(packfile_index_path); 693 | return ret_val; 694 | } 695 | 696 | size_t num_existing = get_num_index_entries(packfile_index); 697 | size_t new_index_length = num_existing + num_new_entries; 698 | struct index_entry *new_index = malloc(new_index_length * 699 | sizeof(struct index_entry)); 700 | struct index_entry *old_index = malloc(num_existing * 701 | sizeof(struct index_entry)); 702 | int read_amount = fread(old_index, sizeof(struct index_entry), 703 | num_existing, packfile_index); 704 | fclose(packfile_index); 705 | 706 | if (read_amount < 0) { 707 | perror("Error reading index file"); 708 | goto OUT1; 709 | } 710 | assert(read_amount == num_existing); 711 | 712 | qsort(new_entries, num_new_entries, sizeof(struct index_entry), 713 | compare_index_entries); 714 | two_finger_merge(old_index, num_existing, new_entries, num_new_entries, 715 | new_index); 716 | char *tmpfile_path = add_path_parts(indexdir, 717 | TEMP_PACKFILE_INDEX_NAME); 718 | write_new_index(new_index, new_index_length, tmpfile_path); 719 | rename(tmpfile_path, packfile_index_path); 720 | 721 | free(tmpfile_path); 722 | ret_val = 0; 723 | goto OUT1; 724 | 725 | OUT1: 726 | free(packfile_index_path); 727 | free(new_index); 728 | free(old_index); 729 | return ret_val; 730 | } 731 | 732 | /*--------------------------------------------------------------------*/ 733 | 734 | struct delete_files_thread_args { 735 | char **file_paths; 736 | int num_to_delete; 737 | }; 738 | 739 | void *delete_files_thread_work(void *args) { 740 | struct delete_files_thread_args *actual_args = args; 741 | for (int i = 0; i < actual_args->num_to_delete; i++) { 742 | remove(actual_args->file_paths[i]); 743 | } 744 | free(actual_args); 745 | return 0; 746 | } 747 | 748 | /** 749 | * Delete the files that were in directory but now in packfile. 750 | * 751 | * Files are deleted in parallel across 50 threads. 752 | */ 753 | void delete_loose_files(char *file_paths[], int num_loose){ 754 | if (num_loose == 0) { 755 | return; 756 | } 757 | int num_threads = num_loose > 50 ? 50 : num_loose; 758 | pthread_t threads[num_threads]; 759 | int base_files_per_thread = num_loose / num_threads; 760 | int deletes_delegated = 0; 761 | int threads_created = 0; 762 | while (deletes_delegated < num_loose) { 763 | struct delete_files_thread_args *args = malloc(sizeof(*args)); 764 | int num_to_delete; 765 | if (threads_created < num_loose % num_threads) { 766 | num_to_delete = base_files_per_thread + 1; 767 | } else { 768 | num_to_delete = base_files_per_thread; 769 | } 770 | args->file_paths = &file_paths[deletes_delegated]; 771 | args->num_to_delete = num_to_delete; 772 | if (pthread_create(&threads[threads_created], NULL, delete_files_thread_work, args)) { 773 | free(args); 774 | perror("Error starting deletion thread"); 775 | goto OUT2; 776 | } 777 | deletes_delegated += num_to_delete; 778 | threads_created++; 779 | } 780 | 781 | OUT2: 782 | for (int t = 0; t < threads_created; t++) { 783 | pthread_join(threads[t], NULL); 784 | } 785 | } 786 | 787 | /*--------------------------------------------------------------------*/ 788 | 789 | /** 790 | * Scans the index directory for files not in the packfile. 791 | * Each found file is read, inserted into the packfile, and deleted. 792 | * The packfile index is updated as well. 793 | */ 794 | int pack_loose_files_in_subdir(char *index_subdir) { 795 | // add all the loose files to the packfile and 796 | // create index entries for them 797 | int ret_val = -1; 798 | mode_t old_umask = umask(0); 799 | 800 | char *packfile_path = add_path_parts(index_subdir, PACKFILE_NAME); 801 | create_file_if_nonexistent(packfile_path); 802 | 803 | char *packfile_lock = add_path_parts(index_subdir, PACKFILE_LOCK_NAME); 804 | int ret = lockfile_create(packfile_lock, 0, 0); 805 | if(ret != 0){ 806 | free(packfile_lock); 807 | free(packfile_path); 808 | return(ret_val); 809 | } 810 | 811 | FILE *packfile = fopen(packfile_path, "a"); 812 | free(packfile_path); 813 | if (packfile == NULL) { 814 | lockfile_remove(packfile_lock); 815 | free(packfile_lock); 816 | perror("Error opening packfile"); 817 | return(ret_val); 818 | } 819 | 820 | // figure our how many loose files there are 821 | int num_loose = count_loose_files(index_subdir); 822 | char *file_paths[num_loose]; 823 | 824 | if (num_loose == 0) { 825 | goto OUT1; 826 | } 827 | struct index_entry *new_entries = add_loose_files_to_packfile( 828 | &num_loose, index_subdir, file_paths, packfile, packfile_lock); 829 | 830 | if (new_entries == NULL){ 831 | goto OUT1; 832 | } else if (num_loose == 0) { 833 | free(new_entries); 834 | goto OUT1; 835 | } 836 | 837 | fflush(packfile); 838 | int fd = fileno(packfile); 839 | fsync(fd); 840 | 841 | add_entries_to_index(new_entries, num_loose, index_subdir); 842 | free(new_entries); 843 | delete_loose_files(file_paths, num_loose); 844 | for (int i = 0; i < num_loose; i++) { 845 | free(file_paths[i]); 846 | } 847 | 848 | ret_val = 0; 849 | goto OUT1; 850 | 851 | OUT1: 852 | fclose(packfile); 853 | lockfile_remove(packfile_lock); 854 | free(packfile_lock); 855 | umask(old_umask); 856 | return(ret_val); 857 | } 858 | 859 | int pack_loose_files(char *indexdir) { 860 | DIR *dir = opendir(indexdir); 861 | if (dir == NULL){ 862 | perrorf("Error in opening directory: %s", indexdir); 863 | return(-1); 864 | } 865 | struct dirent *entry; 866 | while ((entry = readdir(dir))) { 867 | if (entry->d_name[0] == '.' ) { 868 | continue; 869 | } 870 | char *path = add_path_parts(indexdir, entry->d_name); 871 | if (is_dir(path)) { 872 | pack_loose_files_in_subdir(path); 873 | } 874 | free(path); 875 | } 876 | closedir(dir); 877 | 878 | return 0; 879 | } 880 | -------------------------------------------------------------------------------- /bitmap/src/packfile.h: -------------------------------------------------------------------------------- 1 | #ifndef PACKFILE_INCLUDED 2 | #define PACKFILE_INCLUDED 3 | 4 | #include 5 | #include 6 | 7 | /*--------------------------------------------------------------------*/ 8 | 9 | #define PACKFILE_NAME "packfile" 10 | #define PACKFILE_INDEX_NAME "packfile_index" 11 | #define TEMP_PACKFILE_INDEX_NAME ".packfile_index.tmp" 12 | #define PACKFILE_LOCK_NAME ".packfile.lock" 13 | #define EMPTY_FILE 1 14 | 15 | /*--------------------------------------------------------------------*/ 16 | 17 | int is_corrupted(FILE* loosefile); 18 | 19 | uint8_t *read_from_packfile(char *filename, int64_t mtime, char *store); 20 | 21 | int pack_loose_files(char *indexdir); 22 | 23 | int pack_loose_files_in_subdir(char *index_subdir); 24 | 25 | int remove_if_corrupted(FILE *file, char *file_path); 26 | 27 | /*--------------------------------------------------------------------*/ 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /bitmap/src/util.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "util.h" 14 | 15 | /*--------------------------------------------------------------------*/ 16 | 17 | /** 18 | * Allocates a new string consisting of dir + '/' + filename. 19 | * The combined strings must not exceed PATH_MAX-1 in length. 20 | */ 21 | char *add_path_parts(char *dir, char *filename) { 22 | char *path = malloc(PATH_MAX); 23 | strcpy(path, dir); 24 | strcat(path, "/"); 25 | strcat(path, filename); 26 | return path; 27 | } 28 | 29 | /*--------------------------------------------------------------------*/ 30 | 31 | /** 32 | * Returns whether we can read from and write to the given directory. 33 | */ 34 | int is_directory_readwritable(char *path) { 35 | return (access(path, R_OK) == 0 36 | && access(path, W_OK) == 0); 37 | } 38 | 39 | /*--------------------------------------------------------------------*/ 40 | 41 | /** 42 | * Returns directory where bitmaps are currently stored, by checking 43 | * the first available directory from the list. 44 | */ 45 | char *get_index_directory() { 46 | static char *indexdir = NULL; 47 | if (indexdir != NULL) { 48 | return indexdir; 49 | } 50 | if (is_directory_readwritable("/4gram/")) { 51 | return (indexdir = "/4gram"); 52 | } 53 | char *home_cache_dir = add_path_parts(getenv("HOME"), ".cache"); 54 | char *home_4gram_dir = add_path_parts(home_cache_dir, "4gram"); 55 | mkdir(home_cache_dir, 0700); 56 | mkdir(home_4gram_dir, 0777); 57 | if (is_directory_readwritable(home_4gram_dir)) { 58 | return (indexdir = home_4gram_dir); 59 | } 60 | perror("Could not find readwritable directory to cache 4grams\n"); 61 | return(NULL); 62 | } 63 | 64 | /*--------------------------------------------------------------------*/ 65 | 66 | /** 67 | * Returns what index subdirectory we should store the index for a file with 68 | * the given timestamp. 69 | * 70 | * These subdirectories are of the form "indexdir/YYYY_MM" 71 | */ 72 | char *get_index_subdirectory(char *indexdir, int64_t timestamp) { 73 | struct tm *gmt = gmtime(×tamp); 74 | char date_string[8]; 75 | strftime(date_string, sizeof(date_string), "%Y_%m", gmt); 76 | char *index_subdir = add_path_parts(indexdir, date_string); 77 | mkdir(index_subdir, 0777); 78 | return index_subdir; 79 | } 80 | 81 | /*--------------------------------------------------------------------*/ 82 | 83 | /** 84 | * Determines at runtime whether our CPU supports BMI2 instructions. 85 | */ 86 | int supports_bmi2() { 87 | static int supports_bmi2_cache = -1; 88 | if (supports_bmi2_cache != -1) { 89 | return supports_bmi2_cache; 90 | } 91 | unsigned int level = 0; 92 | unsigned int eax = 1; 93 | unsigned int ebx, ecx, edx; 94 | __get_cpuid(level, &eax, &ebx, &ecx, &edx); 95 | supports_bmi2_cache = (ebx >> 8) & 1; 96 | return supports_bmi2_cache; 97 | } 98 | 99 | /*--------------------------------------------------------------------*/ 100 | 101 | /** 102 | * Frees the data stored in the given array. 103 | */ 104 | void free_intarray(struct intarray arr) { 105 | free(arr.data); 106 | } 107 | 108 | /** 109 | * Frees the data stored by the given array array. 110 | * 111 | * Recursively frees all sub-arrays. 112 | */ 113 | void free_intarrayarray(struct intarrayarray arr) { 114 | for (int i = 0; i < arr.num_rows; i++) { 115 | free_intarray(arr.rows[i]); 116 | } 117 | free(arr.rows); 118 | } 119 | 120 | /** 121 | * Like perror, but uses a format string. 122 | */ 123 | void perrorf(char *fmt, ...) { 124 | va_list args; 125 | va_start(args, fmt); 126 | vfprintf(stderr, fmt, args); 127 | va_end(args); 128 | fprintf(stderr, ": "); 129 | perror(""); 130 | } 131 | 132 | /** 133 | * Returns the mtime of the file entry at the given path. 134 | */ 135 | int64_t get_mtime(char *path) { 136 | struct stat s; 137 | stat(path, &s); 138 | return s.st_mtime; 139 | } 140 | 141 | /** 142 | * Returns whether path points to a directory or not. 143 | */ 144 | int is_dir(char *path) { 145 | struct stat s; 146 | stat(path, &s); 147 | return S_ISDIR(s.st_mode); 148 | } 149 | -------------------------------------------------------------------------------- /bitmap/src/util.h: -------------------------------------------------------------------------------- 1 | #ifndef UTIL_INCLUDED 2 | #define UTIL_INCLUDED 3 | 4 | #include 5 | 6 | /*--------------------------------------------------------------------*/ 7 | 8 | #define NGRAM_CHARS 5 9 | #define NGRAM_CHAR_BITS 4 10 | #define POSSIBLE_NGRAMS ((1u) << (NGRAM_CHARS * NGRAM_CHAR_BITS)) 11 | #define SIZEOF_BITMAP (POSSIBLE_NGRAMS / 8) 12 | 13 | #define BUFSIZE 2048 14 | #define CHAR_MASK ((1 << NGRAM_CHAR_BITS) - 1) 15 | #define NGRAM_MASK (POSSIBLE_NGRAMS - 1) 16 | #define NGRAM_SHIFT_LEFT_MASK (NGRAM_MASK - CHAR_MASK) 17 | #define HASH_SEED 0xfe5000 //purestorage color 18 | 19 | #define GZ_TRUNCATED 1 20 | 21 | /*--------------------------------------------------------------------*/ 22 | 23 | char *add_path_parts(char *dir, char *filename); 24 | 25 | char *get_bitmap_store_directory(); 26 | 27 | int supports_bmi2(); 28 | 29 | struct intarray { 30 | int length; 31 | int *data; 32 | }; 33 | 34 | void free_intarray(struct intarray arr); 35 | 36 | struct intarrayarray { 37 | int num_rows; 38 | struct intarray *rows; 39 | }; 40 | 41 | void free_intarrayarray(struct intarrayarray arr); 42 | 43 | void perrorf(char *fmt, ...) 44 | __attribute__((format (printf, 1, 2))); 45 | 46 | int64_t get_mtime(char *path); 47 | 48 | char *get_lock_path(char *directory, char *filename); 49 | 50 | char *get_index_subdirectory(char *indexdir, int64_t timestamp); 51 | 52 | int is_dir(char *path); 53 | 54 | /*--------------------------------------------------------------------*/ 55 | 56 | #endif 57 | -------------------------------------------------------------------------------- /build_deb.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | docker build -t 4grep-docker docker_build 3 | docker run --rm -v $(pwd):/build \ 4 | -e COMMIT_COUNT="$(git rev-list HEAD --count)" \ 5 | -e COMMIT_HASH="$(git rev-parse HEAD)" \ 6 | 4grep-docker 7 | docker rmi 4grep-docker 8 | -------------------------------------------------------------------------------- /debian/4grep.links: -------------------------------------------------------------------------------- 1 | /usr/lib/4grep.so /usr/lib/lib4grep.so 2 | /usr/lib/lib4grep.so /usr/lib/lib4grep.so.0 3 | -------------------------------------------------------------------------------- /debian/changelog: -------------------------------------------------------------------------------- 1 | 4grep (0.0.1) trusty; urgency=low 2 | 3 | * Initial release. 4 | 5 | -- MAINTAINER Wed, 09 Aug 2017 17:11:46 -0600 6 | -------------------------------------------------------------------------------- /debian/compat: -------------------------------------------------------------------------------- 1 | 9 2 | -------------------------------------------------------------------------------- /debian/control: -------------------------------------------------------------------------------- 1 | Source: 4grep 2 | Maintainer: Matthew Pfeiffer 3 | Build-Depends: debhelper (>=8.0.0), gcc (>=4.9.0), liblockfile-dev, zlib1g-dev 4 | Standards-Version: 3.9.7 5 | Section: utils 6 | 7 | Package: 4grep 8 | Priority: extra 9 | Architecture: any 10 | Depends: python, liblockfile1, zlib1g, ${shlibs:Depends}, ${misc:Depends} 11 | Description: like tgrep, but better 12 | Greps over files with a persistent index and progress bar. 13 | -------------------------------------------------------------------------------- /debian/copyright: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/purestorage/4grep/1b721ea3ab1f284a4b41083b34f1540a90b76f6a/debian/copyright -------------------------------------------------------------------------------- /debian/postinst: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # postinst script for 4grep 3 | # 4 | # see: dh_installdeb(1) 5 | 6 | set -e 7 | 8 | # summary of how this script can be called: 9 | # * `configure' 10 | # * `abort-upgrade' 11 | # * `abort-remove' `in-favour' 12 | # 13 | # * `abort-remove' 14 | # * `abort-deconfigure' `in-favour' 15 | # `removing' 16 | # 17 | # for details, see https://www.debian.org/doc/debian-policy/ or 18 | # the debian-policy package 19 | 20 | 21 | case "$1" in 22 | configure) 23 | ldconfig 24 | ;; 25 | 26 | abort-upgrade|abort-remove|abort-deconfigure) 27 | ;; 28 | 29 | *) 30 | echo "postinst called with unknown argument \`$1'" >&2 31 | exit 1 32 | ;; 33 | esac 34 | 35 | # dh_installdeb will replace this with shell code automatically 36 | # generated by other debhelper scripts. 37 | 38 | #DEBHELPER# 39 | 40 | exit 0 41 | -------------------------------------------------------------------------------- /debian/rules: -------------------------------------------------------------------------------- 1 | #!/usr/bin/make -f 2 | %: 3 | dh $@ 4 | -------------------------------------------------------------------------------- /debian/source/format: -------------------------------------------------------------------------------- 1 | 3.0 (native) 2 | -------------------------------------------------------------------------------- /debian/source/options: -------------------------------------------------------------------------------- 1 | tar-ignore = "4grepc" 2 | -------------------------------------------------------------------------------- /description: -------------------------------------------------------------------------------- 1 | 4grep -------------------------------------------------------------------------------- /disp_bitmap.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | from itertools import izip 4 | from Queue import Queue 5 | from PIL import Image 6 | 7 | import multiprocessing 8 | import subprocess 9 | import argparse 10 | import sys 11 | import os 12 | 13 | HELP = ''' 14 | This is how you use it. 15 | ''' 16 | 17 | def write_bitmaps(): 18 | for filename in os.listdir("/Users/user/Desktop/logs/upstart"): 19 | if filename.endswith(".gz"): 20 | proc = subprocess.Popen(["bitmap/exec/generate_bitmap"], stdout=subprocess.PIPE, stdin=subprocess.PIPE) 21 | ret, stderr = proc.communicate(filename) 22 | file = open(filename[0:-3]+'.bin',"w") 23 | file.write(ret) 24 | file.close() 25 | 26 | def ratio(im): 27 | pixels = im.getdata() 28 | threshold = 100 29 | count = 0 30 | for pixel in pixels: 31 | if pixel > threshold: 32 | count += 1 33 | n = len(pixels) 34 | print('Percentage:{:.2f} Black:{} Size:{}'.format(100.0*count/n, count, n)) 35 | 36 | 37 | class FinalArray(object): 38 | def __init__(self): 39 | self.bytelist = bytearray(131072) 40 | 41 | def byte_or(bytearray_list, i): 42 | a = 0 43 | for bytearray in bytearray_list: 44 | a = a | bytearray[i] 45 | return (a,i) 46 | 47 | def update_final(result, final): 48 | a , i = result 49 | print(i) 50 | final.bytelist[i] = a 51 | 52 | def combine_bitmaps(bytearray_list): 53 | l = len(bytearray_list[0]) 54 | print(l) 55 | final = FinalArray() 56 | for i in range(l): 57 | r = byte_or(bytearray_list, i) 58 | update_final(r, final) 59 | return final 60 | 61 | def get_byte_list(): 62 | results = [] 63 | for filename in os.listdir("/Users/user/Desktop/logs/bitmaps"): 64 | if filename.endswith(".bin"): 65 | bin_file_tmp = open('../Desktop/logs/bitmaps/' + filename, 'rb') 66 | results.append(bytearray(bin_file_tmp.read())) 67 | return results 68 | 69 | # a = get_byte_list() 70 | # b = combine_bitmaps(a) 71 | # im = Image.frombytes("1", (1024, 1024), str(b.bytelist)) 72 | # im.show() 73 | 74 | class Progress(object): 75 | def __init__(self): 76 | self.init = 0 77 | self.curr = 0 78 | 79 | def print_progress(progress): 80 | perc = 100-((progress.curr-1)*100.0/(progress.init-1)) 81 | print('>>{:.1f}%\033[K\033[F'.format(perc), file=sys.stderr) 82 | 83 | def start(): 84 | bitmap_queue = Queue() 85 | l = 131072 86 | for filename in os.listdir("/Users/user/Desktop/logs/bitmaps"): 87 | if filename.endswith(".bin"): 88 | bin_file_tmp = open('../Desktop/logs/bitmaps/' + filename, 'rb') 89 | ba = bytearray(bin_file_tmp.read()) 90 | bitmap_queue.put(ba) 91 | 92 | prog = Progress() 93 | prog.init = bitmap_queue.qsize() 94 | prog.curr = bitmap_queue.qsize() 95 | 96 | while prog.curr > 1: 97 | print_progress(prog) 98 | a = bitmap_queue.get() 99 | b = bitmap_queue.get() 100 | c = bytearray(l) 101 | 102 | for i in range(l): 103 | c[i] = a[i] | b[i] 104 | bitmap_queue.put(c) 105 | prog.curr += -1 106 | 107 | final = bitmap_queue.get() 108 | im = Image.frombytes("1", (1024, 1024), str(final)) 109 | im.show() 110 | ratio(im) 111 | 112 | class stdin_iter: 113 | def __init__(self): 114 | pass 115 | 116 | def __iter__(self): 117 | return self 118 | 119 | def next(self): 120 | ret = sys.stdin.readline().strip() 121 | if not ret: 122 | raise StopIteration 123 | return ret 124 | 125 | def main(): 126 | parser = argparse.ArgumentParser("disp_bitmap", usage=HELP, add_help=False) 127 | parser.add_argument('files', metavar='FILE', type=str, nargs='*') 128 | args, options = parser.parse_known_args() 129 | filelist = args.files 130 | if not filelist: 131 | filelist = stdin_iter() 132 | start() 133 | 134 | if __name__ == "__main__": 135 | try: 136 | main() 137 | except IOError as e: 138 | if e.errno == errno.EPIPE: 139 | pass 140 | except KeyboardInterrupt: 141 | pass 142 | -------------------------------------------------------------------------------- /docker_build/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:xenial 2 | 3 | ENV DEBIAN_FRONTEND=noninteractive 4 | 5 | RUN mkdir /build 6 | 7 | RUN apt-get update && apt-get install -y --force-yes \ 8 | gcc-4.8 \ 9 | libc6 \ 10 | build-essential \ 11 | liblockfile-dev \ 12 | zlib1g-dev \ 13 | git \ 14 | python-pip \ 15 | python-dev \ 16 | devscripts \ 17 | debhelper 18 | 19 | WORKDIR "/build" 20 | CMD git submodule init && git submodule update && make test && sh docker_build/docker_build_deb.sh 21 | -------------------------------------------------------------------------------- /docker_build/docker_build_deb.sh: -------------------------------------------------------------------------------- 1 | mkdir build 2 | cp -r * build 3 | cd build 4 | make 5 | dch -v "1.0.0-$COMMIT_COUNT" "$COMMIT_HASH" 6 | debuild -i -I -us -uc -b 7 | cd .. 8 | rm -rf build 9 | -------------------------------------------------------------------------------- /img/example.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/purestorage/4grep/1b721ea3ab1f284a4b41083b34f1540a90b76f6a/img/example.gif -------------------------------------------------------------------------------- /img/zgrepvs4grep.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/purestorage/4grep/1b721ea3ab1f284a4b41083b34f1540a90b76f6a/img/zgrepvs4grep.png -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import unittest 4 | import tempfile 5 | import os 6 | import ctypes 7 | import imp 8 | import shutil 9 | import subprocess 10 | import sys 11 | 12 | TGREP_DIR = os.path.dirname(os.path.realpath(__file__)) 13 | TGREP_FILE = os.path.join(TGREP_DIR, '4grep') 14 | 15 | tgrep = imp.load_source('4grep', TGREP_FILE) 16 | 17 | TRUNC = 0 18 | MTCH = 1 19 | NO_MTCH = 2 20 | 21 | class TestFiltering(unittest.TestCase): 22 | def setUp(self): 23 | self.tempdir = tempfile.mkdtemp() 24 | self.tempindex = tempfile.mkdtemp() 25 | 26 | def tearDown(self): 27 | shutil.rmtree(self.tempdir) 28 | shutil.rmtree(self.tempindex) 29 | 30 | def test_filter(self): 31 | index = tgrep.StringIndex([[str(10 ** tgrep.NGRAM_CHARS)]]) 32 | c_index = index.get_index_struct() 33 | for i in range(10): 34 | name = os.path.join(self.tempdir, '{}.txt'.format(i)) 35 | f = open(name, 'w') 36 | f.write(str(i * 10 ** tgrep.NGRAM_CHARS)) 37 | f.close() 38 | 39 | c_filename = ctypes.c_char_p(name) 40 | # first run through: no bitmaps, 2nd bit should be set 41 | ret = tgrep.start_filter(c_index, c_filename, self.tempindex) 42 | if i == 1: 43 | self.assertEqual(ret, 3) 44 | else: 45 | self.assertEqual(ret, 4) 46 | # 2nd run through: all bitmaps should be cached, 2nd bit unset 47 | ret = tgrep.start_filter(c_index, c_filename, self.tempindex) 48 | if i == 1: 49 | self.assertEqual(ret, 1) 50 | else: 51 | self.assertEqual(ret, 2) 52 | 53 | def test_filter_deletedfiles(self): 54 | index = tgrep.StringIndex([[str(10 ** tgrep.NGRAM_CHARS)]]) 55 | c_index = index.get_index_struct() 56 | for i in range(10): 57 | name = os.path.join(self.tempdir, '{}.txt'.format(i)) 58 | c_filename = ctypes.c_char_p(name) 59 | # no files exist, so should filter files out 60 | ret = tgrep.start_filter(c_index, c_filename, self.tempindex) 61 | self.assertEqual(ret, -1) 62 | 63 | def test_filter_modifiedfiles(self): 64 | index = tgrep.StringIndex([[str(10 ** tgrep.NGRAM_CHARS)]]) 65 | c_index = index.get_index_struct() 66 | # write garbage to each file with an old modification time 67 | for i in range(10): 68 | name = os.path.join(self.tempdir, '{}.txt'.format(i)) 69 | f = open(name, 'w') 70 | f.write(str(i * 9 ** tgrep.NGRAM_CHARS)) 71 | f.close() 72 | os.utime(name, (100, 100)) 73 | # make sure nothing is found when we search 74 | for i in range(10): 75 | name = os.path.join(self.tempdir, '{}.txt'.format(i)) 76 | c_filename = ctypes.c_char_p(name) 77 | ret = tgrep.start_filter(c_index, c_filename, self.tempindex) 78 | self.assertEqual(ret, 4) 79 | # modify each file with mtime=real, current time 80 | for i in range(10): 81 | name = os.path.join(self.tempdir, '{}.txt'.format(i)) 82 | f = open(name, 'w') 83 | f.write(str(i * 10 ** tgrep.NGRAM_CHARS)) 84 | f.close() 85 | # the query should now match file 1. 86 | for i in range(10): 87 | name = os.path.join(self.tempdir, '{}.txt'.format(i)) 88 | c_filename = ctypes.c_char_p(name) 89 | ret = tgrep.start_filter(c_index, c_filename, self.tempindex) 90 | if i == 1: 91 | self.assertEqual(ret, 3) 92 | else: 93 | self.assertEqual(ret, 4) 94 | 95 | class TestIndexAutodetection(unittest.TestCase): 96 | def test_parsable_chars(self): 97 | self.assertEqual( 98 | tgrep.get_index_from_regex('12345.54321'), 99 | tgrep.StringIndex([['12345', '54321']])) 100 | self.assertEqual( 101 | tgrep.get_index_from_regex('12345+54321'), 102 | tgrep.StringIndex([['12345', '54321']])) 103 | self.assertEqual( 104 | tgrep.get_index_from_regex('12345.*54321'), 105 | tgrep.StringIndex([['12345', '54321']])) 106 | 107 | def test_short(self): 108 | self.assertEqual( 109 | tgrep.get_index_from_regex('1234'), 110 | tgrep.empty_index()) 111 | self.assertEqual( 112 | tgrep.get_index_from_regex(''), 113 | tgrep.empty_index()) 114 | self.assertEqual( 115 | tgrep.get_index_from_regex('1234|4321'), 116 | tgrep.empty_index()) 117 | self.assertEqual( 118 | tgrep.get_index_from_regex('1234|4321.*4321'), 119 | tgrep.empty_index()) 120 | 121 | def test_regex_or(self): 122 | self.assertEqual( 123 | tgrep.get_index_from_regex('one111|two22|three'), 124 | tgrep.StringIndex([['one111'], ['two22'], ['three']])) 125 | self.assertTrue( 126 | tgrep.get_index_from_regex('one***111|two22|three').empty()) 127 | self.assertEqual( 128 | tgrep.get_index_from_regex('12345.54321|two22|three'), 129 | tgrep.StringIndex([['12345', '54321'], ['two22'], ['three']])) 130 | 131 | def test_literal(self): 132 | self.assertEqual( 133 | tgrep.get_index_from_regex('qwertyuiop'), 134 | tgrep.StringIndex([['qwertyuiop']])) 135 | 136 | def test_regex_question_mark(self): 137 | self.assertTrue( 138 | tgrep.get_index_from_regex('12345?').empty()) 139 | 140 | def test_regex_star(self): 141 | self.assertTrue( 142 | tgrep.get_index_from_regex('12345*').empty()) 143 | 144 | def test_regex_curly_braces(self): 145 | self.assertTrue( 146 | tgrep.get_index_from_regex('12345{0,9}').empty()) 147 | 148 | class TestStringIndex(unittest.TestCase): 149 | def test_get_index_struct(self): 150 | si = tgrep.StringIndex([['aaaaa']]) 151 | struct = si.get_index_struct() 152 | self.assertEqual(struct.num_rows, 1) 153 | self.assertEqual(struct.rows[0].length, 1) 154 | self.assertEqual(struct.rows[0].data[0], 0b00010001000100010001) 155 | 156 | si = tgrep.StringIndex([['aaaaa'], ['bbbbb']]) 157 | struct = si.get_index_struct() 158 | self.assertEqual(struct.num_rows, 2) 159 | self.assertEqual(struct.rows[0].length, 1) 160 | self.assertEqual(struct.rows[0].data[0], 0b00010001000100010001) 161 | self.assertEqual(struct.rows[1].length, 1) 162 | self.assertEqual(struct.rows[1].data[0], 0b00100010001000100010) 163 | 164 | si = tgrep.StringIndex([['aaaaa', 'bbbbb'], ['bbbbb']]) 165 | struct = si.get_index_struct() 166 | self.assertEqual(struct.num_rows, 2) 167 | self.assertEqual(struct.rows[0].length, 2) 168 | self.assertEqual(struct.rows[0].data[0], 0b00010001000100010001) 169 | self.assertEqual(struct.rows[0].data[1], 0b00100010001000100010) 170 | 171 | class TestTgrep(unittest.TestCase): 172 | def setUp(self): 173 | self.tempdir = tempfile.mkdtemp() 174 | 175 | def tearDown(self): 176 | shutil.rmtree(self.tempdir) 177 | 178 | def test_tgrep(self): 179 | index = str(10 ** tgrep.NGRAM_CHARS) 180 | for i in range(10): 181 | name = os.path.join(self.tempdir, '{}.txt'.format(i)) 182 | f = open(name, 'w') 183 | f.write(str(i * 10 ** tgrep.NGRAM_CHARS)) 184 | f.close() 185 | search = str(10 ** tgrep.NGRAM_CHARS) 186 | command = "{} {} {}/*.txt".format( 187 | TGREP_FILE, search, self.tempdir) 188 | out = subprocess.check_output(command, shell=True) 189 | self.assertEqual(out.strip(), self.tempdir + '/1.txt:' + search) 190 | 191 | def test_tgrep_or_regex(self): 192 | str1 = str(10 ** tgrep.NGRAM_CHARS) 193 | str2 = str(2 * 10 ** tgrep.NGRAM_CHARS) 194 | search = "{}|{}".format(str1, str2) 195 | for i in range(10): 196 | name = os.path.join(self.tempdir, '{}.txt'.format(i)) 197 | f = open(name, 'w') 198 | f.write(str(i * 10 ** tgrep.NGRAM_CHARS)) 199 | f.close() 200 | command = "{} -E '{}' {}/*.txt".format( 201 | TGREP_FILE, search, self.tempdir) 202 | out = subprocess.check_output(command, shell=True) 203 | self.assertEqual(out.strip(), 204 | self.tempdir + '/1.txt:' + str1 + "\n" 205 | + self.tempdir + "/2.txt:" + str2) 206 | 207 | def test_tgrep_unindexed(self): 208 | index = str(10 ** tgrep.NGRAM_CHARS) 209 | c_index = ctypes.c_char_p(index) 210 | for i in range(10): 211 | name = os.path.join(self.tempdir, '{}.txt'.format(i)) 212 | f = open(name, 'w') 213 | f.write(str(i * 10 ** tgrep.NGRAM_CHARS)) 214 | f.close() 215 | search = str(10 ** tgrep.NGRAM_CHARS) 216 | command = "{} --filter='' {} {}/*.txt".format( 217 | TGREP_FILE, search, self.tempdir) 218 | out = subprocess.check_output(command, shell=True) 219 | self.assertEqual(out.strip(), self.tempdir + '/1.txt:' + search) 220 | 221 | if __name__ == '__main__': 222 | unittest.main() 223 | -------------------------------------------------------------------------------- /tune.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | 4 | def set_params(n, b): 5 | f = open("./bitmap/src/util.h") 6 | lines = list(f.readlines()) 7 | n_set = b_set = False 8 | for i, line in enumerate(lines): 9 | if not n_set and 'NGRAM_CHARS' in line: 10 | lines[i] = "#define NGRAM_CHARS {}\n".format(n) 11 | n_set = True 12 | if b_set: 13 | break 14 | elif not b_set and 'NGRAM_CHAR_BITS' in line: 15 | lines[i] = '#define NGRAM_CHAR_BITS {}\n'.format(b) 16 | b_set = True 17 | if n_set: 18 | break 19 | f.close() 20 | with open('./bitmap/src/util.h', 'w') as f: 21 | f.writelines(lines) 22 | 23 | def test_params(n, b): 24 | set_params(n, b) 25 | subprocess.check_call('make') 26 | subprocess.check_call('rm -rf ~/.cache/4gram', shell=True) 27 | search = 'May 10 12:12:12' 28 | print('{} {}'.format(n, b)) 29 | for i in range(2): 30 | p = subprocess.Popen('find /home/mpfeiffer/logs/remote_logs -name "*.gz" -type f | 4grep --index="{}" "{}" > /dev/null'.format(search, search), shell=True, stderr=subprocess.PIPE) 31 | output = p.communicate()[1] 32 | lines = output.split('\n') 33 | lastline = output.split('\n')[-3] 34 | print(lastline) 35 | print(subprocess.check_output('du -h ~/.cache/4gram/packfile') 36 | 37 | for n in range(2, 10 + 1): 38 | for b in range(1, min(31 / n + 1, 8 + 1)): 39 | test_params(n, b) 40 | --------------------------------------------------------------------------------