├── .gitignore ├── .gitmodules ├── .travis.yml ├── BUILD.md ├── COPYING ├── ChangeLog ├── INSTALL ├── Jenkinsfile ├── Makefile.am ├── README.md ├── bootstrap.sh ├── c_example └── main.c ├── configure.ac ├── design ├── .gitignore ├── debruijn │ ├── 10.1.1.13.2152.pdf │ ├── bubble3.pdf │ ├── necklace_allerton.ps │ ├── p752-sawada.pdf │ ├── sdarticle.pdf │ └── weston.pdf ├── fragments.pdf ├── fragments.tex └── grammar ├── include ├── automata.h ├── basic.h ├── byteset.h ├── c_api_util.h ├── chain.h ├── codegen.h ├── compiler.h ├── container_out.h ├── decoders │ ├── asciidecoder.h │ ├── bytesource.h │ ├── decoder.h │ ├── decoderfactory.h │ ├── icudecoder.h │ ├── ocedecoder.h │ ├── rotdecoder.h │ ├── utf16.h │ ├── utf16decoder.h │ ├── utf32.h │ ├── utf32decoder.h │ ├── utf8.h │ ├── utf8decoder.h │ ├── utfdecoderbase.h │ └── xordecoder.h ├── encoders │ ├── ascii.h │ ├── byteencoder.h │ ├── caching_encoder.h │ ├── charencoder.h │ ├── concrete_encoders.h │ ├── decorating_encoder.h │ ├── encoder.h │ ├── encoderbase.h │ ├── encoderfactory.h │ ├── icuencoder.h │ ├── oceencoder.h │ ├── rotencoder.h │ ├── utf16.h │ ├── utf32.h │ ├── utf8.h │ ├── utfbase.h │ └── xorencoder.h ├── fragment.h ├── fsmthingy.h ├── fwd_pointers.h ├── graph.h ├── handles.h ├── icuconverter.h ├── icuutil.h ├── instructions.h ├── lightgrep │ ├── api.h │ ├── search_hit.h │ ├── transforms.h │ └── util.h ├── matchgen.h ├── nfabuilder.h ├── nfaoptimizer.h ├── ostream_join_iterator.h ├── pair_out.h ├── parsenode.h ├── parser.h ├── parsetree.h ├── parseutil.h ├── pattern.h ├── program.h ├── rangeset.h ├── rewriter.h ├── searchhit.h ├── sequences.h ├── simplevectorfamily.h ├── sparseset.h ├── states.h ├── thread.h ├── transition.h ├── transitionfactory.h ├── unicode.h ├── unparser.h ├── utility.h ├── vectorfamily.h ├── vm.h └── vm_interface.h ├── jenkins.sh ├── liblightgrep.spec.in ├── m4 ├── ax_append_compile_flags.m4 ├── ax_append_flag.m4 ├── ax_append_link_flags.m4 ├── ax_boost_base.m4 ├── 
ax_boost_program_options.m4 ├── ax_boost_system.m4 ├── ax_check_compile_flag.m4 ├── ax_check_icu.m4 ├── ax_check_library.m4 ├── ax_check_link_flag.m4 ├── ax_cxx_compile_stdcxx.m4 ├── ax_cxx_compile_stdcxx_11.m4 ├── ax_prog_bison.m4 ├── lg_remove_flags.m4 └── lg_replace_flag.m4 ├── mingw-liblightgrep.spec.in ├── pylightgrep └── lightgrep.py ├── pytest ├── analyze_trace.py ├── corpora │ ├── marktwainworks.txt │ ├── norvig1mb.txt │ ├── norvig6mb.txt │ ├── russian-utf16BE.txt │ ├── russian-utf16LE.txt │ ├── russian-utf32BE.txt │ ├── russian-utf32LE.txt │ ├── russian-utf7.txt │ ├── russian-utf8.txt │ └── utf16LE-norvig6mb.txt ├── guid_gen.py ├── keys │ ├── -----5.txt │ ├── ----10.txt │ ├── ----25.txt │ ├── ----50.txt │ ├── ---100.txt │ ├── ---200.txt │ ├── ---500.txt │ ├── --1000.txt │ ├── --1500.txt │ ├── --2000.txt │ ├── --2500.txt │ ├── --3000.txt │ ├── --4000.txt │ ├── --4500.txt │ ├── --5000.txt │ ├── --6000.txt │ ├── --7000.txt │ ├── --8000.txt │ ├── --9000.txt │ ├── -10000.txt │ ├── -11000.txt │ ├── -12000.txt │ ├── -13000.txt │ ├── -14000.txt │ ├── -15000.txt │ ├── -17500.txt │ ├── -20000.txt │ ├── -25000.txt │ ├── -30000.txt │ ├── -35000.txt │ ├── -40000.txt │ ├── -45000.txt │ ├── -50000.txt │ ├── -60000.txt │ ├── -70000.txt │ ├── -80000.txt │ ├── -90000.txt │ ├── 100000.txt │ ├── 114743.txt │ ├── fixed30.txt │ ├── shuf05.txt │ ├── shuf10.txt │ ├── shuf15.txt │ ├── shuf20.txt │ ├── shuf25.txt │ ├── shuf30.txt │ ├── shuf35.txt │ ├── shuf40.txt │ ├── shuf45.txt │ ├── shuf50.txt │ ├── shuf55.txt │ └── twain.txt ├── receive_data.py ├── regression.py ├── results │ ├── -----5.txt │ ├── ----10.txt │ ├── ----25.txt │ ├── ----50.txt │ ├── ---100.txt │ ├── ---200.txt │ ├── ---500.txt │ ├── --1000.txt │ ├── --1500.txt │ ├── --2000.txt │ ├── --2500.txt │ ├── --3000.txt │ ├── --4000.txt │ ├── --4500.txt │ ├── --5000.txt │ ├── --6000.txt │ ├── --7000.txt │ ├── --8000.txt │ ├── --9000.txt │ ├── -10000.txt │ ├── -11000.txt │ ├── -12000.txt │ ├── -13000.txt │ ├── -14000.txt │ 
├── -15000.txt │ ├── -17500.txt │ ├── -20000.txt │ ├── -25000.txt │ ├── -30000.txt │ ├── -35000.txt │ ├── -40000.txt │ ├── -45000.txt │ ├── -50000.txt │ ├── -60000.txt │ ├── -70000.txt │ ├── -80000.txt │ ├── -90000.txt │ ├── 100000.txt │ └── 114743.txt ├── send_data_to_lg.py ├── tb_db_create.py ├── tinderbox.py └── trace_runner.py ├── re_gen ├── Makefile ├── aQ-3-3.bz2 ├── aQ-3.bz2 ├── basic.h ├── db-abcd-4 ├── lgtestlib.py ├── minfail.py ├── node.cpp ├── node.h ├── parsecheck.cpp ├── parsetree.h ├── pat2data-enc.py ├── pat2data.py ├── pat2sorttest.py ├── pat2swith.py ├── pat2test.py ├── pats-1000 ├── patsample.py ├── randpat.cpp ├── shitgrep.cpp ├── testregex │ ├── basic.dat │ ├── categorize.dat │ ├── forcedassoc.dat │ ├── leftassoc.dat │ ├── nullsubexpr.dat │ ├── repetition.dat │ └── rightassoc.dat ├── tests │ ├── dot-star │ └── dot-star_patterns ├── unparser.cpp ├── unparser.h ├── utf8-gen.py ├── utf8-singles.bz2 └── valid.bz2 ├── src ├── enc │ └── encodings.cpp ├── lib │ ├── ascii.cpp │ ├── automata.cpp │ ├── byteencoder.cpp │ ├── byteset.cpp │ ├── c_api_util.cpp │ ├── chain.cpp │ ├── charencoder.cpp │ ├── codegen.cpp │ ├── compiler.cpp │ ├── decoders │ │ ├── decoder.cpp │ │ ├── decoderfactory.cpp │ │ └── ocedecoder.cpp │ ├── encoderbase.cpp │ ├── encoderfactory.cpp │ ├── fsmthingy.cpp │ ├── icuconverter.cpp │ ├── icuencoder.cpp │ ├── icuutil.cpp │ ├── instructions.cpp │ ├── lightgrep.pc.in │ ├── lightgrep_c_api.cpp │ ├── lightgrep_c_util.cpp │ ├── matchgen.cpp │ ├── nfabuilder.cpp │ ├── nfaoptimizer.cpp │ ├── oceencoder.cpp │ ├── parsenode.cpp │ ├── parser.cpp │ ├── parsetree.cpp │ ├── parseutil.cpp │ ├── pattern.cpp │ ├── program.cpp │ ├── re_grammar.ypp │ ├── rewriter.cpp │ ├── states.cpp │ ├── thread.cpp │ ├── unparser.cpp │ ├── utf8.cpp │ ├── utfbase.cpp │ ├── utility.cpp │ ├── version.rc │ └── vm.cpp ├── val │ └── valid.cpp └── what │ └── what.cpp ├── test ├── data │ ├── hectotest │ ├── hectotest.dat │ ├── kilotest │ └── kilotest.dat ├── data_reader.cpp 
├── data_reader.h ├── dtest.cpp ├── dtest.h ├── executor.h ├── mockcallback.cpp ├── mockcallback.h ├── stest.cpp ├── stest.h ├── test.cpp ├── test_ascii.cpp ├── test_auto_search_1.cpp ├── test_auto_search_2.cpp ├── test_auto_search_3.cpp ├── test_auto_search_4.cpp ├── test_auto_search_5.cpp ├── test_auto_search_6.cpp ├── test_auto_search_7.cpp ├── test_auto_search_8.cpp ├── test_auto_search_multi_1.cpp ├── test_auto_search_multi_2.cpp ├── test_auto_starts_with_1.cpp ├── test_auto_starts_with_2.cpp ├── test_auto_starts_with_3.cpp ├── test_auto_starts_with_4.cpp ├── test_auto_starts_with_5.cpp ├── test_auto_starts_with_6.cpp ├── test_auto_starts_with_7.cpp ├── test_auto_starts_with_multi_1.cpp ├── test_auto_starts_with_multi_2.cpp ├── test_basic.cpp ├── test_byteset.cpp ├── test_bytesource.cpp ├── test_c_api.cpp ├── test_c_util.cpp ├── test_compiler.cpp ├── test_graph.cpp ├── test_helper.cpp ├── test_helper.h ├── test_icu.cpp ├── test_icudecoder.cpp ├── test_icuutil.cpp ├── test_instructions.cpp ├── test_matchgen.cpp ├── test_nfabuilder.cpp ├── test_nfaoptimizer.cpp ├── test_oceencoder.cpp ├── test_ostream_join_iterator.cpp ├── test_parser.cpp ├── test_parseutil.cpp ├── test_program.cpp ├── test_rangeset.cpp ├── test_rewriter.cpp ├── test_rotencoder.cpp ├── test_search.cpp ├── test_search_assertions.cpp ├── test_search_data.cpp ├── test_search_data_driver.cpp ├── test_search_data_driver.h ├── test_sequences.h ├── test_sparseset.cpp ├── test_starts_with.cpp ├── test_states.cpp ├── test_testregex_basic_modified.cpp ├── test_thread.cpp ├── test_transitionfactory.cpp ├── test_unicode.cpp ├── test_unparser.cpp ├── test_utf16.cpp ├── test_utf32.cpp ├── test_utf8.cpp ├── test_utf8decoder.cpp ├── test_utility.cpp ├── test_vm.cpp └── test_xorencoder.cpp └── tools ├── macify.sh └── thread_dump.pl /.gitignore: -------------------------------------------------------------------------------- 1 | *.la 2 | *.lo 3 | *.o 4 | *.pyc 5 | .*.swp 6 | .deps/ 7 | .dirstamp 8 | .libs/ 9 | 
/INSTALL 10 | /Makefile 11 | /Makefile.in 12 | /aclocal.m4 13 | /autom4te.cache 14 | /autoscan.log 15 | /c_example/cex 16 | /c_example/cex.exe 17 | /config/ 18 | /config.h 19 | /config.h.in 20 | /config.h.in~ 21 | /config.log 22 | /config.status 23 | /configure 24 | /configure.scan 25 | /include/lightgrep/encodings.h 26 | /liblightgrep.spec 27 | /libtool 28 | /m4/libtool.m4 29 | /m4/ltoptions.m4 30 | /m4/ltsugar.m4 31 | /m4/ltversion.m4 32 | /m4/lt~obsolete.m4 33 | /mingw-liblightgrep.spec 34 | /re_gen/parsecheck 35 | /re_gen/randpat 36 | /re_gen/shitgrep 37 | /src/lib/lightgrep.pc 38 | /src/lib/re_grammar.cpp 39 | /src/lib/re_grammar.output 40 | /src/enc/enc 41 | /src/enc/enc.exe 42 | /src/val/val 43 | /src/val/val.exe 44 | /src/what/what 45 | /src/what/what.exe 46 | /stamp-h1 47 | /test-suite.log 48 | /test/test 49 | /test/test.log 50 | /test/test.exe 51 | /test/test.trs 52 | /tmp/ 53 | /vendors/ 54 | _libs/ 55 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "vendors/scope"] 2 | path = vendors/scope 3 | url = https://github.com/jonstewart/scope.git 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: cpp 2 | compiler: 3 | - gcc 4 | - clang 5 | before_install: 6 | - git submodule update --init 7 | - yes | sudo add-apt-repository ppa:mapnik/boost 8 | - yes | sudo add-apt-repository ppa:ubuntu-toolchain-r/test 9 | - sudo apt-get update 10 | install: 11 | - sudo apt-get install libicu-dev boost1.49 libboost-chrono1.49-dev libboost-filesystem1.49-dev libboost-program-options1.49-dev libboost-system1.49-dev libboost-thread1.49-dev gcc-4.8 g++-4.8 12 | - sudo ln -s -f /usr/bin/gcc-4.8 /usr/bin/gcc 13 | - sudo ln -s -f /usr/bin/g++-4.8 /usr/bin/g++ 14 | script: $CXX --version && autoreconf 
-fi && ./configure && make -j2 check 15 | -------------------------------------------------------------------------------- /ChangeLog: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/ChangeLog -------------------------------------------------------------------------------- /Jenkinsfile: -------------------------------------------------------------------------------- 1 | @Library('asdf_common') _ 2 | 3 | def BUILDS = [ 4 | 'linux/64/shared', 5 | 'windows/64/shared', 6 | // 'windows/64/shared-fat', 7 | 'windows/64/static', 8 | 'windows/32/shared', 9 | // 'windows/32/shared-fat', 10 | 'windows/32/static' 11 | ] 12 | 13 | def BASE_URL = 'ssh://git@stash.strozfriedberg.com/asdf' 14 | def DOWNSTREAM_REPOS = [ 15 | ['hasher', 'master'], 16 | ['lightgrep', 'master'], 17 | ['lightgrep-java', 'master'], 18 | ['bulk_extractor', 'master'] 19 | ] 20 | def UPSTREAM_REPOS = [['icu', 'master']] 21 | 22 | pipeline { 23 | agent none 24 | stages { 25 | stage('Handle Upstream Trigger') { 26 | steps { 27 | script { 28 | common.HandleUpstreamTrigger(env, params, BASE_URL, UPSTREAM_REPOS) 29 | } 30 | } 31 | } 32 | stage('Build') { 33 | steps { 34 | script { 35 | parallel common.makeConfigurations(scm, BUILDS) 36 | } 37 | } 38 | } 39 | stage('Trigger Downstream') { 40 | steps { 41 | script { 42 | common.TriggerDownstream(env, DOWNSTREAM_REPOS) 43 | } 44 | } 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | liblightgrep 2 | ============ 3 | 4 | not the worst forensics regexp engine 5 | 6 | About 7 | ----- 8 | Lightgrep is a new regular expression engine, designed specifically for digital forensics. Why another regexp engine? 
9 | 10 | Lightgrep: 11 | * searches for many patterns simultaneously 12 | * searches binary data as a stream, not as discrete lines of text 13 | * searches for patterns in many different encodings; give it dirty data, lightgrep don't care 14 | * never, ever, ever, never, never looks at a byte twice or backs up in your input 15 | 16 | Lightgrep is still pretty new and doesn't have all the regexp features you might be used to. But it has enough features to be more than a toy, and what is supported is well-tested. 17 | 18 | liblightgrep is copyright (c) 2010-2015, Stroz Friedberg, LLC. liblightgrep is available under version 3 of the GNU General Public License. See [COPYING](COPYING) for details. 19 | 20 | Technical Info 21 | -------------- 22 | Lightgrep is implemented in portable C++11 but exposes a concise C API. The core of the API is defined in [include/lightgrep/api.h](./include/lightgrep/api.h). You can see a small example program at [c_example/main.c](./c_example/main.c). 23 | 24 | Lightgrep depends on a number of [Boost](http://www.boost.org/) libraries and also on [ICU](http://www.icu-project.org). Currently you will need gcc 4.6+ or clang 3.1+ to compile the libraries. 25 | 26 | Install 27 | ------- 28 | See the [BUILD.md file](BUILD.md) for installation instructions. 
29 | -------------------------------------------------------------------------------- /bootstrap.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | autoreconf -fi 4 | -------------------------------------------------------------------------------- /design/.gitignore: -------------------------------------------------------------------------------- 1 | *.aux 2 | *.log 3 | -------------------------------------------------------------------------------- /design/debruijn/10.1.1.13.2152.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/design/debruijn/10.1.1.13.2152.pdf -------------------------------------------------------------------------------- /design/debruijn/bubble3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/design/debruijn/bubble3.pdf -------------------------------------------------------------------------------- /design/debruijn/p752-sawada.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/design/debruijn/p752-sawada.pdf -------------------------------------------------------------------------------- /design/debruijn/sdarticle.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/design/debruijn/sdarticle.pdf -------------------------------------------------------------------------------- /design/debruijn/weston.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/design/debruijn/weston.pdf -------------------------------------------------------------------------------- /design/fragments.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/design/fragments.pdf -------------------------------------------------------------------------------- /include/automata.h: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see . 
// Holds a determinism flag and a shared TransitionFactory.
// The default constructor sets Deterministic to true and allocates a fresh
// TransitionFactory.
struct Properties {
  Properties(): Deterministic(true), TransFac(new TransitionFactory()) {}

  bool Deterministic;
  // NOTE(review): the template argument of std::shared_ptr appears to have
  // been lost from this text; the constructor initializes the member from
  // `new TransitionFactory()`.
  std::shared_ptr TransFac;
};

// Bundles a Transition pointer, a match flag and a numeric label.
// Presumably the per-vertex payload of the NFA graph -- confirm against
// graph.h.
struct Glushkov {
  // Value Label is initialized with; its definition is not in this header.
  static const uint32_t NOLABEL;

  // Starts with a null transition, not a match, and Label == NOLABEL.
  Glushkov(): Trans(0), IsMatch(false), Label(NOLABEL) {}

  // Declared here, defined elsewhere.
  std::string label() const;

  // Raw pointer; who owns the Transition is not visible from this header.
  Transition* Trans;
  bool IsMatch;
  uint32_t Label;
};

// Empty placeholder type.
struct Empty {};

// NOTE(review): Graph is presumably a class template whose argument list was
// lost from this text -- confirm against graph.h.
typedef Graph NFA;
typedef unsigned char byte;

// typedef unsigned short uint16_t;

// typedef unsigned int uint32_t;
// typedef int int32_t;

// typedef unsigned long long uint64_t;
// typedef long long int64_t;

/*
 * THROW_WITH_OUTPUT(exceptType, expression)
 *
 * Streams "<file>:<line>: " followed by `expression` (anything that can
 * follow `<<` on a std::ostream, including a chain such as `"x=" << x`)
 * into a string and throws exceptType constructed from that string.
 *
 * The body is wrapped in do { ... } while (0) so that the macro expands to
 * exactly one statement: it is safe in an unbraced if/else, can be used
 * more than once in the same scope, and does not leak a local into the
 * caller. The stream has a deliberately unusual name so that `expression`
 * can still refer to a caller variable called `buf`.
 *
 * Use it as a statement, with a trailing semicolon.
 */
#define THROW_WITH_OUTPUT(exceptType, expression) \
  do { \
    std::ostringstream lg_throw_buf_; \
    lg_throw_buf_ << __FILE__ << ":" << __LINE__ << ": " << expression; \
    throw exceptType(lg_throw_buf_.str()); \
  } while (0)

/* Same as THROW_WITH_OUTPUT, always throwing std::runtime_error. */
#define THROW_RUNTIME_ERROR_WITH_OUTPUT(expression) THROW_WITH_OUTPUT(std::runtime_error, expression)

/*
 * Throws std::runtime_error whose message is just `expression` streamed to
 * a string, without the file/line prefix.
 */
#define THROW_RUNTIME_ERROR_WITH_CLEAN_OUTPUT(expression) \
  do { \
    std::ostringstream lg_throw_buf_; \
    lg_throw_buf_ << expression; \
    throw std::runtime_error(lg_throw_buf_.str()); \
  } while (0)
// Builds an LG_Error from a message and optional context fields.
// Declared here, defined elsewhere; every argument after msg has a default.
LG_Error* makeError(
  const char* msg,
  const char* pattern = nullptr,
  const char* encodingChain = nullptr,
  const char* source = nullptr,
  int index = -1
);

// NOTE(review): the template parameter lists in this header appear to have
// been lost from this text (F and R are otherwise undeclared); restore them
// from the original file rather than guessing their order.

// Calls func() and returns its result. If func throws, the exception does
// not propagate: when err is non-null, *err receives an error built from
// e.what() (for std::exception) or from "Unspecified exception" (anything
// else), and fail is returned.
template
auto trapWithRetval(F&& func, decltype(func()) fail, LG_Error** err) -> decltype(func()) {
  try {
    return func();
  }
  catch (const std::exception& e) {
    if (err) {
      *err = makeError(e.what());
    }
  }
  catch (...) {
    if (err) {
      *err = makeError("Unspecified exception");
    }
  }

  // only reached when func threw
  return fail;
}

// As above, without error reporting: any exception yields fail.
template
auto trapWithRetval(F&& func, decltype(func()) fail) -> decltype(func()) {
  try {
    return func();
  }
  catch (...) {
    return fail;
  }
}

// Calls func() for its side effects. Returns succ if it completes and fail
// if it throws; on a throw, *err is filled in as described for
// trapWithRetval when err is non-null.
template
R trapWithVals(F&& func, R succ, R fail, LG_Error** err) {
  try {
    func();
    return succ;
  }
  catch (const std::exception& e) {
    if (err) {
      *err = makeError(e.what());
    }
  }
  catch (...) {
    if (err) {
      *err = makeError("Unspecified exception");
    }
  }

  // only reached when func threw
  return fail;
}

// As above, without error reporting: any exception yields fail.
template
R trapWithVals(F&& func, R succ, R fail) {
  try {
    func();
    return succ;
  }
  catch (...) {
    return fail;
  }
}
// Takes a chain string and returns a three-element tuple: a vector, a
// string, and another vector. Declared here, defined elsewhere.
// NOTE(review): the element types of both vectors appear to have been lost
// from this text -- restore them from the original header.
// Presumably the string is an encoding chain and the result its parsed
// pieces; confirm against the definition.
std::tuple<
  std::vector,
  std::string,
  std::vector
>
parseChain(const std::string& chain);
// Holder for a single static factory function; the class has no data
// members.
class Compiler {
public:

  // Takes an NFA by const reference and returns a ProgramPtr.
  // Declared here, defined elsewhere.
  static ProgramPtr createProgram(const NFA& graph);


};
// NOTE(review): both template heads below are truncated in this text (the
// parameter lists were partly lost); restore them from the original header
// before compiling.

// Streams a container as '[' + its elements separated by "," + ']'.
// Elements are written through ostream_join_iterator.
template > class C>
std::ostream& operator<<(std::ostream& out, const C& con) {
  out << '[';
  std::copy(con.begin(), con.end(), ostream_join_iterator(out, ","));
  return out << ']';
}

// Same output format as the overload above; judging by what remains of the
// template head, presumably for container templates that take an extra
// parameter before the allocator -- confirm against the original header.
template , typename A = std::allocator> class C>
std::ostream& operator<<(std::ostream& out, const C& con) {
  out << '[';
  std::copy(con.begin(), con.end(), ostream_join_iterator(out, ","));
  return out << ']';
}
// Decoder that wraps another Decoder (Trans) and post-processes the values
// it produces: values below 0x80 pass through, values of 0x80 and above are
// remapped to negative numbers (see next()).
// NOTE(review): the template arguments of std::unique_ptr and std::pair
// appear to have been lost from this text.
class ASCIIDecoder: public Decoder {
public:
  // Wraps a clone of trans.
  ASCIIDecoder(const Decoder& trans):
    Trans(trans.clone())
  {}

  // Takes ownership of trans.
  ASCIIDecoder(std::unique_ptr trans):
    Trans(std::move(trans))
  {}

  // Deep copy: the wrapped decoder is cloned, not shared.
  ASCIIDecoder(const ASCIIDecoder& other):
    Trans(other.Trans->clone())
  {}

  ASCIIDecoder(ASCIIDecoder&&) = default;

  // Deep-copy assignment. The clone is made before the old wrapped decoder
  // is released, so self-assignment is harmless.
  ASCIIDecoder& operator=(const ASCIIDecoder& other) {
    Trans = std::unique_ptr(other.Trans->clone());
    return *this;
  }

  ASCIIDecoder& operator=(ASCIIDecoder&&) = default;

  // Returns a heap-allocated copy; the caller receives a raw pointer.
  virtual ASCIIDecoder* clone() const {
    return new ASCIIDecoder(*this);
  }

  // "ASCII" followed by the wrapped decoder's name.
  virtual std::string name() const {
    std::ostringstream ss;
    ss << "ASCII" << Trans->name();
    return ss.str();
  }

  // Pulls the next item from the wrapped decoder. If its first member is
  // 0x80 or greater it is replaced by -value-1 (so 0x80 becomes -0x81);
  // smaller values, including any negative ones, are returned unchanged.
  // Presumably a negative first member marks an invalid value -- confirm
  // against the Decoder contract.
  virtual std::pair next() {
    std::pair n = Trans->next();
    if (n.first >= 0x80) {
      n.first = -n.first-1;
    }
    return n;
  }

  // Forwards the new input range to the wrapped decoder.
  virtual void reset(const byte* beg, const byte* end) {
    Trans->reset(beg, end);
  }

  virtual uint32_t maxByteLength() const {
    // ASCII is 1:1
    return Trans->maxByteLength();
  }

private:
  // The wrapped decoder; owned by this object.
  std::unique_ptr Trans;
};
// Decoder that yields the bytes of a [beg, end) range one at a time.
// It stores only the two pointers; it does not copy or own the bytes.
// NOTE(review): the template arguments of std::pair appear to have been
// lost from this text.
class ByteSource: public Decoder {
public:
  ByteSource(const byte* beg, const byte* end):
    Cur(beg), End(end)
  {}

  ByteSource(const ByteSource&) = default;

  ByteSource& operator=(const ByteSource&) = default;

  // Returns a heap-allocated copy pointing at the same range and position.
  virtual ByteSource* clone() const {
    return new ByteSource(*this);
  }

  // A byte source contributes nothing to the name.
  virtual std::string name() const {
    return "";
  }

  // At the end of the range, returns (END, End) and does not advance.
  // Otherwise returns the current byte's value together with a pointer to
  // that byte, then advances by one (Cur++ yields the pre-increment
  // pointer).
  virtual std::pair next() {
    if (Cur == End) {
      return std::make_pair(END, End);
    }
    else {
      const int32_t n = *Cur;
      return std::make_pair(n, Cur++);
    }
  }

  // Restarts iteration over a new range.
  virtual void reset(const byte* beg, const byte* end) {
    Cur = beg;
    End = end;
  }

  // Each call to next() consumes exactly one byte.
  virtual uint32_t maxByteLength() const {
    return 1;
  }

private:
  const byte* Cur;  // next byte to return
  const byte* End;  // one past the last byte
};
// Abstract interface for the decoders in include/decoders/.
// NOTE(review): the template arguments of std::pair appear to have been
// lost from this text; the ByteSource implementation returns an int32_t
// paired with a const byte*.
class Decoder {
public:
  virtual ~Decoder() {}

  // Returns a new copy of the concrete decoder as a raw pointer.
  virtual Decoder* clone() const = 0;

  // Name of the decoder.
  virtual std::string name() const = 0;

  // Produces the next decoded item.
  virtual std::pair next() = 0;

  // Points the decoder at a new input range [beg, end).
  virtual void reset(const byte* beg, const byte* end) = 0;

  // Presumably the largest number of input bytes one decoded item can
  // consume -- confirm against the implementations.
  virtual uint32_t maxByteLength() const = 0;

  // Sentinel that ByteSource::next() returns as the value once input is
  // exhausted; defined elsewhere.
  static const int32_t END;
};
// Hands out decoders by chain string and keeps a private Cache map.
// NOTE(review): the template arguments of std::shared_ptr and std::map
// appear to have been lost from this text -- restore them from the original
// header.
class DecoderFactory {
public:
  // Returns a shared pointer for the given chain string. Declared here,
  // defined elsewhere. Presumably the result is a Decoder looked up in, or
  // added to, Cache -- confirm against the definition.
  std::shared_ptr get(const std::string& chain);

private:
  std::map> Cache;
};
17 | */ 18 | 19 | #pragma once 20 | 21 | #include 22 | 23 | #include "decoders/decoder.h" 24 | #include "decoders/utf16.h" 25 | #include "decoders/utfdecoderbase.h" 26 |
// UTF-16 decoder built on UTFDecoderBase; LE selects the byte order that is
// reported by name().
// NOTE(review): the template parameter/argument lists (after `template`, on
// std::unique_ptr, on utf16_to_cp and on the two typedefs at the bottom)
// appear to have been lost in this copy; restore them from upstream.
template
class UTF16Decoder: public UTFDecoderBase {
public:
  // Builds on top of an existing decoder; the argument is handed straight to
  // the UTFDecoderBase constructor.
  UTF16Decoder(const Decoder& trans):
    UTFDecoderBase(trans)
  {}

  // Same, but the wrapped decoder is moved into the base class.
  UTF16Decoder(std::unique_ptr trans):
    UTFDecoderBase(std::move(trans))
  {}

  UTF16Decoder(const UTF16Decoder&) = default;

  UTF16Decoder(UTF16Decoder&&) = default;

  UTF16Decoder& operator=(const UTF16Decoder&) = default;

  UTF16Decoder& operator=(UTF16Decoder&&) = default;

  // Returns a new copy-constructed UTF16Decoder allocated with `new`.
  virtual UTF16Decoder* clone() const {
    return new UTF16Decoder(*this);
  }

  // Name is "UTF-16", then "LE" or "BE", then the name of the wrapped decoder
  // (Trans, which is not declared in this header).
  virtual std::string name() const {
    std::ostringstream ss;
    ss << "UTF-16" << (LE ? "LE" : "BE") << Trans->name();
    return ss.str();
  }

protected:
  // Decodes one code point from the bytes between beg and end by delegating to
  // utf16_to_cp; the result is written to cp and that function's return value
  // is passed through unchanged.
  virtual size_t decode(const byte* beg, const byte* end, int32_t& cp) {
    return utf16_to_cp(beg, end, cp);
  }
};

// Convenience names for the two byte orders (template arguments stripped in
// this copy, see the note on the class).
typedef UTF16Decoder UTF16LEDecoder;
typedef UTF16Decoder UTF16BEDecoder;
64 | -------------------------------------------------------------------------------- /include/decoders/utf32.h: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details.
14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see . 17 | */ 18 | 19 | #pragma once 20 | 21 | template 22 | size_t utf32_to_cp(const B buf, const B end, int32_t& cp) { 23 | if (end - buf < 4) { 24 | // invalid, too few bytes 25 | return 0; 26 | } 27 | 28 | cp = (buf[LE ? 0 : 3] ) | 29 | (buf[LE ? 1 : 2] << 8) | 30 | (buf[LE ? 2 : 1] << 16) | 31 | (buf[LE ? 3 : 0] << 24); 32 | 33 | if (cp < 0 || (0xD800 <= cp && cp < 0xE000) || 0x110000 <= cp) { 34 | // out of range 35 | cp = -1; 36 | return 0; 37 | } 38 | else { 39 | return 4; 40 | } 41 | } 42 | 43 | template 44 | size_t cp_to_utf32(int32_t cp, B& buf) { 45 | if (cp < 0) { 46 | // too small 47 | return 0; 48 | } 49 | else if (cp < 0xD800) { 50 | buf[LE ? 0 : 3] = cp & 0xFF; 51 | buf[LE ? 1 : 2] = (cp >> 8) & 0xFF; 52 | buf[LE ? 2 : 1] = (cp >> 16) & 0xFF; 53 | buf[LE ? 3 : 0] = (cp >> 24) & 0xFF; 54 | return 4; 55 | } 56 | else if (cp < 0xE000) { 57 | // UTF-16 surrogates, invalid 58 | return 0; 59 | } 60 | else if (cp < 0x110000) { 61 | buf[LE ? 0 : 3] = cp & 0xFF; 62 | buf[LE ? 1 : 2] = (cp >> 8) & 0xFF; 63 | buf[LE ? 2 : 1] = (cp >> 16) & 0xFF; 64 | buf[LE ? 3 : 0] = (cp >> 24) & 0xFF; 65 | return 4; 66 | } 67 | else { 68 | // too large 69 | return 0; 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /include/decoders/utf32decoder.h: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 
9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see . 17 | */ 18 | 19 | #pragma once 20 | 21 | #include 22 | 23 | #include "decoders/decoder.h" 24 | #include "decoders/utf32.h" 25 | #include "decoders/utfdecoderbase.h" 26 |
// UTF-32 decoder built on UTFDecoderBase; LE selects the byte order that is
// reported by name().
// NOTE(review): the template parameter/argument lists (after `template`, on
// std::unique_ptr, on utf32_to_cp and on the two typedefs at the bottom)
// appear to have been lost in this copy; restore them from upstream.
template
class UTF32Decoder: public UTFDecoderBase {
public:
  // Builds on top of an existing decoder; the argument is handed straight to
  // the UTFDecoderBase constructor.
  UTF32Decoder(const Decoder& trans):
    UTFDecoderBase(trans)
  {}

  // Same, but the wrapped decoder is moved into the base class.
  UTF32Decoder(std::unique_ptr trans):
    UTFDecoderBase(std::move(trans))
  {}

  UTF32Decoder(const UTF32Decoder&) = default;

  UTF32Decoder(UTF32Decoder&&) = default;

  UTF32Decoder& operator=(const UTF32Decoder&) = default;

  UTF32Decoder& operator=(UTF32Decoder&&) = default;

  // Returns a new copy-constructed UTF32Decoder allocated with `new`.
  virtual UTF32Decoder* clone() const {
    return new UTF32Decoder(*this);
  }

  // Name is "UTF-32", then "LE" or "BE", then the name of the wrapped decoder
  // (Trans, which is not declared in this header).
  virtual std::string name() const {
    std::ostringstream ss;
    ss << "UTF-32" << (LE ? "LE" : "BE") << Trans->name();
    return ss.str();
  }

protected:
  // Decodes one code point from the bytes between beg and end by delegating to
  // utf32_to_cp; the result is written to cp and that function's return value
  // is passed through unchanged.
  virtual size_t decode(const byte* beg, const byte* end, int32_t& cp) {
    return utf32_to_cp(beg, end, cp);
  }
};

// Convenience names for the two byte orders (template arguments stripped in
// this copy, see the note on the class).
typedef UTF32Decoder UTF32LEDecoder;
typedef UTF32Decoder UTF32BEDecoder;
64 | -------------------------------------------------------------------------------- /include/decoders/utf8decoder.h: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see .
17 | */ 18 | 19 | #pragma once 20 | 21 | #include 22 | 23 | #include "decoders/decoder.h" 24 | #include "decoders/utf8.h" 25 | #include "decoders/utfdecoderbase.h" 26 |
// UTF-8 decoder built on UTFDecoderBase.
// NOTE(review): the template argument of std::unique_ptr appears to have been
// lost in this copy of the header; restore it from upstream.
class UTF8Decoder: public UTFDecoderBase {
public:
  // Builds on top of an existing decoder; the argument is handed straight to
  // the UTFDecoderBase constructor.
  UTF8Decoder(const Decoder& trans):
    UTFDecoderBase(trans)
  {}

  // Same, but the wrapped decoder is moved into the base class.
  UTF8Decoder(std::unique_ptr trans):
    UTFDecoderBase(std::move(trans))
  {}

  UTF8Decoder(const UTF8Decoder&) = default;

  UTF8Decoder(UTF8Decoder&&) = default;

  UTF8Decoder& operator=(const UTF8Decoder&) = default;

  UTF8Decoder& operator=(UTF8Decoder&&) = default;

  // Returns a new copy-constructed UTF8Decoder allocated with `new`.
  virtual UTF8Decoder* clone() const {
    return new UTF8Decoder(*this);
  }

  // Name is "UTF-8" followed by the name of the wrapped decoder (Trans, which
  // is not declared in this header).
  virtual std::string name() const {
    std::ostringstream ss;
    ss << "UTF-8" << Trans->name();
    return ss.str();
  }

protected:
  // Decodes one code point from the bytes between beg and end by delegating to
  // utf8_to_cp; the result is written to cp and that function's return value
  // is passed through unchanged.
  virtual size_t decode(const byte* beg, const byte* end, int32_t& cp) {
    return utf8_to_cp(beg, end, cp);
  }
};
60 | -------------------------------------------------------------------------------- /include/encoders/ascii.h: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see .
17 | */ 18 | 19 | #pragma once 20 | 21 | #include "encoders/encoderbase.h" 22 |
// ASCII encoder. The three write() overloads are only declared here; their
// definitions are outside this header.
// NOTE(review): the template arguments of std::vector on the set-writing
// overload appear to have been lost in this copy; restore them from upstream.
class ASCII: public EncoderBase {
public:
  // Registers {0, 0x80} as the valid code points with EncoderBase.
  // NOTE(review): presumably a half-open range, i.e. 0x00..0x7F; confirm
  // against UnicodeSet.
  ASCII(): EncoderBase(UnicodeSet{{0, 0x80}}) {}

  // Returns a freshly default-constructed ASCII encoder allocated with `new`.
  virtual ASCII* clone() const { return new ASCII(); }

  virtual uint32_t maxByteLength() const { return 1; }

  virtual std::string name() const { return "ASCII"; }

  // NOTE(review): by the signatures, this overload encodes cp into buf and the
  // last overload reads a code point back out of buf, each returning a byte
  // count; confirm in the implementation file.
  virtual uint32_t write(int32_t cp, byte buf[]) const;

  virtual void write(const UnicodeSet& user, std::vector>& v) const;

  virtual uint32_t write(const byte buf[], int32_t& cp) const;
};
39 | -------------------------------------------------------------------------------- /include/encoders/caching_encoder.h: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see .
17 | */ 18 | 19 | #pragma once 20 | 21 | #include "encoders/decorating_encoder.h" 22 | 23 | #include 24 |
// Decorator that remembers the result of the set-writing write() overload per
// UnicodeSet, so repeated requests are answered from Cache instead of being
// forwarded to the wrapped encoder.
// NOTE(review): template argument lists (on std::map, std::unique_ptr,
// std::vector and std::forward) appear to have been lost in this copy;
// restore them from upstream.
class CachingEncoder: public DecoratingEncoder {
public:
  typedef std::map>> CacheType;

  // Wraps enc with an initially empty cache.
  CachingEncoder(std::unique_ptr enc):
    DecoratingEncoder(std::move(enc))
  {}

  // Wraps enc and takes over the contents of a pre-built cache.
  CachingEncoder(std::unique_ptr enc, CacheType&& cache):
    DecoratingEncoder(std::move(enc)),
    Cache(std::forward(cache))
  {}

  // As above, but the encoder is passed by reference to DecoratingEncoder.
  CachingEncoder(const Encoder& enc):
    DecoratingEncoder(enc)
  {}

  CachingEncoder(const Encoder& enc, CacheType&& cache):
    DecoratingEncoder(enc),
    Cache(std::forward(cache))
  {}

  CachingEncoder(const CachingEncoder&) = default;

  CachingEncoder& operator=(const CachingEncoder&) = default;

  CachingEncoder(CachingEncoder&&) = default;

  CachingEncoder& operator=(CachingEncoder&&) = default;

  // Returns a new copy-constructed CachingEncoder allocated with `new`.
  virtual CachingEncoder* clone() const {
    return new CachingEncoder(*this);
  }

  // On a cache hit, copies the stored result into vo. On a miss, lets the
  // wrapped encoder (BaseEnc) fill vo and then stores a copy under uset.
  // The method is const but updates Cache, which is why Cache is mutable;
  // no locking is done in this block.
  virtual void write(const UnicodeSet& uset, std::vector>& vo) const {
    auto i = Cache.find(uset);
    if (i != Cache.end()) {
      vo = i->second;
    }
    else {
      BaseEnc->write(uset, vo);
      Cache.insert(std::make_pair(uset, vo));
    }
  }

private:
  // mutable so that the const write() above can populate it.
  mutable CacheType Cache;
};
73 | -------------------------------------------------------------------------------- /include/encoders/charencoder.h: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version.
9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see . 17 | */ 18 | 19 | #pragma once 20 | 21 | #include 22 | 23 | #include "encoders/encoderbase.h" 24 |
// Base class for encoders that own another encoder (BaseEnc) and declare a
// per-code-point transform pair, charTransform()/charUntransform(), for
// subclasses to implement.
// NOTE(review): the template arguments of std::unique_ptr and std::forward
// appear to have been lost in this copy; restore them from upstream.
class CharEncoder: public EncoderBase {
public:
  // Takes ownership of enc and of the name string.
  CharEncoder(std::string&& name, std::unique_ptr enc):
    EncoderBase(),
    Name(std::forward(name)),
    BaseEnc(std::move(enc)) {}

  // Stores a clone of enc rather than the object passed in.
  CharEncoder(std::string&& name, const Encoder& enc):
    EncoderBase(),
    Name(std::forward(name)),
    BaseEnc(enc.clone()) {}

  // Copying clones the wrapped encoder so the two objects do not share it.
  // other.BaseEnc is dereferenced without a null check.
  CharEncoder(const CharEncoder& other):
    EncoderBase(other),
    Name(other.Name),
    BaseEnc(other.BaseEnc->clone()) {}

  // The clone is created before the old BaseEnc is replaced, so assigning an
  // object to itself is safe.
  CharEncoder& operator=(const CharEncoder& other) {
    EncoderBase::operator=(other);
    Name = other.Name;
    BaseEnc = std::unique_ptr(other.BaseEnc->clone());
    return *this;
  }

  CharEncoder(CharEncoder&&) = default;

  CharEncoder& operator=(CharEncoder&&) = default;

  // Defined outside this header.
  virtual std::string name() const;

  virtual uint32_t write(int32_t cp, byte buf[]) const;

  // Keeps the EncoderBase overloads of write() visible next to the ones
  // declared here.
  using EncoderBase::write;

  virtual uint32_t write(const byte buf[], int32_t& cp) const;

protected:
  // Code-point mapping and its inverse, supplied by subclasses.
  // NOTE(review): presumably applied by the write() overloads around calls to
  // BaseEnc; those definitions are not in this header.
  virtual int32_t charTransform(int32_t cp) const = 0;
  virtual int32_t charUntransform(int32_t cp) const = 0;

  std::string Name;
  std::unique_ptr BaseEnc;
};
68 | -------------------------------------------------------------------------------- /include/encoders/concrete_encoders.h: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox
Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see . 17 | */ 18 | 19 | #pragma once 20 | 21 | #include "encoders/ascii.h" 22 | #include "encoders/icuencoder.h" 23 | #include "encoders/utf8.h" 24 | #include "encoders/utf16.h" 25 | #include "encoders/utf32.h" 26 | 27 | -------------------------------------------------------------------------------- /include/encoders/encoder.h: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see . 
17 | */ 18 | 19 | #pragma once 20 | 21 | #include "basic.h" 22 | #include "byteset.h" 23 | #include "rangeset.h" 24 | 25 | #include 26 |
// Abstract interface for an encoder: every operation below is pure virtual
// and has to be supplied by a concrete subclass.
// NOTE(review): the template arguments of std::vector on the set-writing
// overload appear to have been lost in this copy; restore them from upstream.
class Encoder {
public:
  virtual ~Encoder() {}

  // Returns a pointer to a copy of this encoder.
  // NOTE(review): presumably heap-allocated and owned by the caller; confirm.
  virtual Encoder* clone() const = 0;

  // NOTE(review): by its name, the largest number of bytes one code point can
  // encode to; verify against an implementation.
  virtual uint32_t maxByteLength() const = 0;

  // Returns the encoder's name as a string.
  virtual std::string name() const = 0;

  // Returns the set of code points this encoder accepts.
  virtual const UnicodeSet& validCodePoints() const = 0;

  // NOTE(review): by the signatures, this overload encodes cp into buf and the
  // last overload reads a code point back out of buf, each returning a byte
  // count; confirm against an implementation.
  virtual uint32_t write(int32_t cp, byte buf[]) const = 0;

  // Writes the encoding of a whole set of code points into v.
  virtual void write(const UnicodeSet& uset, std::vector>& v) const = 0;

  virtual uint32_t write(const byte buf[], int32_t& cp) const = 0;
};
45 | -------------------------------------------------------------------------------- /include/encoders/encoderbase.h: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see .
17 | */ 18 | 19 | #pragma once 20 | 21 | #include "encoders/encoder.h" 22 |
// Common base for encoders: stores the set of valid code points and declares
// the set-writing write() overload plus the collectRanges() hook.
// Construction and assignment are protected, so only subclasses can create or
// copy instances.
// NOTE(review): the template arguments of std::vector and std::forward appear
// to have been lost in this copy; restore them from upstream.
class EncoderBase: public Encoder {
public:
  // Returns the stored set of valid code points.
  virtual const UnicodeSet& validCodePoints() const { return Valid; };

  // Defined outside this header.
  virtual void write(const UnicodeSet& uset, std::vector>& v) const;

  // Keeps the remaining Encoder::write overloads visible alongside the one
  // declared above.
  using Encoder::write;

protected:
  // Starts with a default-constructed Valid set.
  EncoderBase(): Valid() {}

  // Takes over the given set of valid code points.
  EncoderBase(UnicodeSet&& valid): Valid(std::forward(valid)) {}

  EncoderBase(const EncoderBase&) = default;

  EncoderBase(EncoderBase&&) = default;

  EncoderBase& operator=(const EncoderBase&) = default;

  EncoderBase& operator=(EncoderBase&&) = default;

  // Overridable hook, defined outside this header.
  // NOTE(review): presumably called by write(uset, v) to do the per-range
  // work; confirm in the implementation file.
  virtual void collectRanges(const UnicodeSet& user, std::vector>& v) const;

  UnicodeSet Valid;
};
48 | -------------------------------------------------------------------------------- /include/encoders/encoderfactory.h: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see .
17 | */ 18 | 19 | #pragma once 20 | 21 | #include 22 | #include 23 | #include 24 | 25 | #include "encoder.h" 26 |
// Hands out encoders looked up by an encoding-chain string.
// NOTE(review): the template arguments of std::shared_ptr and std::map appear
// to have been lost in this copy of the header; restore them from upstream.
class EncoderFactory {
public:
  // Defined outside this header.
  // NOTE(review): presumably pre-populates Cache; confirm.
  EncoderFactory();

  // Returns the encoder for the given chain string. The implementation is not
  // in this header.
  // NOTE(review): presumably consults Cache before building a new encoder.
  std::shared_ptr get(const std::string& chain);

private:
  // NOTE(review): looks like a chain-string -> encoder cache; the key and
  // value types are not visible here.
  std::map> Cache;
};
36 | -------------------------------------------------------------------------------- /include/encoders/icuencoder.h: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see .
17 | */ 18 | 19 | #pragma once 20 | 21 | #include "encoders/encoderbase.h" 22 | #include "icuconverter.h" 23 | 24 | #include 25 | #include 26 |
// Encoder backed by an ICUConverter member: maxByteLength() and name() are
// answered by the converter, while the write() overloads are defined outside
// this header.
class ICUEncoder: public EncoderBase {
public:
  // Defined outside this header.
  // NOTE(review): presumably opens the ICU converter called `name`; confirm.
  ICUEncoder(const std::string& name);

  ICUEncoder(const ICUEncoder&) = default;

  ICUEncoder& operator=(const ICUEncoder&) = default;

  ICUEncoder(ICUEncoder&&) = default;

  ICUEncoder& operator=(ICUEncoder&&) = default;

  // Returns a new copy-constructed ICUEncoder allocated with `new`.
  virtual ICUEncoder* clone() const { return new ICUEncoder(*this); }

  // Both of these simply forward to the converter.
  virtual uint32_t maxByteLength() const { return Conv.maxByteLength(); }

  virtual std::string name() const { return Conv.name(); }

  virtual uint32_t write(int32_t cp, byte buf[]) const;

  // Keeps the EncoderBase overloads of write() visible next to the ones
  // declared here.
  using EncoderBase::write;

  virtual uint32_t write(const byte buf[], int32_t& cp) const;

private:
  ICUConverter Conv;
};
54 | -------------------------------------------------------------------------------- /include/encoders/oceencoder.h: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see .
17 | */ 18 | 19 | #pragma once 20 | 21 | #include 22 | 23 | #include "encoders/byteencoder.h" 24 |
// Byte-level encoder registered under the name "OCE"; it wraps another
// encoder through ByteEncoder and declares the byte transform pair whose
// definitions live outside this header.
// NOTE(review): the template argument of std::unique_ptr appears to have been
// lost in this copy; restore it from upstream.
class OCEEncoder: public ByteEncoder {
public:
  // Takes ownership of the wrapped encoder.
  OCEEncoder(std::unique_ptr enc):
    ByteEncoder("OCE", std::move(enc)) {}

  // Passes the encoder by reference to ByteEncoder.
  OCEEncoder(const Encoder& enc):
    ByteEncoder("OCE", enc) {}

  OCEEncoder(const OCEEncoder&) = default;

  OCEEncoder& operator=(const OCEEncoder&) = default;

  OCEEncoder(OCEEncoder&&) = default;

  OCEEncoder& operator=(OCEEncoder&&) = default;

  // Returns a new copy-constructed OCEEncoder allocated with `new`.
  virtual OCEEncoder* clone() const {
    return new OCEEncoder(*this);
  }

  // OCE: bytes -> bytes
  // Both tables are declared here and defined elsewhere.
  // NOTE(review): presumably 256-entry lookup tables, unOCE being the inverse
  // of OCE; confirm in the implementation file.
  static const byte OCE[], unOCE[];

protected:
  // Transform and inverse transform over the first blen bytes of buf; defined
  // outside this header.
  virtual void byteTransform(byte buf[], uint32_t blen) const;
  virtual void byteUntransform(byte buf[], uint32_t blen) const;
};
52 | -------------------------------------------------------------------------------- /include/encoders/utf8.h: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see .
17 | */ 18 | 19 | #pragma once 20 | 21 | #include "encoders/caching_encoder.h" 22 | #include "encoders/utfbase.h" 23 |
// UTF-8 encoder. Reports a maximum of 4 bytes per code point; the write()
// overloads and the two range helpers are defined outside this header.
// NOTE(review): the template arguments of std::vector appear to have been
// lost in this copy; restore them from upstream.
class UTF8: public UTFBase {
public:
  // Returns a freshly default-constructed UTF8 encoder allocated with `new`.
  virtual UTF8* clone() const { return new UTF8(); }

  virtual uint32_t maxByteLength() const { return 4; }

  virtual std::string name() const { return "UTF-8"; }

  virtual uint32_t write(int32_t cp, byte buf[]) const;

  // Keeps the UTFBase overloads of write() visible next to the ones declared
  // here.
  using UTFBase::write;

  virtual uint32_t write(const byte buf[], int32_t& cp) const;

protected:
  virtual void collectRanges(const UnicodeSet& user, std::vector>& v) const;

  virtual void writeRangeBlock(std::vector& v, uint32_t& l, uint32_t h, uint32_t len, uint32_t blimit) const;
};

// A CachingEncoder around UTF8 whose cache starts out with one precomputed
// entry, so the set below never has to be encoded at run time.
class CachingUTF8: public CachingEncoder {
public:
  CachingUTF8(): CachingEncoder(
    UTF8(),
    {
      // \p{Any}, .
      // Key: the code point set {0, 0xD800}, {0xE000, 0x110000}.
      // Value: one row per byte-sequence pattern; each element of a row is
      // either a single byte value or a list of byte ranges.
      {
        {{0, 0xD800}, {0xE000, 0x110000}},
        {
          { {{0x00, 0x80}} },
          { {{0xC2, 0xE0}}, {{0x80, 0xC0}} },
          { 0xE0, {{0xA0, 0xC0}}, {{0x80, 0xC0}} },
          { 0xED, {{0x80, 0xA0}}, {{0x80, 0xC0}} },
          { {{0xE1,0xED}, {0xEE,0xF0}}, {{0x80, 0xC0}}, {{0x80, 0xC0}} },
          { 0xF0, {{0x90, 0xC0}}, {{0x80, 0xC0}}, {{0x80, 0xC0}} },
          { 0xF4, {{0x80, 0x90}}, {{0x80, 0xC0}}, {{0x80, 0xC0}} },
          { {{0xF1, 0xF4}}, {{0x80, 0xC0}}, {{0x80, 0xC0}}, {{0x80, 0xC0}} }
        }
      }
    }
  ) {}
};
66 | -------------------------------------------------------------------------------- /include/encoders/utfbase.h: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version.
9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see . 17 | */ 18 | 19 | #pragma once 20 | 21 | #include "encoders/encoderbase.h" 22 |
// Shared base for the UTF encoders: fixes the valid code point set and
// declares the range-walking helpers that concrete UTF encoders build on.
// NOTE(review): the template arguments of std::vector appear to have been
// lost in this copy; restore them from upstream.
class UTFBase: public EncoderBase {
public:
  // Re-exports every EncoderBase::write overload.
  using EncoderBase::write;

protected:
  // Valid code points are {0, 0xD800} and {0xE000, 0x110000}, i.e. the set
  // skips 0xD800..0xDFFF.
  // NOTE(review): presumably half-open ranges; confirm against UnicodeSet.
  UTFBase(): EncoderBase(UnicodeSet{{0, 0xD800}, {0xE000, 0x110000}}) {}

  // Must be provided by each concrete UTF encoder.
  virtual void writeRangeBlock(std::vector& v, uint32_t& l, uint32_t h, uint32_t len, uint32_t blimit) const = 0;

  // Defined outside this header.
  // NOTE(review): i/iend look like a cursor and end over the set's ranges and
  // l/h like the bounds of the range being processed; confirm.
  virtual void writeRange(std::vector>& va, UnicodeSet::const_iterator& i, const UnicodeSet::const_iterator& iend, uint32_t& l, uint32_t& h, byte* cur, uint32_t len, uint32_t blimit) const;

  // Defined outside this header.
  virtual void skipRange(UnicodeSet::const_iterator& i, const UnicodeSet::const_iterator& iend, uint32_t& l, uint32_t& h, uint32_t ubound) const;
};
36 | -------------------------------------------------------------------------------- /include/encoders/xorencoder.h: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details.
14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see . 17 | */ 18 | 19 | #pragma once 20 | 21 | #include 22 | #include 23 | 24 | #include 25 | 26 | #include "encoders/byteencoder.h" 27 |
// Byte-level encoder that XORs every byte with a fixed one-byte key. It wraps
// another encoder through ByteEncoder and passes "XOR" followed by the key's
// numeric value to ByteEncoder as the name.
// NOTE(review): the template arguments of std::unique_ptr and
// boost::lexical_cast appear to have been lost in this copy; restore them
// from upstream.
class XOREncoder: public ByteEncoder {
public:
  // The key is widened to uint32_t before conversion so that it is rendered
  // as a number rather than as a character.
  XOREncoder(byte key, std::unique_ptr enc):
    ByteEncoder("XOR" + boost::lexical_cast((uint32_t) key), std::move(enc)),
    Key(key) {}

  XOREncoder(byte key, const Encoder& enc):
    ByteEncoder("XOR" + boost::lexical_cast((uint32_t) key), enc),
    Key(key) {}

  XOREncoder(const XOREncoder&) = default;

  XOREncoder& operator=(const XOREncoder&) = default;

  XOREncoder(XOREncoder&&) = default;

  XOREncoder& operator=(XOREncoder&&) = default;

  // Returns a new copy-constructed XOREncoder allocated with `new`.
  virtual XOREncoder* clone() const {
    return new XOREncoder(*this);
  }

protected:
  // XORs the first blen bytes of buf with Key, in place. Key is copied into a
  // local so the lambda captures a plain value instead of `this`.
  virtual void byteTransform(byte buf[], uint32_t blen) const {
    const byte key = Key;
    std::transform(buf, buf+blen, buf, [key](byte b){ return b ^ key; });
  }

  // XOR with the same key undoes itself, so the inverse is the transform.
  virtual void byteUntransform(byte buf[], uint32_t blen) const {
    byteTransform(buf, blen);
  }

private:
  byte Key;
};
63 | -------------------------------------------------------------------------------- /include/fragment.h: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version.
9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see . 17 | */ 18 | 19 | #pragma once 20 | 21 | #include "basic.h" 22 | #include "automata.h" 23 | #include "parsenode.h" 24 | 25 | #include 26 |
// NOTE(review): the template arguments of std::vector and
// std::numeric_limits on the next three declarations appear to have been lost
// in this copy; restore them from upstream. From their use below, InListT
// holds NFA vertex descriptors and OutListT holds (vertex, number) pairs.
typedef std::vector InListT;
typedef std::vector> OutListT;

// Value of Fragment::Skippable that the constructors and reset functions
// below store by default.
// NOTE(review): presumably means "nothing in this fragment can be skipped".
static const uint32_t NOSKIP = std::numeric_limits::max();

// A piece of an NFA under construction, together with the parse node it was
// built from.
struct Fragment {
  // Empty fragment; N is default-constructed.
  Fragment(): Skippable(NOSKIP) {}

  // Fragment whose only incoming vertex is `in`; OutList starts empty.
  Fragment(NFA::VertexDescriptor in, const ParseNode& n):
    InList(1, in), N(n), Skippable(NOSKIP) {}

  /*
   * InList is the list of vertices in this fragment which have incoming
   * edges from outside the fragment. OutList is the list of vertices
   * in this fragment which have edges leaving the fragment.
   */
  InListT InList;
  OutListT OutList;
  ParseNode N;

  uint32_t Skippable;

  // Re-initialises this fragment as a single vertex that is both its entry
  // and its exit: InList becomes {in} and OutList becomes {(in, 0)}.
  void initFull(NFA::VertexDescriptor in, const ParseNode& n) {
    N = n;
    Skippable = NOSKIP;
    InList.clear();
    InList.push_back(in);
    OutList.clear();
    OutList.emplace_back(in, 0);
  }

  // Re-initialises this fragment for node n with empty in and out lists.
  void reset(const ParseNode& n) {
    N = n;
    Skippable = NOSKIP;
    InList.clear();
    OutList.clear();
  }

  // Takes over f's contents. The two lists are swapped rather than copied, so
  // afterwards f holds this fragment's previous lists; N and Skippable are
  // copied.
  void assign(Fragment& f) {
    InList.swap(f.InList);
    OutList.swap(f.OutList);
    N = f.N;
    Skippable = f.Skippable;
  }
};
72 | -------------------------------------------------------------------------------- /include/fsmthingy.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "basic.h" 4 | #include "nfabuilder.h" 5 | #include "nfaoptimizer.h" 6 | #include "encoders/encoderfactory.h" 7 | 8 | #include 9 |
// Bundles the objects used to turn parsed patterns into an NFA: an encoder
// factory, an NFA builder, an NFA optimizer and the NFA pointer itself. All
// members are public; the member functions are defined outside this header.
class FSMThingy {
public:
  // NOTE(review): sizeHint presumably pre-sizes the NFA; confirm in the
  // implementation file.
  FSMThingy(uint32_t sizeHint);

  EncoderFactory EncFac;
  NFABuilder Nfab;
  NFAOptimizer Comp;
  NFAPtr Fsm;

  // Adds one parsed pattern under the given encoding-chain string and label.
  void addPattern(const ParseTree& tree, const char* chain, uint32_t label);

  // NOTE(review): presumably runs the optimizer over Fsm, determinizing it
  // when the flag is set; confirm.
  void finalizeGraph(bool determinize);
};
23 | -------------------------------------------------------------------------------- /include/fwd_pointers.h: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version.
9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see . 17 | */ 18 | 19 | #pragma once 20 | 21 | #include "basic.h" 22 | 23 | template class EdgeDescriptorStorage> 27 | class Graph; 28 | 29 | struct Properties; 30 | struct Glushkov; 31 | struct Empty; 32 | template class VectorFamily; 33 | 34 | typedef Graph NFA; 35 | 36 | typedef std::shared_ptr NFAPtr; 37 | 38 | class Program; 39 | 40 | typedef std::shared_ptr ProgramPtr; 41 | -------------------------------------------------------------------------------- /include/handles.h: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see . 
17 | */ 18 | 19 | #pragma once 20 | 21 | #include 22 | #include 23 | 24 | #include "lightgrep/api.h" 25 | #include "lightgrep/util.h" 26 | 27 | #include "basic.h" 28 | #include "fsmthingy.h" 29 | #include "fwd_pointers.h" 30 | #include "parsetree.h" 31 | #include "vm_interface.h" 32 | #include "pattern.h" 33 | #include "decoders/decoderfactory.h" 34 | 35 | struct PatternHandle { 36 | Pattern Pat; 37 | ParseTree Tree; 38 | }; 39 | /* Owns the heap-allocated Pattern and EncodingChain strings referenced from Patterns; the destructor releases them with delete[]. */ 40 | struct PatternMapHandle { 41 | std::vector Patterns; 42 | 43 | PatternMapHandle(unsigned int sizeHint): Patterns() { Patterns.reserve(sizeHint); } 44 | /* Non-copyable: a copy would share the raw string pointers and both destructors would delete[] them. */ PatternMapHandle(const PatternMapHandle&) = delete; PatternMapHandle& operator=(const PatternMapHandle&) = delete; 45 | ~PatternMapHandle() { 46 | for (LG_PatternInfo& pi : Patterns) { 47 | delete[] pi.Pattern; 48 | delete[] pi.EncodingChain; 49 | } 50 | } 51 | /* Appends copies of pattern and chain (both must be non-null, NUL-terminated strings; strlen is called on them unchecked). The unique_ptrs hold the copies until push_back has succeeded, so nothing leaks if an allocation throws; the third LG_PatternInfo field is set to nullptr. */ 52 | void addPattern(const char* pattern, const char* chain) { 53 | std::unique_ptr patcopy(new char[std::strlen(pattern)+1]); 54 | std::strcpy(patcopy.get(), pattern); 55 | 56 | std::unique_ptr chcopy(new char[std::strlen(chain)+1]); 57 | std::strcpy(chcopy.get(), chain); 58 | 59 | Patterns.push_back({patcopy.get(), chcopy.get(), nullptr}); 60 | patcopy.release(); 61 | chcopy.release(); 62 | } 63 | }; 64 | 65 | struct FSMHandle { 66 | std::unique_ptr Impl; 67 | }; 68 | 69 | struct ProgramHandle { 70 | ProgramPtr Impl; 71 | }; 72 | 73 | struct ContextHandle { 74 | std::shared_ptr Impl; 75 | }; 76 | 77 | struct DecoderHandle { 78 | DecoderFactory Factory; 79 | }; 80 | -------------------------------------------------------------------------------- /include/icuconverter.h: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version.
9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see . 17 | */ 18 | 19 | #pragma once 20 | 21 | #include "basic.h" 22 | #include "rangeset.h" 23 | 24 | #include 25 | #include 26 | 27 | #include 28 | 29 | class ICUConverter { 30 | public: 31 | ICUConverter(const char* encname); 32 | 33 | ICUConverter(const std::string& encname); 34 | 35 | ICUConverter(const ICUConverter& other); 36 | 37 | ICUConverter& operator=(const ICUConverter& other); 38 | 39 | ICUConverter(ICUConverter&&) = default; 40 | 41 | ICUConverter& operator=(ICUConverter&&) = default; 42 | 43 | size_t maxByteLength() const { return max_bytes; } 44 | 45 | std::string name() const { return Name; } 46 | 47 | UnicodeSet validCodePoints() const; 48 | 49 | size_t bytes_to_cp(const byte* beg, const byte* end, int32_t& cp) const; 50 | 51 | size_t cp_to_bytes(int32_t cp, byte buf[]) const; 52 | 53 | private: 54 | void init(); 55 | 56 | std::string Name; 57 | 58 | size_t max_bytes; 59 | 60 | std::unique_ptr bytes_conv, cp_conv; 61 | std::unique_ptr bytes_pivot, cp_pivot; 62 | }; 63 | -------------------------------------------------------------------------------- /include/icuutil.h: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 
9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see . 17 | */ 18 | 19 | #pragma once 20 | 21 | #include "basic.h" 22 | #include "rangeset.h" 23 | 24 | struct USet; 25 | 26 | void convUnicodeSet(UnicodeSet& dst, const USet* src); 27 | 28 | void convUnicodeSet(USet* dst, const UnicodeSet& src); 29 | -------------------------------------------------------------------------------- /include/lightgrep/search_hit.h: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see . 
17 | */ 18 | 19 | #ifndef LIGHTGREP_C_SEARCH_HIT_H_ 20 | #define LIGHTGREP_C_SEARCH_HIT_H_ 21 | 22 | #include 23 | 24 | #ifdef __cplusplus 25 | extern "C" { 26 | #endif 27 | 28 | // Basic Search Hit struct 29 | 30 | typedef struct { 31 | uint64_t Start, // starting offset of the hit 32 | End; // one past the hit, i.e., End - Start = Length 33 | uint32_t KeywordIndex; // index of keyword that hit 34 | } LG_SearchHit; 35 | 36 | // function you specify to handle the search hit, e.g., 37 | // void gotASearchHit(void* userData, const LG_SearchHit* const hit) { 38 | // print("hit at %d, ending %d, on keyword %d", 39 | // hit->Start, hit->End, hit->KeywordIndex); 40 | // } 41 | typedef void (*LG_HITCALLBACK_FN)(void* userData, const LG_SearchHit* const hit); 42 | 43 | 44 | #ifdef __cplusplus 45 | } 46 | #endif 47 | 48 | #endif /* LIGHTGREP_C_SEARCH_HIT_H_ */ 49 | -------------------------------------------------------------------------------- /include/lightgrep/transforms.h: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see .
17 | */ 18 | 19 | #ifndef LIGHTGREP_C_TRANSFORMS_H_ 20 | #define LIGHTGREP_C_TRANSFORMS_H_ 21 | 22 | #include "util.h" 23 | 24 | #ifdef __cplusplus 25 | extern "C" { 26 | #endif 27 | 28 | static const LG_TRANS LG_CHAR_TRANSFORMS[] = { 29 | { "identity", 0 } 30 | }; 31 | 32 | static const char* const LG_CANONICAL_CHAR_TRANSFORMS[] = { 33 | "identity" // 0 34 | }; 35 | 36 | // identity 37 | static const int LG_CHAR_TRANSFORM_IDENTITY = 0; 38 | 39 | static const LG_TRANS LG_BYTE_TRANSFORMS[] = { 40 | { "identity", 0 }, 41 | { "OCE", 1 } 42 | }; 43 | 44 | static const char* const LG_CANONICAL_BYTE_TRANSFORMS[] = { 45 | "identity", // 0 46 | "OCE", // 1 47 | }; 48 | 49 | // identity 50 | static const int LG_BYTE_TRANSFORM_IDENTITY = 0; 51 | 52 | // Outlook Compressible Encryption 53 | static const int LG_BYTE_TRANSFORM_OUTLOOK = 1; 54 | 55 | #ifdef __cplusplus 56 | } 57 | #endif 58 | 59 | #endif /* LIGHTGREP_C_TRANSFORMS_H_ */ 60 | -------------------------------------------------------------------------------- /include/matchgen.h: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see . 
17 | */ 18 | 19 | #pragma once 20 | 21 | #include "basic.h" 22 | #include "fwd_pointers.h" 23 | 24 | #include 25 | 26 | void matchgen(const NFA& g, std::set& matches, uint32_t maxMatches = std::numeric_limits::max(), uint32_t maxLoops = 1); 27 | 28 | -------------------------------------------------------------------------------- /include/nfaoptimizer.h: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see . 
17 | */ 18 | 19 | #pragma once 20 | 21 | #include "basic.h" 22 | #include "automata.h" 23 | 24 | #include 25 | #include 26 | #include 27 | 28 | class NFAOptimizer { 29 | public: 30 | typedef std::pair StatePair; 31 | typedef std::pair EdgePair; 32 | 33 | void mergeIntoFSM(NFA& dst, const NFA& src); 34 | 35 | void labelGuardStates(NFA& g); 36 | 37 | void propagateMatchLabels(NFA& g); 38 | void removeNonMinimalLabels(NFA& g); 39 | 40 | void subsetDFA(NFA& dst, const NFA& src); 41 | 42 | void pruneBranches(NFA& g); 43 | 44 | StatePair processChild(const NFA& src, NFA& dst, uint32_t si, NFA::VertexDescriptor srcHead, NFA::VertexDescriptor dstHead); 45 | 46 | bool canMerge(const NFA& dst, NFA::VertexDescriptor dstTail, const Transition* dstTrans, ByteSet& dstBits, const NFA& src, NFA::VertexDescriptor srcTail, const ByteSet& srcBits) const; 47 | 48 | private: 49 | std::map> Dst2Src; 50 | std::vector Src2Dst; 51 | std::stack Edges; 52 | std::set Visited; 53 | std::map DstPos; 54 | }; 55 | -------------------------------------------------------------------------------- /include/ostream_join_iterator.h: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see . 
17 | */ 18 | 19 | #pragma once 20 | 21 | #include 22 | #include 23 | 24 | template> 26 | class ostream_join_iterator : 27 | public std::iterator 28 | { 29 | public: 30 | typedef CharT char_type; 31 | typedef Traits traits_type; 32 | typedef std::basic_ostream ostream_type; 33 | 34 | ostream_join_iterator(ostream_type& s, const char_type* c = 0) : 35 | stream(&s), string(c), print_string(0) {} 36 | 37 | ostream_join_iterator& operator=(const T& val) { 38 | if (print_string) *stream << print_string; 39 | print_string = string; 40 | *stream << val; 41 | return *this; 42 | } 43 | 44 | ostream_join_iterator& operator*() { return *this; } 45 | ostream_join_iterator& operator++() { return *this; } 46 | ostream_join_iterator& operator++(int) { return *this; } 47 | 48 | private: 49 | ostream_type* stream; 50 | const char_type* string; 51 | const char_type* print_string; 52 | }; 53 | -------------------------------------------------------------------------------- /include/pair_out.h: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see . 
17 | */ 18 | 19 | #pragma once 20 | 21 | #include 22 | #include 23 | 24 | template 25 | std::ostream& operator<<(std::ostream& o, const std::pair& p) { 26 | return o << '(' << p.first << ',' << p.second << ')'; 27 | } 28 | -------------------------------------------------------------------------------- /include/parser.h: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see . 
17 | */ 18 | 19 | #pragma once 20 | 21 | #include "parsetree.h" 22 | #include "pattern.h" 23 | 24 | bool parse(const Pattern& pattern, ParseTree& tree); 25 | 26 | void parseAndReduce(const Pattern& pattern, ParseTree& tree); 27 | 28 | void reduce(const std::string& text, ParseTree& tree); 29 | -------------------------------------------------------------------------------- /include/pattern.h: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see . 
17 | */ 18 | 19 | #pragma once 20 | 21 | #include "basic.h" 22 | 23 | class Pattern { 24 | public: 25 | std::string Expression; 26 | bool FixedString, 27 | CaseInsensitive; 28 | std::string Encoding; 29 | 30 | Pattern(const char* expr, 31 | bool fixed = false, 32 | bool insensitive = false, 33 | const std::string& enc = "US-ASCII"): 34 | Expression(expr), 35 | FixedString(fixed), 36 | CaseInsensitive(insensitive), 37 | Encoding(enc) {} 38 | 39 | Pattern(const std::string& expr = "", 40 | bool fixed = false, 41 | bool insensitive = false, 42 | const std::string& enc = "US-ASCII"): 43 | Expression(expr), 44 | FixedString(fixed), 45 | CaseInsensitive(insensitive), 46 | Encoding(enc) {} 47 | 48 | Pattern(const Pattern&) = default; 49 | 50 | Pattern(Pattern&&) = default; 51 | 52 | Pattern& operator=(const Pattern&) = default; 53 | 54 | Pattern& operator=(Pattern&&) = default; 55 | 56 | bool operator==(const Pattern& p) const { 57 | return FixedString == p.FixedString && 58 | CaseInsensitive == p.CaseInsensitive && 59 | Expression == p.Expression && 60 | Encoding == p.Encoding; 61 | } 62 | 63 | virtual ~Pattern() {} 64 | }; 65 | 66 | std::ostream& operator<<(std::ostream&, const Pattern&); 67 | -------------------------------------------------------------------------------- /include/rewriter.h: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see . 17 | */ 18 | 19 | #pragma once 20 | 21 | #include "parsenode.h" 22 | 23 | #include 24 | 25 | bool hasZeroLengthMatch(const ParseNode* root); 26 | 27 | bool reduceEmptySubtrees(ParseNode* root); 28 | bool reduceUselessRepetitions(ParseNode* root); 29 | bool reduceTrailingNongreedyThenEmpty(ParseNode* root); 30 | bool reduceTrailingNongreedyThenGreedy(ParseNode* root); 31 | 32 | bool combineConsecutiveRepetitions(ParseNode* root); 33 | bool makeBinopsRightAssociative(ParseNode* root); 34 | 35 | 36 | void spliceOutParent(ParseNode* gp, const ParseNode* p, ParseNode* c); 37 | -------------------------------------------------------------------------------- /include/searchhit.h: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see . 
17 | */ 18 | 19 | #pragma once 20 | 21 | #include "lightgrep/search_hit.h" 22 | #include "basic.h" 23 | /* C++ view of the C struct LG_SearchHit: adds constructors, length(), and equality/ordering operators. */ 24 | class SearchHit: public LG_SearchHit { 25 | public: 26 | SearchHit() {} 27 | /* the default constructor above does not initialize Start, End, or KeywordIndex */ 28 | // note that this takes the end offset (stored as End), not the length 29 | SearchHit(uint64_t start, uint64_t end, uint32_t lbl) { 30 | Start = start; 31 | End = end; 32 | KeywordIndex = lbl; 33 | } 34 | 35 | uint64_t length() const { 36 | return End - Start; 37 | } 38 | 39 | bool operator==(const SearchHit& x) const { 40 | return x.Start == Start && x.End == End && x.KeywordIndex == KeywordIndex; 41 | } 42 | /* orders hits by Start, then End, then KeywordIndex */ 43 | bool operator<(const SearchHit& x) const { 44 | return Start < x.Start || 45 | (Start == x.Start && 46 | (End < x.End || 47 | (End == x.End && KeywordIndex < x.KeywordIndex))); 48 | } 49 | }; 50 | /* writes a hit as (Start, End, KeywordIndex) */ 51 | template 52 | OutStream& operator<<(OutStream& out, const SearchHit& hit) { 53 | out << '(' << hit.Start << ", " << hit.End << ", " << hit.KeywordIndex << ')'; 54 | return out; 55 | } 56 | 57 | typedef LG_HITCALLBACK_FN HitCallback; 58 | -------------------------------------------------------------------------------- /include/sequences.h: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see .
17 | */ 18 | 19 | #pragma once 20 | 21 | #include 22 | #include 23 | #include 24 | 25 | template 26 | struct Duplicate { 27 | bool operator()(const T& a) { 28 | return !seen.insert(a).second; 29 | } 30 | 31 | std::set seen; 32 | }; 33 | 34 | template > class C> 36 | void removeRightDuplicates(C& list) { 37 | using namespace std::placeholders; 38 | 39 | Duplicate dup; 40 | list.erase( 41 | std::remove_if( 42 | list.begin(), list.end(), 43 | std::bind(&Duplicate::operator(), std::ref(dup), _1) 44 | ), 45 | list.end() 46 | ); 47 | } 48 | -------------------------------------------------------------------------------- /include/sparseset.h: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see . 
17 | */ 18 | 19 | #pragma once 20 | 21 | #include "basic.h" 22 | 23 | #include 24 | 25 | // for the time-being, we're only going to support uint32_t, but could obviously template 26 | // also, this doesn't work so well (er, at all) with values > 2^31-1 27 | // low part of the array (indices below Max) is the sparse index keyed by element; high part (Max up to End) is the dense list of members 28 | class SparseSet { 29 | public: 30 | SparseSet(uint32_t maxSize = 0) { resize(maxSize); } 31 | 32 | uint32_t size() const { return End - Max; } 33 | 34 | // e had damn well better be less than Max, because we don't check 35 | bool find(uint32_t e) const { 36 | const uint32_t i = Data[e] + Max; 37 | return i < End && Data[i] == e; 38 | } 39 | 40 | // e had damn well better be less than Max, because we don't check; there is no duplicate check either, so a repeated e is appended again and size() grows 41 | void insert(uint32_t e) { 42 | Data[End] = e; 43 | Data[e] = End - Max; 44 | ++End; 45 | } 46 | 47 | void clear() { 48 | End = Max; 49 | } 50 | /* reallocates storage for 2 * maxSize entries and leaves the set empty; previous contents are discarded */ 51 | void resize(uint32_t maxSize) { 52 | Data.reset(new uint32_t[2 * maxSize]); 53 | End = Max = maxSize; 54 | // we don't have to do this, but it'll make things like valgrind happy 55 | std::fill(Data.get(), Data.get() + maxSize, 0); 56 | } 57 | 58 | size_t max_size() const { return std::numeric_limits::max()/2+1; } 59 | 60 | private: 61 | std::unique_ptr Data; 62 | uint32_t End, 63 | Max; 64 | }; 65 | -------------------------------------------------------------------------------- /include/transition.h: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version.
9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see . 17 | */ 18 | 19 | #pragma once 20 | 21 | #include "basic.h" 22 | #include "byteset.h" 23 | 24 | struct Instruction; 25 | /* Abstract, non-copyable base class; concrete transitions implement the pure virtual members below and are duplicated through clone() rather than by copying. */ 26 | class Transition { 27 | public: 28 | Transition() {} 29 | virtual ~Transition() {} 30 | 31 | virtual const byte* allowed(const byte* beg, const byte* end) const = 0; 32 | /* resets bs, then ORs this transition's bytes into it via orBytes(); returns bs */ 33 | virtual ByteSet& getBytes(ByteSet& bs) const { 34 | bs.reset(); 35 | return orBytes(bs); 36 | } 37 | 38 | virtual ByteSet& orBytes(ByteSet& bs) const = 0; 39 | virtual byte type() const = 0; 40 | virtual size_t objSize() const = 0; 41 | virtual Transition* clone(void* buffer = 0) const = 0; 42 | virtual size_t numInstructions() const = 0; 43 | virtual bool toInstruction(Instruction* addr) const = 0; 44 | virtual std::string label() const = 0; 45 | /* copying is disabled; deleted rather than privately defined so misuse fails at compile time even from inside the class */ 46 | private: 47 | Transition(const Transition&) = delete; 48 | Transition& operator=(const Transition&) = delete; 49 | }; 50 | -------------------------------------------------------------------------------- /include/unparser.h: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version.
9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see . 17 | */ 18 | 19 | #pragma once 20 | 21 | #include "basic.h" 22 | #include "parsetree.h" 23 | 24 | #include 25 | 26 | std::string byteToCharacterString(uint32_t i); 27 | std::string byteToLiteralString(uint32_t i); 28 | std::string byteSetToCharacterClass(const ByteSet& bs); 29 | 30 | std::string unparse(const ParseTree& tree); 31 | 32 | -------------------------------------------------------------------------------- /include/utility.h: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see . 
17 | */ 18 | 19 | #pragma once 20 | 21 | #include "basic.h" 22 | 23 | #include 24 | #include 25 | 26 | #include "automata.h" 27 | #include "pattern.h" 28 | 29 | struct SearchInfo {}; 30 | 31 | template 32 | uint32_t estimateGraphSize(const std::vector& keywords) { 33 | uint32_t ret = 0; 34 | for (const auto& p : keywords) { 35 | uint32_t pSize = p.Expression.size(); 36 | const std::string& enc = p.Encoding; 37 | // FIXME: Shouldn't we use something from the Encoders for this? 38 | if (enc == "UTF-16LE" || enc == "UTF-16BE") { 39 | pSize <<= 1; 40 | } 41 | else if (enc == "UTF-8") { 42 | pSize *= 3; 43 | pSize >>= 1; 44 | } 45 | else if (enc == "UTF-32LE" || enc == "UTF-32BE") { 46 | pSize <<= 2; 47 | } 48 | ret += pSize; 49 | } 50 | uint32_t fudgeFactor = ret; 51 | fudgeFactor >>= 2; 52 | ret += fudgeFactor; 53 | return ret; 54 | } 55 | 56 | std::pair> bestPair(const NFA& graph); 57 | 58 | std::vector> pivotStates(NFA::VertexDescriptor source, const NFA& graph); 59 | 60 | uint32_t maxOutbound(const std::vector>& tranTable); 61 | 62 | void writeGraphviz(std::ostream& out, const NFA& graph); 63 | -------------------------------------------------------------------------------- /include/vm_interface.h: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 
14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see . 17 | */ 18 | 19 | #pragma once 20 | 21 | #include "basic.h" 22 | #include "fwd_pointers.h" 23 | #include "searchhit.h" 24 | 25 | class VmInterface { 26 | public: 27 | virtual ~VmInterface() {} 28 | 29 | virtual void startsWith(const byte* const beg, const byte* const end, const uint64_t startOffset, HitCallback hitFn, void* userData) = 0; 30 | virtual uint64_t search(const byte* const beg, const byte* const end, const uint64_t startOffset, HitCallback hitFn, void* userData) = 0; 31 | virtual uint64_t searchResolve(const byte* const beg, const byte* const end, const uint64_t startOffset, HitCallback hitFn, void* userData) = 0; 32 | virtual void closeOut(HitCallback hitFn, void* userData) = 0; 33 | virtual void reset() = 0; 34 | 35 | #ifdef LBT_TRACE_ENABLED 36 | virtual void setDebugRange(uint64_t beg, uint64_t end) = 0; 37 | #endif 38 | 39 | static std::shared_ptr create(ProgramPtr prog); 40 | }; 41 | -------------------------------------------------------------------------------- /jenkins.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -ex 2 | 3 | . 
$HOME/vendors/build_config.sh 4 | 5 | clean_it 6 | 7 | ./bootstrap.sh 8 | 9 | build_it 10 | install_it 11 | 12 | mkdir -p $INST/lib/python 13 | cp pylightgrep/lightgrep.py $INST/lib/python 14 | 15 | if [ $Target = 'linux' -a $Linkage = 'shared' ]; then 16 | ln -fsr $INST/lib/liblightgrep.so.0.0.0 $INST/lib/liblightgrep.so.0 17 | ln -fsr $INST/lib/liblightgrep.so.0.0.0 $INST/lib/liblightgrep.so.0.0 18 | fi 19 | 20 | case "$Target" in 21 | linux) 22 | STAGE='src/lib/.libs/liblightgrep.so*' 23 | ;; 24 | 25 | windows) 26 | case "$Linkage" in 27 | shared*) 28 | DLL='src/lib/.libs/liblightgrep.dll' 29 | STAGE="$DLL $($VENDORS/gather.sh $DLL $MINGW_ROOT/bin $DEPS/bin)" 30 | ;; 31 | static) 32 | STAGE='src/lib/.libs/liblightgrep.a' 33 | ;; 34 | esac 35 | ;; 36 | esac 37 | 38 | STAGE+=' pylightgrep/lightgrep.py' 39 | 40 | archive_it 41 | -------------------------------------------------------------------------------- /m4/ax_cxx_compile_stdcxx_11.m4: -------------------------------------------------------------------------------- 1 | # ============================================================================= 2 | # https://www.gnu.org/software/autoconf-archive/ax_cxx_compile_stdcxx_11.html 3 | # ============================================================================= 4 | # 5 | # SYNOPSIS 6 | # 7 | # AX_CXX_COMPILE_STDCXX_11([ext|noext], [mandatory|optional]) 8 | # 9 | # DESCRIPTION 10 | # 11 | # Check for baseline language coverage in the compiler for the C++11 12 | # standard; if necessary, add switches to CXX and CXXCPP to enable 13 | # support. 14 | # 15 | # This macro is a convenience alias for calling the AX_CXX_COMPILE_STDCXX 16 | # macro with the version set to C++11. The two optional arguments are 17 | # forwarded literally as the second and third argument respectively. 18 | # Please see the documentation for the AX_CXX_COMPILE_STDCXX macro for 19 | # more information. 
If you want to use this macro, you also need to 20 | # download the ax_cxx_compile_stdcxx.m4 file. 21 | # 22 | # LICENSE 23 | # 24 | # Copyright (c) 2008 Benjamin Kosnik 25 | # Copyright (c) 2012 Zack Weinberg 26 | # Copyright (c) 2013 Roy Stogner 27 | # Copyright (c) 2014, 2015 Google Inc.; contributed by Alexey Sokolov 28 | # Copyright (c) 2015 Paul Norman 29 | # Copyright (c) 2015 Moritz Klammler 30 | # 31 | # Copying and distribution of this file, with or without modification, are 32 | # permitted in any medium without royalty provided the copyright notice 33 | # and this notice are preserved. This file is offered as-is, without any 34 | # warranty. 35 | 36 | #serial 18 37 | 38 | AX_REQUIRE_DEFINED([AX_CXX_COMPILE_STDCXX]) 39 | AC_DEFUN([AX_CXX_COMPILE_STDCXX_11], [AX_CXX_COMPILE_STDCXX([11], [$1], [$2])]) 40 | -------------------------------------------------------------------------------- /m4/lg_remove_flags.m4: -------------------------------------------------------------------------------- 1 | AC_DEFUN([LG_REMOVE_FLAGS], [{ 2 | lg_remove_flags_out_save=$out 3 | lg_remove_flags_flag_save=$flag 4 | out= 5 | for flag in [$]$1 ; do 6 | case $2 in 7 | $flag\ * | *\ $flag | *\ $flag\ * | $flag) 8 | echo "removing $flag from $1" 9 | ;; 10 | *) 11 | out="$out $flag" 12 | ;; 13 | esac 14 | done 15 | $1=$out 16 | out=$lg_remove_flags_out_save 17 | flag=$lg_remove_flags_flag_save 18 | }]) 19 | -------------------------------------------------------------------------------- /m4/lg_replace_flag.m4: -------------------------------------------------------------------------------- 1 | AC_DEFUN([LG_REPLACE_FLAG], [{ 2 | lg_replace_flag_out_save=$out 3 | lg_replace_flag_flag_save=$flag 4 | out= 5 | for flag in [$]$1 ; do 6 | if test x"$flag" = x"$2" ; then 7 | echo "replacing $flag with $3 in $1" 8 | out="$out $3" 9 | else 10 | out="$out $flag" 11 | fi 12 | done 13 | $1=$out 14 | out=$lg_replace_flag_out_save 15 | flag=$lg_replace_flag_flag_save 16 | }]) 17 | 
-------------------------------------------------------------------------------- /pytest/corpora/marktwainworks.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/corpora/marktwainworks.txt -------------------------------------------------------------------------------- /pytest/corpora/russian-utf16BE.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/corpora/russian-utf16BE.txt -------------------------------------------------------------------------------- /pytest/corpora/russian-utf16LE.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/corpora/russian-utf16LE.txt -------------------------------------------------------------------------------- /pytest/corpora/russian-utf32BE.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/corpora/russian-utf32BE.txt -------------------------------------------------------------------------------- /pytest/corpora/russian-utf32LE.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/corpora/russian-utf32LE.txt -------------------------------------------------------------------------------- /pytest/corpora/russian-utf7.txt: -------------------------------------------------------------------------------- 1 | +/v8EEg 2009 +BDMEPgQ0BEM +BDI +BEAEPgRBBEEEOAQ5BEEEOgQ+--+BDAEPAQ1BEAEOAQ6BDAEPQRBBDoEOARF 
+BD4EQgQ9BD4ESAQ1BD0EOARPBEU +BD0EMARHBDAEOwRBBE8 +BD0EPgQyBEsEOQ +BE0EQgQwBD8. +BB8EQAQ1BDcEOAQ0BDUEPQRCBEs +BCAEPgRBBEEEOAQ4 +BDg +BCEEKAQQ +BBQ.+BBA.+BBwENQQ0BDIENQQ0BDUEMg +BDg +BBE.+BB4EMQQwBDwEMA +BD8EPgQ0BEIEMgQ1BEAENAQ4BDsEOA +BDIEPgRBBEIEQAQ1BDEEPgQyBDAEPQQ9BD4EQQRCBEw +BD8EQAQ4BDQEMAQ9BDgETw +BD4EMQQ9BD4EMgQ7BDUEPQQ9BD4EMwQ+ +BDoEMARHBDUEQQRCBDIEMA +BD4EQgQ9BD4ESAQ1BD0EOARPBDw +BDwENQQ2BDQEQw +BDQEMgRDBDwETw +BEEEQgRABDAEPQQwBDwEOA +BDg +BD4EMQRKBE8EMgQ4BDsEOA +BD4 +BD0EMARHBDAEOwQ1 +BD8EQAQ+BEYENQRBBEEEMA, +BDoEPgRCBD4EQARLBDk +BEIENQQ/BDUEQARM +BDgENwQyBDUEQQRCBDUEPQ +BDoEMAQ6 +AKsEPwQ1BEAENQQ3BDAEMwRABEMENwQ6BDAAuw. 2 | +BCEEQAQ1BDQEOA +BD8EQAQ4BD4EQAQ4BEIENQRCBD0ESwRF +BD0EMAQ/BEAEMAQyBDsENQQ9BDgEOQ - +BEEEPgQyBDwENQRBBEIEPQQwBE8 +BEAEMAQxBD4EQgQw +BDI +BDgEPQRCBDUEQAQ1BEEEMARF +BDwENQQ2BDQEQwQ9BDAEQAQ+BDQEPQQ+BDk +BDEENQQ3BD4EPwQwBEEEPQQ+BEEEQgQ4 +BDg +BEEEQgRABDAEQgQ1BDMEOARHBDUEQQQ6BD4EOQ +BEEEQgQwBDEEOAQ7BEwEPQQ+BEEEQgQ4, +BDEEPgRABEwEMQQw +BEE +BDwENQQ2BDQEQwQ9BDAEQAQ+BDQEPQRLBDw +BEIENQRABEAEPgRABDgENwQ8BD4EPA, +BEEEPgQ0BDUEOQRBBEIEMgQ4BDU +BEAENQRIBDUEPQQ4BE4 +BEAENQQzBDgEPgQ9BDAEOwRMBD0ESwRF +BDoEPgQ9BEQEOwQ4BDoEQgQ+BDI, +BEAEMAQ3BDIEOARCBDgENQ +BEIEPgRABDMEPgQyBD4--+BE0EOgQ+BD0EPgQ8BDgERwQ1BEEEOgQ4BEU +BEEEMgRPBDcENQQ5, +BEAEMARBBEgEOARABDUEPQQ4BDU +BDoEPgQ9BEIEMAQ6BEIEPgQy +BDwENQQ2BDQEQw +BDsETgQ0BEwEPAQ4. -------------------------------------------------------------------------------- /pytest/corpora/russian-utf8.txt: -------------------------------------------------------------------------------- 1 | В 2009 году в российско-американских отношениях начался новый этап. Президенты России и США Д.А.Медведев и Б.Обама подтвердили востребованность придания обновленного качества отношениям между двумя странами и объявили о начале процесса, который теперь известен как «перезагрузка». 
2 | Среди приоритетных направлений - совместная работа в интересах международной безопасности и стратегической стабильности, борьба с международным терроризмом, содействие решению региональных конфликтов, развитие торгово-экономических связей, расширение контактов между людьми. -------------------------------------------------------------------------------- /pytest/corpora/utf16LE-norvig6mb.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/corpora/utf16LE-norvig6mb.txt -------------------------------------------------------------------------------- /pytest/guid_gen.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import uuid 4 | import sys 5 | 6 | def nGuids(n): 7 | for i in xrange(n): 8 | print uuid.uuid4().hex 9 | 10 | if __name__ == "__main__": 11 | nGuids(int(sys.argv[1])) 12 | -------------------------------------------------------------------------------- /pytest/keys/-----5.txt: -------------------------------------------------------------------------------- 1 | roisterers 2 | cajoled 3 | decahedra 4 | ultramicroscopic 5 | certificatory 6 | -------------------------------------------------------------------------------- /pytest/keys/----10.txt: -------------------------------------------------------------------------------- 1 | watchings 2 | execration 3 | rebuker 4 | epitaxies 5 | poppy 6 | faintness 7 | wittedness 8 | nourish 9 | uncoupled 10 | specification 11 | -------------------------------------------------------------------------------- /pytest/keys/----25.txt: -------------------------------------------------------------------------------- 1 | rosaniline 2 | maligned 3 | special 4 | puking 5 | suffragans 6 | dispireme 7 | insusceptibility 8 | muliebria 9 | trilobate 10 | scatterers 11 | epidemiology 12 | price 13 | outlawry 14 | 
nonkosher 15 | imprisoning 16 | lambdacism 17 | clownishness 18 | depression 19 | thalamencephala 20 | morbilli 21 | undogmatic 22 | psorergates 23 | counterweighted 24 | corrigan 25 | discards 26 | -------------------------------------------------------------------------------- /pytest/keys/----50.txt: -------------------------------------------------------------------------------- 1 | watermark 2 | transcalent 3 | navis 4 | dazzlingly 5 | meningopathy 6 | campanili 7 | pastry 8 | gulflike 9 | semiweekly 10 | grumbler 11 | speaker 12 | interrogatively 13 | resolves 14 | pulverulent 15 | nonsubmissive 16 | stabbing 17 | unaesthetic 18 | sojourning 19 | upcountry 20 | pairings 21 | cockles 22 | intrusiveness 23 | reacclimating 24 | flattery 25 | affine 26 | spiritualizes 27 | beleaguer 28 | melanotic 29 | acrocyanosis 30 | sarcoplasmic 31 | obdurating 32 | ototoxic 33 | inflater 34 | edges 35 | styrax 36 | adenocystomata 37 | liquors 38 | buttonholed 39 | masochists 40 | destroys 41 | matinee 42 | crescendos 43 | topologies 44 | hypodermal 45 | seethingly 46 | jurisprudentia 47 | punsters 48 | epithelialize 49 | parry 50 | foliose 51 | -------------------------------------------------------------------------------- /pytest/keys/---100.txt: -------------------------------------------------------------------------------- 1 | wayfaring 2 | libber 3 | depletions 4 | depends 5 | calorie 6 | likenesses 7 | comforters 8 | quieta 9 | dysmnesia 10 | tenaculums 11 | ferris 12 | bussing 13 | viewpoints 14 | papery 15 | causeways 16 | wuchereria 17 | naphthous 18 | mastoiditis 19 | slattern 20 | julep 21 | leveling 22 | prediabetic 23 | birefringent 24 | lithoscope 25 | reawakens 26 | fibbing 27 | milkweed 28 | manum 29 | tracheoschisis 30 | unincumbered 31 | ketoheptose 32 | cholecystectasia 33 | chromatographic 34 | mechanist 35 | medicator 36 | lewdest 37 | smoothers 38 | visions 39 | squashier 40 | autoagglutinnin 41 | unfinished 42 | wenching 43 | overelaborate 44 | 
alloyed 45 | drywall 46 | simulating 47 | versicolor 48 | illustrious 49 | profiteered 50 | unluckily 51 | fawns 52 | cartes 53 | affiliated 54 | cahoot 55 | bostonians 56 | rarer 57 | conveniences 58 | mycotic 59 | cobblestone 60 | beanbag 61 | onetime 62 | mycoplasmal 63 | competency 64 | lonelily 65 | revelation 66 | skein 67 | territorial 68 | fastening 69 | scotomagraph 70 | approximations 71 | torso 72 | interposers 73 | straightway 74 | reissuable 75 | antinuke 76 | faultfinders 77 | semblance 78 | brontosaurs 79 | furies 80 | praises 81 | gonecystolith 82 | lavished 83 | paganish 84 | defrays 85 | renegotiated 86 | sidney 87 | antineutrino 88 | pithiness 89 | algedonic 90 | enamor 91 | unmanly 92 | refillable 93 | fertilizing 94 | monies 95 | decolonizing 96 | beasts 97 | undiluted 98 | disgraceful 99 | accelerometers 100 | gaited 101 | -------------------------------------------------------------------------------- /pytest/keys/fixed30.txt: -------------------------------------------------------------------------------- 1 | deniable 2 | START 3 | scary 4 | google 5 | shines through 6 | RIFF 7 | jpg 8 | osama bin laden 9 | USA 10 | $500 11 | dependable 12 | burst into tears 13 | case 14 | warez 15 | download 16 | Firefox 17 | explorer 18 | Favorites 19 | vegetable 20 | understand 21 | TGIF 22 | INDX 23 | ncldobson 24 | @gmail 25 | territory 26 | terrorist 27 | 28 | independent 29 | Hotmail 30 | well informed 31 | -------------------------------------------------------------------------------- /pytest/keys/shuf05.txt: -------------------------------------------------------------------------------- 1 | 3(4|7)##[\x20\x2D]?######[\x20\x2D]?##### 2 | un((deniabl(e|y))|(couth)) 3 | [\x2A\x2E]?START 4 | s((cary)|(pooky))\x20?pe((ople)|(rsons)) 5 | [a-z][a-z_\-#]+\.[a-z][a-z_\-#]+\.[a-z][a-z][a-z][a-z]?([a-z][a-z][a-z]?[a-z]?)? 
6 | -------------------------------------------------------------------------------- /pytest/keys/shuf10.txt: -------------------------------------------------------------------------------- 1 | 3(4|7)##[\x20\x2D]?######[\x20\x2D]?##### 2 | un((deniabl(e|y))|(couth)) 3 | [\x2A\x2E]?START 4 | s((cary)|(pooky))\x20?pe((ople)|(rsons)) 5 | [a-z][a-z_\-#]+\.[a-z][a-z_\-#]+\.[a-z][a-z][a-z][a-z]?([a-z][a-z][a-z]?[a-z]?)? 6 | \x2E\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x10....................\x2E\x2E\x20\x20\x20\x20\x20\x20\x20\x20\x20\x10.................... 7 | shines? through 8 | RIF[FX] 9 | \xE5.......jp(e|g).........\x00\x00....\xF9\xC0.... 10 | osama[\x5F\x20\x2D]?bin[\x5F\x20\x2D]?laden 11 | -------------------------------------------------------------------------------- /pytest/keys/shuf15.txt: -------------------------------------------------------------------------------- 1 | 3(4|7)##[\x20\x2D]?######[\x20\x2D]?##### 2 | un((deniabl(e|y))|(couth)) 3 | [\x2A\x2E]?START 4 | s((cary)|(pooky))\x20?pe((ople)|(rsons)) 5 | [a-z][a-z_\-#]+\.[a-z][a-z_\-#]+\.[a-z][a-z][a-z][a-z]?([a-z][a-z][a-z]?[a-z]?)? 6 | \x2E\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x10....................\x2E\x2E\x20\x20\x20\x20\x20\x20\x20\x20\x20\x10.................... 7 | shines? through 8 | RIF[FX] 9 | \xE5.......jp(e|g).........\x00\x00....\xF9\xC0.... 10 | osama[\x5F\x20\x2D]?bin[\x5F\x20\x2D]?laden 11 | [^a-zA-Z0-9]U\.S\.A\.[^a-zA-Z0-9] 12 | $500* 13 | \x52\x61\x72\x21\x1A\x07 14 | \x21\x42\x44\x4E 15 | depend((able)|(ent)|s) 16 | -------------------------------------------------------------------------------- /pytest/keys/shuf20.txt: -------------------------------------------------------------------------------- 1 | 3(4|7)##[\x20\x2D]?######[\x20\x2D]?##### 2 | un((deniabl(e|y))|(couth)) 3 | [\x2A\x2E]?START 4 | s((cary)|(pooky))\x20?pe((ople)|(rsons)) 5 | [a-z][a-z_\-#]+\.[a-z][a-z_\-#]+\.[a-z][a-z][a-z][a-z]?([a-z][a-z][a-z]?[a-z]?)? 
6 | \x2E\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x10....................\x2E\x2E\x20\x20\x20\x20\x20\x20\x20\x20\x20\x10.................... 7 | shines? through 8 | RIF[FX] 9 | \xE5.......jp(e|g).........\x00\x00....\xF9\xC0.... 10 | osama[\x5F\x20\x2D]?bin[\x5F\x20\x2D]?laden 11 | [^a-zA-Z0-9]U\.S\.A\.[^a-zA-Z0-9] 12 | $500* 13 | \x52\x61\x72\x21\x1A\x07 14 | \x21\x42\x44\x4E 15 | depend((able)|(ent)|s) 16 | [^a-zA-Z0-9][A-Z]\.[A-Z]\.[A-Z][^a-zA-Z0-9] 17 | rpg[\x5F\x20]?suppl(y|(ier)) 18 | burst(ing)? into tears 19 | GIF8[79](([^\x00])|([\x00][^\x3B]))+\x00\x3B 20 | \x01\x00\x00\x00....................................\x20EMF 21 | -------------------------------------------------------------------------------- /pytest/keys/shuf25.txt: -------------------------------------------------------------------------------- 1 | 3(4|7)##[\x20\x2D]?######[\x20\x2D]?##### 2 | un((deniabl(e|y))|(couth)) 3 | [\x2A\x2E]?START 4 | s((cary)|(pooky))\x20?pe((ople)|(rsons)) 5 | [a-z][a-z_\-#]+\.[a-z][a-z_\-#]+\.[a-z][a-z][a-z][a-z]?([a-z][a-z][a-z]?[a-z]?)? 6 | \x2E\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x10....................\x2E\x2E\x20\x20\x20\x20\x20\x20\x20\x20\x20\x10.................... 7 | shines? through 8 | RIF[FX] 9 | \xE5.......jp(e|g).........\x00\x00....\xF9\xC0.... 10 | osama[\x5F\x20\x2D]?bin[\x5F\x20\x2D]?laden 11 | [^a-zA-Z0-9]U\.S\.A\.[^a-zA-Z0-9] 12 | $500* 13 | \x52\x61\x72\x21\x1A\x07 14 | \x21\x42\x44\x4E 15 | depend((able)|(ent)|s) 16 | [^a-zA-Z0-9][A-Z]\.[A-Z]\.[A-Z][^a-zA-Z0-9] 17 | rpg[\x5F\x20]?suppl(y|(ier)) 18 | burst(ing)? into tears 19 | GIF8[79](([^\x00])|([\x00][^\x3B]))+\x00\x3B 20 | \x01\x00\x00\x00....................................\x20EMF 21 | \x28?###[\x20\x29\x2D\x2E]*###[\x20\x2D\x2E]?####[^#] 22 | ##?[\.\-/, ]##?[\.\-/, ]##(##)? 
23 | vegeta((ble)|(rian)|(t(ive)|(e)|(ion))) 24 | understand(\x3F|(abl(e|y))) 25 | 1Z\x20?[#a-z][#a-z][#a-z]\x20?[#a-z][#a-z][#a-z]\x20?[#a-z]#\x20?####\x20?###\x20?# 26 | -------------------------------------------------------------------------------- /pytest/keys/shuf30.txt: -------------------------------------------------------------------------------- 1 | 3(4|7)##[\x20\x2D]?######[\x20\x2D]?##### 2 | un((deniabl(e|y))|(couth)) 3 | [\x2A\x2E]?START 4 | s((cary)|(pooky))\x20?pe((ople)|(rsons)) 5 | [a-z][a-z_\-#]+\.[a-z][a-z_\-#]+\.[a-z][a-z][a-z][a-z]?([a-z][a-z][a-z]?[a-z]?)? 6 | \x2E\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x10....................\x2E\x2E\x20\x20\x20\x20\x20\x20\x20\x20\x20\x10.................... 7 | shines? through 8 | RIF[FX] 9 | \xE5.......jp(e|g).........\x00\x00....\xF9\xC0.... 10 | osama[\x5F\x20\x2D]?bin[\x5F\x20\x2D]?laden 11 | [^a-zA-Z0-9]U\.S\.A\.[^a-zA-Z0-9] 12 | $500* 13 | \x52\x61\x72\x21\x1A\x07 14 | \x21\x42\x44\x4E 15 | depend((able)|(ent)|s) 16 | [^a-zA-Z0-9][A-Z]\.[A-Z]\.[A-Z][^a-zA-Z0-9] 17 | rpg[\x5F\x20]?suppl(y|(ier)) 18 | burst(ing)? into tears 19 | GIF8[79](([^\x00])|([\x00][^\x3B]))+\x00\x3B 20 | \x01\x00\x00\x00....................................\x20EMF 21 | \x28?###[\x20\x29\x2D\x2E]*###[\x20\x2D\x2E]?####[^#] 22 | ##?[\.\-/, ]##?[\.\-/, ]##(##)? 
23 | vegeta((ble)|(rian)|(t(ive)|(e)|(ion))) 24 | understand(\x3F|(abl(e|y))) 25 | 1Z\x20?[#a-z][#a-z][#a-z]\x20?[#a-z][#a-z][#a-z]\x20?[#a-z]#\x20?####\x20?###\x20?# 26 | [^a-zA-Z0-9]T\.G\.I\.F\.[^a-zA-Z0-9] 27 | INDX\x28\x00\x09 28 | ncldobson@yahoo.com 29 | \x89\x50\x4E\x47([^\x49]|(\x49[^\x45])|(\x49\x45[^\x4E])|(\x49\x45\x4E[^\x44])|(\x49\x45\x4E\x44[^\xAE])|(\x49\x45\x4E\x44\xAE[^\x42])|(\x49\x45\x4E\x44\xAE\x42[^\x60])|(\x49\x45\x4E\x44\xAE\x42\x60[^\x82]))+\x49\x45\x4E\x44\xAE\x42\x60\x82 30 | \xFF\xD8\xFF[\xFE\xE0\xDB\xC4\xE1\xEE]([^\xFF]|(\xFF[^\xD9]))+\xFF\xD9 31 | -------------------------------------------------------------------------------- /pytest/keys/shuf35.txt: -------------------------------------------------------------------------------- 1 | 3(4|7)##[\x20\x2D]?######[\x20\x2D]?##### 2 | un((deniabl(e|y))|(couth)) 3 | [\x2A\x2E]?START 4 | s((cary)|(pooky))\x20?pe((ople)|(rsons)) 5 | [a-z][a-z_\-#]+\.[a-z][a-z_\-#]+\.[a-z][a-z][a-z][a-z]?([a-z][a-z][a-z]?[a-z]?)? 6 | \x2E\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x10....................\x2E\x2E\x20\x20\x20\x20\x20\x20\x20\x20\x20\x10.................... 7 | shines? through 8 | RIF[FX] 9 | \xE5.......jp(e|g).........\x00\x00....\xF9\xC0.... 10 | osama[\x5F\x20\x2D]?bin[\x5F\x20\x2D]?laden 11 | [^a-zA-Z0-9]U\.S\.A\.[^a-zA-Z0-9] 12 | $500* 13 | \x52\x61\x72\x21\x1A\x07 14 | \x21\x42\x44\x4E 15 | depend((able)|(ent)|s) 16 | [^a-zA-Z0-9][A-Z]\.[A-Z]\.[A-Z][^a-zA-Z0-9] 17 | rpg[\x5F\x20]?suppl(y|(ier)) 18 | burst(ing)? into tears 19 | GIF8[79](([^\x00])|([\x00][^\x3B]))+\x00\x3B 20 | \x01\x00\x00\x00....................................\x20EMF 21 | \x28?###[\x20\x29\x2D\x2E]*###[\x20\x2D\x2E]?####[^#] 22 | ##?[\.\-/, ]##?[\.\-/, ]##(##)? 
23 | vegeta((ble)|(rian)|(t(ive)|(e)|(ion))) 24 | understand(\x3F|(abl(e|y))) 25 | 1Z\x20?[#a-z][#a-z][#a-z]\x20?[#a-z][#a-z][#a-z]\x20?[#a-z]#\x20?####\x20?###\x20?# 26 | [^a-zA-Z0-9]T\.G\.I\.F\.[^a-zA-Z0-9] 27 | INDX\x28\x00\x09 28 | ncldobson@yahoo.com 29 | \x89\x50\x4E\x47([^\x49]|(\x49[^\x45])|(\x49\x45[^\x4E])|(\x49\x45\x4E[^\x44])|(\x49\x45\x4E\x44[^\xAE])|(\x49\x45\x4E\x44\xAE[^\x42])|(\x49\x45\x4E\x44\xAE\x42[^\x60])|(\x49\x45\x4E\x44\xAE\x42\x60[^\x82]))+\x49\x45\x4E\x44\xAE\x42\x60\x82 30 | \xFF\xD8\xFF[\xFE\xE0\xDB\xC4\xE1\xEE]([^\xFF]|(\xFF[^\xD9]))+\xFF\xD9 31 | @((hotmail)|(yahoo)|(gmail))\x2E((com)|(net)) 32 | terr((itory)|(or(ist)?)) 33 | \x46\x4C\x56.\x05\x00\x00\x00.\x00\x00\x00\x00 34 | \x43\x48\x49\x55\x20?\x54\x60\x41\x4E\x47 35 | <html>([^h]|(h[^t]))+</html> 36 | -------------------------------------------------------------------------------- /pytest/keys/shuf40.txt: -------------------------------------------------------------------------------- 1 | 3(4|7)##[\x20\x2D]?######[\x20\x2D]?##### 2 | un((deniabl(e|y))|(couth)) 3 | [\x2A\x2E]?START 4 | s((cary)|(pooky))\x20?pe((ople)|(rsons)) 5 | [a-z][a-z_\-#]+\.[a-z][a-z_\-#]+\.[a-z][a-z][a-z][a-z]?([a-z][a-z][a-z]?[a-z]?)? 6 | \x2E\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x10....................\x2E\x2E\x20\x20\x20\x20\x20\x20\x20\x20\x20\x10.................... 7 | shines? through 8 | RIF[FX] 9 | \xE5.......jp(e|g).........\x00\x00....\xF9\xC0.... 10 | osama[\x5F\x20\x2D]?bin[\x5F\x20\x2D]?laden 11 | [^a-zA-Z0-9]U\.S\.A\.[^a-zA-Z0-9] 12 | $500* 13 | \x52\x61\x72\x21\x1A\x07 14 | \x21\x42\x44\x4E 15 | depend((able)|(ent)|s) 16 | [^a-zA-Z0-9][A-Z]\.[A-Z]\.[A-Z][^a-zA-Z0-9] 17 | rpg[\x5F\x20]?suppl(y|(ier)) 18 | burst(ing)? into tears 19 | GIF8[79](([^\x00])|([\x00][^\x3B]))+\x00\x3B 20 | \x01\x00\x00\x00....................................\x20EMF 21 | \x28?###[\x20\x29\x2D\x2E]*###[\x20\x2D\x2E]?####[^#] 22 | ##?[\.\-/, ]##?[\.\-/, ]##(##)? 
23 | vegeta((ble)|(rian)|(t(ive)|(e)|(ion))) 24 | understand(\x3F|(abl(e|y))) 25 | 1Z\x20?[#a-z][#a-z][#a-z]\x20?[#a-z][#a-z][#a-z]\x20?[#a-z]#\x20?####\x20?###\x20?# 26 | [^a-zA-Z0-9]T\.G\.I\.F\.[^a-zA-Z0-9] 27 | INDX\x28\x00\x09 28 | ncldobson@yahoo.com 29 | \x89\x50\x4E\x47([^\x49]|(\x49[^\x45])|(\x49\x45[^\x4E])|(\x49\x45\x4E[^\x44])|(\x49\x45\x4E\x44[^\xAE])|(\x49\x45\x4E\x44\xAE[^\x42])|(\x49\x45\x4E\x44\xAE\x42[^\x60])|(\x49\x45\x4E\x44\xAE\x42\x60[^\x82]))+\x49\x45\x4E\x44\xAE\x42\x60\x82 30 | \xFF\xD8\xFF[\xFE\xE0\xDB\xC4\xE1\xEE]([^\xFF]|(\xFF[^\xD9]))+\xFF\xD9 31 | @((hotmail)|(yahoo)|(gmail))\x2E((com)|(net)) 32 | terr((itory)|(or(ist)?)) 33 | \x46\x4C\x56.\x05\x00\x00\x00.\x00\x00\x00\x00 34 | \x43\x48\x49\x55\x20?\x54\x60\x41\x4E\x47 35 | <html>([^h]|(h[^t]))+</html> 36 | ##?[\x20\x2C\x2D\x2E\x2F]##?[\x20\x2C\x2D\x2E\x2F]##(##)? 37 | \xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1 38 | independen(t|(ce)) 39 | <title>(MSN\x20)?Hotmail 40 | \x50\x4B\x03\x04([^\x50]|(\x50[^\x4B]))+\x50\x4B\x05\x06 41 | -------------------------------------------------------------------------------- /pytest/keys/shuf45.txt: -------------------------------------------------------------------------------- 1 | 3(4|7)##[\x20\x2D]?######[\x20\x2D]?##### 2 | un((deniabl(e|y))|(couth)) 3 | [\x2A\x2E]?START 4 | s((cary)|(pooky))\x20?pe((ople)|(rsons)) 5 | [a-z][a-z_\-#]+\.[a-z][a-z_\-#]+\.[a-z][a-z][a-z][a-z]?([a-z][a-z][a-z]?[a-z]?)? 6 | \x2E\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x10....................\x2E\x2E\x20\x20\x20\x20\x20\x20\x20\x20\x20\x10.................... 7 | shines? through 8 | RIF[FX] 9 | \xE5.......jp(e|g).........\x00\x00....\xF9\xC0.... 10 | osama[\x5F\x20\x2D]?bin[\x5F\x20\x2D]?laden 11 | [^a-zA-Z0-9]U\.S\.A\.[^a-zA-Z0-9] 12 | $500* 13 | \x52\x61\x72\x21\x1A\x07 14 | \x21\x42\x44\x4E 15 | depend((able)|(ent)|s) 16 | [^a-zA-Z0-9][A-Z]\.[A-Z]\.[A-Z][^a-zA-Z0-9] 17 | rpg[\x5F\x20]?suppl(y|(ier)) 18 | burst(ing)? 
into tears 19 | GIF8[79](([^\x00])|([\x00][^\x3B]))+\x00\x3B 20 | \x01\x00\x00\x00....................................\x20EMF 21 | \x28?###[\x20\x29\x2D\x2E]*###[\x20\x2D\x2E]?####[^#] 22 | ##?[\.\-/, ]##?[\.\-/, ]##(##)? 23 | vegeta((ble)|(rian)|(t(ive)|(e)|(ion))) 24 | understand(\x3F|(abl(e|y))) 25 | 1Z\x20?[#a-z][#a-z][#a-z]\x20?[#a-z][#a-z][#a-z]\x20?[#a-z]#\x20?####\x20?###\x20?# 26 | [^a-zA-Z0-9]T\.G\.I\.F\.[^a-zA-Z0-9] 27 | INDX\x28\x00\x09 28 | ncldobson@yahoo.com 29 | \x89\x50\x4E\x47([^\x49]|(\x49[^\x45])|(\x49\x45[^\x4E])|(\x49\x45\x4E[^\x44])|(\x49\x45\x4E\x44[^\xAE])|(\x49\x45\x4E\x44\xAE[^\x42])|(\x49\x45\x4E\x44\xAE\x42[^\x60])|(\x49\x45\x4E\x44\xAE\x42\x60[^\x82]))+\x49\x45\x4E\x44\xAE\x42\x60\x82 30 | \xFF\xD8\xFF[\xFE\xE0\xDB\xC4\xE1\xEE]([^\xFF]|(\xFF[^\xD9]))+\xFF\xD9 31 | @((hotmail)|(yahoo)|(gmail))\x2E((com)|(net)) 32 | terr((itory)|(or(ist)?)) 33 | \x46\x4C\x56.\x05\x00\x00\x00.\x00\x00\x00\x00 34 | \x43\x48\x49\x55\x20?\x54\x60\x41\x4E\x47 35 | <html>([^h]|(h[^t]))+</html> 36 | ##?[\x20\x2C\x2D\x2E\x2F]##?[\x20\x2C\x2D\x2E\x2F]##(##)? 
37 | \xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1 38 | independen(t|(ce)) 39 | <title>(MSN\x20)?Hotmail 40 | \x50\x4B\x03\x04([^\x50]|(\x50[^\x4B]))+\x50\x4B\x05\x06 41 | (0|([3-9]#?)|(1#?#?)|(2([0-4]#?)|(5[0-5]?)|[6-9]))\x2E(0|([3-9]#?)|(1#?#?)|(2([0-4]#?)|(5[0-5]?)|[6-9]))\x2E(0|([3-9]#?)|(1#?#?)|(2([0-4]#?)|(5[0-5]?)|[6-9]))\x2E(0|([3-9]#?)|(1#?#?)|(2([0-4]#?)|(5[0-5]?)|[6-9])) 42 | FILE0\x00\x03\x00 43 | well[\x2D\x20]?informed 44 | 36##[\x20\x2D]?######[\x20\x2D]?#### 45 | 50+,0+,0+ 46 | -------------------------------------------------------------------------------- /pytest/keys/twain.txt: -------------------------------------------------------------------------------- 1 | Twain 2 | [\n\r]Twain 3 | Twain[\n\r] 4 | Huck[a-zA-Z]+ 5 | Tom|Sawyer|Huckelberry|Finn 6 | .(Tom|Sawyer|Huckelberry|Finn) 7 | [a-zA-Z]+ing 8 | [a-zA-Z]+mov[a-zA-Z]+ 9 | ([A-Za-z]awyer|[A-Za-z]inn)[^a-zA-Z] 10 | [A-Z]awyer|[A-Z]inn 11 | Tom.{0,30}river|river.{0,30}Tom 12 | [\n\r][a-zA-Z]{0,4}ing[^a-zA-Z] 13 | [\n\r][a-zA-Z]{5,}[\n\r] 14 | [a-zA-Z]+ing[\n\r] 15 | [\n\r].[\n\r] 16 | -------------------------------------------------------------------------------- /pytest/receive_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import struct 4 | import socket 5 | 6 | sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 7 | sock.bind(('127.0.0.1', 12777)) 8 | sock.listen(1) 9 | conn, addr = sock.accept() 10 | 11 | data = '' 12 | 13 | while 1: 14 | while len(data) < 16: 15 | data += conn.recv(1024) 16 | if not data: break 17 | 18 | if not data: break 19 | 20 | i, size = struct.unpack('QQ', data[0:16]) 21 | print("Received id %s with %s bytes" % (i, size)) 22 | 23 | while len(data) < size + 16: 24 | data += conn.recv(1024) 25 | if not data: break 26 | 27 | if not data: break 28 | 29 | # data = data[16+size:] 30 | 31 | for j in range(0, size): 32 | print struct.unpack('B', data[16+j]) 33 | data = data[16+size:] 34 | 35 | sock.close() 36 
| -------------------------------------------------------------------------------- /pytest/results/-----5.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/results/-----5.txt -------------------------------------------------------------------------------- /pytest/results/----10.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/results/----10.txt -------------------------------------------------------------------------------- /pytest/results/----25.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/results/----25.txt -------------------------------------------------------------------------------- /pytest/results/----50.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/results/----50.txt -------------------------------------------------------------------------------- /pytest/results/---100.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/results/---100.txt -------------------------------------------------------------------------------- /pytest/results/---200.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/results/---200.txt -------------------------------------------------------------------------------- 
/pytest/results/---500.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/results/---500.txt -------------------------------------------------------------------------------- /pytest/results/--1000.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/results/--1000.txt -------------------------------------------------------------------------------- /pytest/results/--1500.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/results/--1500.txt -------------------------------------------------------------------------------- /pytest/results/--2000.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/results/--2000.txt -------------------------------------------------------------------------------- /pytest/results/--2500.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/results/--2500.txt -------------------------------------------------------------------------------- /pytest/results/--3000.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/results/--3000.txt -------------------------------------------------------------------------------- /pytest/results/--4000.txt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/results/--4000.txt -------------------------------------------------------------------------------- /pytest/results/--4500.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/results/--4500.txt -------------------------------------------------------------------------------- /pytest/results/--5000.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/results/--5000.txt -------------------------------------------------------------------------------- /pytest/results/--6000.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/results/--6000.txt -------------------------------------------------------------------------------- /pytest/results/--7000.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/results/--7000.txt -------------------------------------------------------------------------------- /pytest/results/--8000.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/results/--8000.txt -------------------------------------------------------------------------------- /pytest/results/--9000.txt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/results/--9000.txt -------------------------------------------------------------------------------- /pytest/results/-10000.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/results/-10000.txt -------------------------------------------------------------------------------- /pytest/results/-11000.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/results/-11000.txt -------------------------------------------------------------------------------- /pytest/results/-12000.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/results/-12000.txt -------------------------------------------------------------------------------- /pytest/results/-13000.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/results/-13000.txt -------------------------------------------------------------------------------- /pytest/results/-14000.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/results/-14000.txt -------------------------------------------------------------------------------- /pytest/results/-15000.txt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/results/-15000.txt -------------------------------------------------------------------------------- /pytest/results/-17500.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/results/-17500.txt -------------------------------------------------------------------------------- /pytest/results/-20000.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/results/-20000.txt -------------------------------------------------------------------------------- /pytest/results/-25000.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/results/-25000.txt -------------------------------------------------------------------------------- /pytest/results/-30000.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/results/-30000.txt -------------------------------------------------------------------------------- /pytest/results/-35000.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/results/-35000.txt -------------------------------------------------------------------------------- /pytest/results/-40000.txt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/results/-40000.txt -------------------------------------------------------------------------------- /pytest/results/-45000.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/results/-45000.txt -------------------------------------------------------------------------------- /pytest/results/-50000.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/results/-50000.txt -------------------------------------------------------------------------------- /pytest/results/-60000.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/results/-60000.txt -------------------------------------------------------------------------------- /pytest/results/-70000.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/results/-70000.txt -------------------------------------------------------------------------------- /pytest/results/-80000.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/results/-80000.txt -------------------------------------------------------------------------------- /pytest/results/-90000.txt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/results/-90000.txt -------------------------------------------------------------------------------- /pytest/results/100000.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/results/100000.txt -------------------------------------------------------------------------------- /pytest/results/114743.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/pytest/results/114743.txt -------------------------------------------------------------------------------- /pytest/send_data_to_lg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import struct 4 | import socket 5 | import random 6 | import string 7 | import os 8 | from multiprocessing import Process 9 | 10 | class LGClient: 11 | # struct FileHeader { 12 | # byte Cmd, 13 | # Type; 14 | # uint64 ID, 15 | # StartOffset, 16 | # Length; 17 | # }; 18 | 19 | def __init__(self, prefix): 20 | self.prefix = prefix 21 | self.totalStreams = random.randint(1, 10) 22 | self.hdr = struct.Struct('<BBQQQ') 23 | self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 24 | 25 | def connect(self): 26 | self.sock.connect(('127.0.0.1', 12777)) 27 | self.sock.setsockopt(socket.SOL_SOCKET, socket.SO_KEEPALIVE, 1) 28 | print("connected to lg as " + str(self.sock.getsockname())) 29 | 30 | def sendSearchHeader(self, id, len): 31 | self.sock.sendall(self.hdr.pack(0, 0, id, 0, len)) 32 | 33 | def hangup(self): 34 | self.sock.sendall(self.hdr.pack(2, 0, 0, 0, 0)) 35 | self.sock.recv(1) 36 | self.sock.close() 37 | 38 | def shutdown(self): 39 | self.sock.sendall(self.hdr.pack(3, 0, 0, 0, 0)) 40 | 
def main():
  """Create an empty benchmark database at the path given in sys.argv[1].

  Two tables are created: ``commits`` (git hash -> timestamp) and ``perf``
  (one row of perf counters per (git, test_name) pair).

  If table creation fails, a database file that this call created is removed
  again; a file that already existed before the call is left untouched.

  Raises:
    SystemExit: if no database path was given on the command line.
    sqlite3.Error: if a table cannot be created (e.g. it already exists).
  """
  if len(sys.argv) < 2:
    sys.exit('usage: %s DBFILE' % sys.argv[0])

  dbfile = sys.argv[1]

  # Remember whether the file was there before we touched it: cleaning up
  # after a failure must never delete somebody's existing database.
  preexisting = os.path.exists(dbfile)

  perf_schema = \
    'CREATE TABLE perf ( \
      git TEXT, \
      test_name TEXT, \
      seconds REAL, \
      task_clock REAL, \
      context_switches INTEGER, \
      CPU_migrations INTEGER, \
      page_faults INTEGER, \
      cycles INTEGER, \
      stalled_cycles_frontend INTEGER, \
      stalled_cycles_backend INTEGER, \
      instructions INTEGER, \
      branches INTEGER, \
      branch_misses INTEGER, \
      L1_dcache_loads INTEGER, \
      L1_dcache_load_misses INTEGER, \
      LLC_loads INTEGER, \
      LLC_load_misses INTEGER, \
      PRIMARY KEY (git, test_name) \
    )'

  commit_schema = \
    'CREATE TABLE commits ( \
      git TEXT PRIMARY KEY, \
      timestamp INTEGER \
    )'

  try:
    db = sqlite3.connect(dbfile)
    try:
      # "with db" only commits or rolls back the transaction; it does not
      # close the connection, hence the explicit close() below.
      with db:
        db.execute(commit_schema)
        db.execute(perf_schema)
    finally:
      db.close()
  except:
    # Deliberately bare: also clean up on KeyboardInterrupt, then re-raise.
    if not preexisting and os.path.exists(dbfile):
      os.unlink(dbfile)
    raise
[(id, 'norvig', 'pytest/keys/114743.txt', 'pytest/corpora/norvig6mb.txt'), 27 | # (id, 'twain', 'pytest/keys/twain.txt', 'pytest/corpora/marktwainworks.txt')] 28 | tests = [(id, 'norvig', 'pytest/keys/shuf55.txt', 'stupid.txt')] 29 | procs = [Process(target=runProc, args=t) for t in tests] 30 | for p in procs: 31 | p.start() 32 | for p in procs: 33 | p.join() 34 | -------------------------------------------------------------------------------- /re_gen/Makefile: -------------------------------------------------------------------------------- 1 | CC=g++ 2 | CXX=g++ 3 | CPPFLAGS=-O3 -W -Wall -Wextra -pedantic -std=c++11 4 | LDFLAGS=-lpcre -lstdc++ 5 | 6 | all: parsecheck randpat shitgrep 7 | 8 | parsecheck: parsecheck.o 9 | 10 | randpat: randpat.o unparser.o node.o 11 | 12 | shitgrep: shitgrep.o 13 | 14 | clean: 15 | +rm -f *.o parsecheck randpat shitgrep 16 | 17 | -------------------------------------------------------------------------------- /re_gen/aQ-3-3.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/re_gen/aQ-3-3.bz2 -------------------------------------------------------------------------------- /re_gen/aQ-3.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/re_gen/aQ-3.bz2 -------------------------------------------------------------------------------- /re_gen/basic.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <bitset> 4 | #include <stdexcept> 5 | #include <sstream> 6 | #include <vector> 7 | #include <utility> 8 | #include <boost/smart_ptr.hpp> 9 | 10 | typedef unsigned char byte; 11 | 12 | typedef unsigned int uint32; 13 | typedef int int32; 14 | 15 | typedef unsigned long long uint64; 16 | typedef long long 
// Builds a message of the form "<file>:<line>: <expression>" by streaming
// `expression` into a std::ostringstream, then throws an exceptType
// constructed from that string.
//
// The body is wrapped in do { ... } while (0) so the macro expands to exactly
// one statement: it is safe in an unbraced if/else, it can be used more than
// once in the same scope, and its temporary stream cannot collide with a
// caller's own variable named `buf`.
#define THROW_WITH_OUTPUT(exceptType, expression) \
  do { \
    std::ostringstream throw_with_output_buf_; \
    throw_with_output_buf_ << __FILE__ << ":" << __LINE__ << ": " << expression; \
    throw exceptType(throw_with_output_buf_.str()); \
  } while (0)

// Convenience form of THROW_WITH_OUTPUT which throws std::runtime_error.
#define THROW_RUNTIME_ERROR_WITH_OUTPUT(expression) THROW_WITH_OUTPUT(std::runtime_error, expression)
def main():
  """Shrink a pattern set on which shitgrep and lightgrep disagree.

  Patterns are read from stdin, one per line; the text to search is
  sys.argv[1]. The set is first narrowed with partition(), then patterns are
  dropped one at a time for as long as mismatch() still reports a
  disagreement without them. The surviving list is printed to stdout.

  Raises:
    Exception: if the script was not given exactly one argument.
  """
  # Resolve the two binaries relative to this script. abspath() matters:
  # dirname(__file__) is '' when the script is started from its own
  # directory, and '' + '/shitgrep' would point at the filesystem root.
  here = os.path.dirname(os.path.abspath(__file__))
  sg = os.path.join(here, 'shitgrep')
  lg = os.path.join(here, '..', 'bin', 'src', 'cmd', 'lightgrep')

  if len(sys.argv) != 2:
    raise Exception('wrong number of arguments')

  text = sys.argv[1]
  pats = [ p.rstrip('\n') for p in sys.stdin.readlines() ]

  # binary search to weed out good patterns quickly
  pats = partition(sg, lg, pats, text)

  # now remove patterns one at a time; walk a snapshot so that removing
  # from pats cannot disturb the iteration
  for p in reversed(list(pats)):
    if mismatch(sg, lg, [ x for x in pats if x != p ], text):
      pats.remove(p)

  # single-argument print(): same output under Python 2 and Python 3
  print(pats)
pattern list is ok 47 | return list() 48 | 49 | 50 | def mismatch(sg, lg, pats, text): 51 | # get matches from shitgrep 52 | sgmatches = lgtestlib.run_shitgrep(sg, pats, text) 53 | 54 | # get matches from lightgrep 55 | lgmatches = lgtestlib.run_lightgrep(lg, pats, text) 56 | 57 | return lgmatches != sgmatches 58 | 59 | 60 | if __name__ == "__main__": 61 | sys.exit(main()) 62 | -------------------------------------------------------------------------------- /re_gen/node.cpp: -------------------------------------------------------------------------------- 1 | #include "node.h" 2 | 3 | void repetition(std::ostream& out, uint32 min, uint32 max) { 4 | if (min == 0) { 5 | if (max == 1) { 6 | // ? is {0,1} 7 | out << '?'; 8 | return; 9 | } 10 | else if (max == UNBOUNDED) { 11 | // * is {0,} 12 | out << '*'; 13 | return; 14 | } 15 | } 16 | else if (min == 1 && max == UNBOUNDED) { 17 | // + is {1,} 18 | out << '+'; 19 | return; 20 | } 21 | 22 | out << '{' << min; 23 | 24 | if (max == UNBOUNDED) { 25 | out << ','; 26 | } 27 | else if (max != min) { 28 | out << ',' << max; 29 | } 30 | 31 | out << '}'; 32 | } 33 | 34 | std::ostream& operator<<(std::ostream& out, const Node& n) { 35 | switch (n.Type) { 36 | case Node::REGEXP: 37 | return out << "REGEXP"; 38 | case Node::ALTERNATION: 39 | return out << '|'; 40 | case Node::CONCATENATION: 41 | return out << '&'; 42 | case Node::REPETITION: 43 | repetition(out, n.Min, n.Max); 44 | return out; 45 | case Node::REPETITION_NG: 46 | repetition(out, n.Min, n.Max); 47 | return out << '?'; 48 | case Node::ELEMENT: 49 | return out << "ELEMENT"; 50 | case Node::DOT: 51 | return out << '.'; 52 | case Node::CHAR_CLASS: 53 | return out << n.Bits; 54 | case Node::LITERAL: 55 | return out << (char) n.Val; 56 | case Node::IGNORE: 57 | return out << "IGNORE"; 58 | default: 59 | return out << "WTF"; 60 | } 61 | } 62 | 63 | void printTree(std::ostream& out, const Node& n) { 64 | if (n.Right) { 65 | printTree(out, *n.Right); 66 | } 67 | 68 | if 
(n.Left) { 69 | printTree(out, *n.Left); 70 | } 71 | 72 | out << n << '\n'; 73 | } 74 | 75 | void printTreeDetails(std::ostream& out, const Node& n) { 76 | if (n.Right) { 77 | printTreeDetails(out, *n.Right); 78 | } 79 | 80 | if (n.Left) { 81 | printTreeDetails(out, *n.Left); 82 | } 83 | 84 | out << &n << ' ' << n.Type << ' ' << n.Left << ' ' << n.Right << ' ' 85 | << n.Val << ' ' << n.Min << ' ' << n.Max << '\n'; 86 | } 87 | 88 | -------------------------------------------------------------------------------- /re_gen/node.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "basic.h" 4 | 5 | #include <iostream> 6 | #include <limits> 7 | 8 | static const uint32 UNBOUNDED = std::numeric_limits<uint32>::max(); 9 | 10 | struct Node { 11 | enum NodeType { 12 | REGEXP, 13 | ALTERNATION, 14 | CONCATENATION, 15 | REPETITION, 16 | REPETITION_NG, 17 | ELEMENT, 18 | DOT, 19 | CHAR_CLASS, 20 | LITERAL, 21 | IGNORE 22 | }; 23 | 24 | NodeType Type; 25 | Node *Left, 26 | *Right; 27 | int Val; 28 | uint32 Min, 29 | Max; 30 | ByteSet Bits; 31 | 32 | Node(): Type(LITERAL), Left(0), Right(0), Val(0), Min(0), Max(0) 33 | { 34 | Bits.reset(); 35 | } 36 | 37 | Node(NodeType t, unsigned int v): 38 | Type(t), Left(0), Right(0), Val(v), Min(0), Max(0) 39 | { 40 | Bits.reset(); 41 | Bits.set(v); 42 | } 43 | 44 | Node(NodeType t, Node* l): 45 | Type(t), Left(l), Right(0), Val(0), Min(0), Max(0) {} 46 | 47 | Node(NodeType t, Node* l, Node* r): 48 | Type(t), Left(l), Right(r), Val(0), Min(0), Max(0) {} 49 | 50 | Node(NodeType t, Node* l, uint32 min, uint32 max): 51 | Type(t), Left(l), Right(0), Val(0), Min(min), Max(max) {} 52 | 53 | Node(NodeType t, unsigned int first, unsigned int last): 54 | Type(t), Left(0), Right(0), Val(0), Min(0), Max(0) 55 | { 56 | Bits.reset(); 57 | range(first, last); 58 | } 59 | 60 | explicit Node(NodeType t, const ByteSet& b): 61 | Type(t), Left(0), Right(0), Val(0), Min(0), Max(0), Bits(b) {} 62 | 
63 | void range(byte first, byte last) { 64 | for (uint32 i = first; i <= last; ++i) { 65 | Bits.set(i); 66 | } 67 | } 68 | }; 69 | 70 | std::ostream& operator<<(std::ostream& out, const Node& n); 71 | 72 | void printTree(std::ostream& out, const Node& n); 73 | void printTreeDetails(std::ostream& out, const Node& n); 74 | void repetition(std::ostream& out, uint32 min, uint32 max); 75 | 76 | -------------------------------------------------------------------------------- /re_gen/parsecheck.cpp: -------------------------------------------------------------------------------- 1 | #include <pcre.h> 2 | 3 | #include <iostream> 4 | #include <string> 5 | 6 | #include <boost/lexical_cast.hpp> 7 | 8 | void parse(const char* pattern) { 9 | const char* error_str; 10 | int error_off; 11 | 12 | pcre* re = pcre_compile( 13 | pattern, 14 | PCRE_DOTALL | PCRE_NO_AUTO_CAPTURE, 15 | &error_str, 16 | &error_off, 17 | nullptr 18 | ); 19 | 20 | if (!re) { 21 | std::cout << pattern << ' ' << error_str << std::endl; 22 | } 23 | 24 | free(re); 25 | } 26 | 27 | /* 28 | Attempt to parse with PCRE all ASCII strings <= the given length. 
29 | */ 30 | 31 | int main(int argc, char** argv) { 32 | const uint32_t maxlen = boost::lexical_cast<uint32_t>(argv[1]); 33 | 34 | std::string pat = " "; // 0x20 35 | 36 | do { 37 | parse(pat.c_str()); 38 | 39 | // increment 40 | for (std::string::iterator i(pat.begin()); ++*i == 0x7F; ) { 41 | *i = 0x20; 42 | ++i; 43 | if (i == pat.end()) { 44 | pat += ' '; 45 | break; 46 | } 47 | } 48 | } while (pat.length() <= maxlen); 49 | 50 | return 0; 51 | } 52 | -------------------------------------------------------------------------------- /re_gen/parsetree.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "basic.h" 4 | #include "node.h" 5 | 6 | #include <vector> 7 | 8 | class ParseTree { 9 | public: 10 | Node* Root; 11 | 12 | Node* add(const Node& n) { 13 | Store.push_back(n); 14 | return &Store[Store.size()-1]; 15 | } 16 | 17 | void init(uint32 len) { 18 | // 19 | // Sizing explanation: 20 | // 21 | // * Every character in a pattern contributes at most one node to 22 | // the parse tree. Some characters, such as parentheses, the square 23 | // brackets for character classes, and the nongreedy marker '?' 24 | // contribute none. 25 | // 26 | // * Concatenation is implicit in patterns. Each intercharacter 27 | // position potentially contributes one node to the parse tree. 28 | // 29 | // * The root is one node in the parse tree. 30 | // 31 | // The worst case is a pattern made up of n literals, which will 32 | // generate n nodes for the literals, n-1 nodes for the concatenations, 33 | // and one node for the root. n + n - 1 + 1 = 2n. 34 | // 35 | // Therefore, sizing the vector to twice the length of the pattern 36 | // ensures that the vector will never resize on us and invalidate our 37 | // Node pointers. 
def main():
  """Convert pattern sets from stdin into binary "long" test records.

  Each stdin line is a tab-separated sequence of (pattern, fixed, case,
  encoding) fields. The text to search is sys.argv[1] when exactly one
  argument is given; otherwise it is the last field of each line.

  For every set, shitgrep is run to obtain the expected matches and a record
  is written to stdout:

    uint32 pattern count
    per pattern: uint32 length, pattern, byte fixed, byte case,
                 uint32 encoding length, encoding
    uint32 text length, text
    uint32 match count, then three uint64 per match

  Sets for which run_shitgrep() returns None are skipped. A running count of
  written sets is reported on stderr. Returns 0.
  """
  # abspath() is needed because dirname(__file__) is '' when the script is
  # started from its own directory, which would yield the path '/shitgrep'.
  here = os.path.dirname(os.path.abspath(__file__))
  sg = os.path.join(here, 'shitgrep')

  # compile the output structs
  bstruct = struct.Struct('B')
  lstruct = struct.Struct('=L')
  mstruct = struct.Struct('=QQQ')

  setnum = 0

  for line in sys.stdin:
    # read the patterns
    parts = line.rstrip('\n').split('\t')

    # get the text from the command line if specified
    if len(sys.argv) == 2:
      text = sys.argv[1]
    else:
      text = parts[-1]
      parts = parts[0:-1]

    # slice up the parts: fields repeat in groups of four
    pats = parts[0::4]
    fixeds = parts[1::4]
    cases = parts[2::4]
    encodings = parts[3::4]

    # get matches from shitgrep
    matches = lgtestlib.run_shitgrep(sg, pats, text)

    if matches is None:
      # skip pattern sets where every pattern has zero-length matches
      continue

    # write out patterns and their matches
    sys.stdout.write(lstruct.pack(len(pats)))
    for (pat, fix, case, enc) in zip(pats, fixeds, cases, encodings):
      sys.stdout.write(lstruct.pack(len(pat)))
      sys.stdout.write(pat)
      sys.stdout.write(bstruct.pack(int(fix)))
      sys.stdout.write(bstruct.pack(int(case)))
      sys.stdout.write(lstruct.pack(len(enc)))
      sys.stdout.write(enc)
    sys.stdout.write(lstruct.pack(len(text)))
    sys.stdout.write(text)
    sys.stdout.write(lstruct.pack(len(matches)))

    for m in matches:
      sys.stdout.write(mstruct.pack(m[0], m[1], m[2]))

    # show progress
    setnum += 1
    if not setnum % 100:
      sys.stderr.write('%d\n' % setnum)

  sys.stderr.write('%d\n' % setnum)
  return 0
def main():
  """Generate C++ "starts with" unit tests from pattern sets on stdin.

  Each stdin line is a tab-separated pattern set. The text to search is
  sys.argv[1] when one argument is given; otherwise it is the last field of
  each line. Every pattern is anchored as ^(pattern) before being handed to
  shitgrep, whose matches become the expected hits of the generated test.

  Returns 0.

  Raises:
    Exception: if more than one command-line argument is given.
  """
  # abspath() is needed because dirname(__file__) is '' when the script is
  # started from its own directory, which would yield the path '/shitgrep'.
  here = os.path.dirname(os.path.abspath(__file__))
  sg = os.path.join(here, 'shitgrep')

  # get text from command line?
  if len(sys.argv) < 2:
    cl_text = False
  elif len(sys.argv) == 2:
    text = sys.argv[1]
    cl_text = True
  else:
    raise Exception('too many arguments')

  # print head stuff
  print('''#include <scope/test.h>

#include "stest.h"
''')

  setnum = 0

  for line in sys.stdin:
    # read the test set
    parts = line.rstrip('\n').split('\t')

    if cl_text:
      pats = parts
    else:
      pats = parts[0:-1]
      text = parts[-1]

    # anchor every pattern at the start of the text
    apats = ['^(' + p + ')' for p in pats]

    # get matches from shitgrep
    matches = lgtestlib.run_shitgrep(sg, apats, text)

    if len(pats) == 1:
      stest = 'R"({})"'.format(pats[0])
    else:
      stest = '{{ R"({})" }}'.format(')", R"('.join(pats))

    print('''SCOPE_FIXTURE_CTOR(autoPatternStartsWithTest{setnum}, STest, STest({stest})) {{'''.format(setnum=setnum, stest=stest))

    if matches is None:
      # every pattern in this set has zero-length matches
      print('  SCOPE_ASSERT(fixture.parsesButNotValid());')
    else:
      # this pattern set has no zero-length matches
      #
      # The text goes into a raw string literal, as the patterns do: the end
      # pointer is computed from len(text), which is only right if the C++
      # compiler performs no escape processing on the literal.
      print('''  const char text[] = R"({text})";
  fixture.startsWith(text, text + {textlen}, 0);
  SCOPE_ASSERT_EQUAL({matchcount}u, fixture.Hits.size());'''.format(text=text, textlen=len(text), matchcount=len(matches)))

      for i, m in enumerate(matches):
        print('  SCOPE_ASSERT_EQUAL(SearchHit({}, {}, {}), fixture.Hits[{}]);'.format(m[0], m[1], m[2], i))

    print('}\n')

    setnum += 1

  return 0
21 | if len(sys.argv) < 2: 22 | cl_text = False 23 | elif len(sys.argv) == 2: 24 | text = sys.argv[1] 25 | cl_text = True 26 | else: 27 | raise Exception('too many arguments') 28 | 29 | # print head stuff 30 | print '''#include <scope/test.h> 31 | 32 | #include "stest.h" 33 | ''' 34 | 35 | setnum = 0 36 | 37 | for line in sys.stdin: 38 | # read the test set 39 | parts = line.rstrip('\n').split('\t') 40 | 41 | if cl_text: 42 | pats = parts 43 | else: 44 | pats = parts[0:-1] 45 | text = parts[-1] 46 | 47 | # get matches from shitgrep 48 | matches = lgtestlib.run_shitgrep(sg, pats, text) 49 | 50 | if len(pats) == 1: 51 | stest = 'R"({})"'.format(pats[0]) 52 | else: 53 | stest = '{{ R"({})" }}'.format(')", R"('.join(pats)) 54 | 55 | print '''SCOPE_FIXTURE_CTOR(autoPatternTest{setnum}, STest, STest({stest})) {{'''.format(setnum=setnum, stest=stest) 56 | 57 | if matches is None: 58 | # every pattern in this set has zero-length matches 59 | print ' SCOPE_ASSERT(fixture.parsesButNotValid());' 60 | else: 61 | # this pattern set has no zero-length matches 62 | print ''' const char text[] = R"({text})"; 63 | fixture.search(text, text + {textlen}, 0); 64 | SCOPE_ASSERT_EQUAL({matchcount}u, fixture.Hits.size());'''.format(text=text, textlen=len(text), matchcount=len(matches)) 65 | 66 | for i, m in enumerate(matches): 67 | print ' SCOPE_ASSERT_EQUAL(SearchHit({}, {}, {}), fixture.Hits[{}]);'.format(m[0], m[1], m[2], i) 68 | 69 | print '}\n' 70 | 71 | setnum += 1 72 | 73 | return 0 74 | 75 | 76 | if __name__ == "__main__": 77 | sys.exit(main()) 78 | 79 | -------------------------------------------------------------------------------- /re_gen/patsample.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # 4 | # Expects patterns on stdin, one per line, and sample count n and size k as 5 | # arguments. 
Prints n lines of k tab-separated patterns, chosen uniformly 6 | # without replacement within each line from the patterns given on stdin. 7 | # 8 | 9 | import random 10 | import sys 11 | 12 | def main(): 13 | pats = map(lambda s: s.rstrip('\n'), sys.stdin.readlines()) 14 | scount = int(sys.argv[1]) 15 | ssize = int(sys.argv[2]) 16 | 17 | for x in range(0, scount): 18 | print '\t'.join(random.sample(pats, ssize)) 19 | 20 | return 0 21 | 22 | 23 | if __name__ == "__main__": 24 | sys.exit(main()) 25 | 26 | -------------------------------------------------------------------------------- /re_gen/testregex/forcedassoc.dat: -------------------------------------------------------------------------------- 1 | NOTE left-assoc:pass-all right-assoc:pass-all : 2002-04-29 2 | 3 | E (a|ab)(c|bcd) abcd (0,4)(0,1)(1,4) 4 | E (a|ab)(bcd|c) abcd (0,4)(0,1)(1,4) 5 | E (ab|a)(c|bcd) abcd (0,4)(0,1)(1,4) 6 | E (ab|a)(bcd|c) abcd (0,4)(0,1)(1,4) 7 | E ((a|ab)(c|bcd))(d*) abcd (0,4)(0,4)(0,1)(1,4)(4,4) 8 | E ((a|ab)(bcd|c))(d*) abcd (0,4)(0,4)(0,1)(1,4)(4,4) 9 | E ((ab|a)(c|bcd))(d*) abcd (0,4)(0,4)(0,1)(1,4)(4,4) 10 | E ((ab|a)(bcd|c))(d*) abcd (0,4)(0,4)(0,1)(1,4)(4,4) 11 | E (a|ab)((c|bcd)(d*)) abcd (0,4)(0,2)(2,4)(2,3)(3,4) 12 | E (a|ab)((bcd|c)(d*)) abcd (0,4)(0,2)(2,4)(2,3)(3,4) 13 | E (ab|a)((c|bcd)(d*)) abcd (0,4)(0,2)(2,4)(2,3)(3,4) 14 | E (ab|a)((bcd|c)(d*)) abcd (0,4)(0,2)(2,4)(2,3)(3,4) 15 | E (a*)(b|abc) abc (0,3)(0,0)(0,3) 16 | E (a*)(abc|b) abc (0,3)(0,0)(0,3) 17 | E ((a*)(b|abc))(c*) abc (0,3)(0,3)(0,0)(0,3)(3,3) 18 | E ((a*)(abc|b))(c*) abc (0,3)(0,3)(0,0)(0,3)(3,3) 19 | E (a*)((b|abc)(c*)) abc (0,3)(0,1)(1,3)(1,2)(2,3) 20 | E (a*)((abc|b)(c*)) abc (0,3)(0,1)(1,3)(1,2)(2,3) 21 | E (a*)(b|abc) abc (0,3)(0,0)(0,3) 22 | E (a*)(abc|b) abc (0,3)(0,0)(0,3) 23 | E ((a*)(b|abc))(c*) abc (0,3)(0,3)(0,0)(0,3)(3,3) 24 | E ((a*)(abc|b))(c*) abc (0,3)(0,3)(0,0)(0,3)(3,3) 25 | E (a*)((b|abc)(c*)) abc (0,3)(0,1)(1,3)(1,2)(2,3) 26 | E (a*)((abc|b)(c*)) abc (0,3)(0,1)(1,3)(1,2)(2,3) 27 | 
E (a|ab) ab (0,2)(0,2) 28 | E (ab|a) ab (0,2)(0,2) 29 | E (a|ab)(b*) ab (0,2)(0,2)(2,2) 30 | E (ab|a)(b*) ab (0,2)(0,2)(2,2) 31 | -------------------------------------------------------------------------------- /re_gen/testregex/leftassoc.dat: -------------------------------------------------------------------------------- 1 | NOTE left-assoc:pass-all right-assoc:pass-none : 2002-04-29 2 | 3 | E (a|ab)(c|bcd)(d*) abcd (0,4)(0,1)(1,4)(4,4) 4 | E (a|ab)(bcd|c)(d*) abcd (0,4)(0,1)(1,4)(4,4) 5 | E (ab|a)(c|bcd)(d*) abcd (0,4)(0,1)(1,4)(4,4) 6 | E (ab|a)(bcd|c)(d*) abcd (0,4)(0,1)(1,4)(4,4) 7 | 8 | E (a*)(b|abc)(c*) abc (0,3)(0,0)(0,3)(3,3) 9 | E (a*)(abc|b)(c*) abc (0,3)(0,0)(0,3)(3,3) 10 | E (a*)(b|abc)(c*) abc (0,3)(0,0)(0,3)(3,3) 11 | E (a*)(abc|b)(c*) abc (0,3)(0,0)(0,3)(3,3) 12 | 13 | E (a|ab)(c|bcd)(d|.*) abcd (0,4)(0,1)(1,4)(4,4) 14 | E (a|ab)(bcd|c)(d|.*) abcd (0,4)(0,1)(1,4)(4,4) 15 | E (ab|a)(c|bcd)(d|.*) abcd (0,4)(0,1)(1,4)(4,4) 16 | E (ab|a)(bcd|c)(d|.*) abcd (0,4)(0,1)(1,4)(4,4) 17 | -------------------------------------------------------------------------------- /re_gen/testregex/nullsubexpr.dat: -------------------------------------------------------------------------------- 1 | NOTE null subexpression matches : 2002-06-06 2 | 3 | E (a*)* a (0,1)(0,1) 4 | E SAME x (0,0)(0,0) 5 | E SAME aaaaaa (0,6)(0,6) 6 | E SAME aaaaaax (0,6)(0,6) 7 | E (a*)+ a (0,1)(0,1) 8 | E SAME x (0,0)(0,0) 9 | E SAME aaaaaa (0,6)(0,6) 10 | E SAME aaaaaax (0,6)(0,6) 11 | E (a+)* a (0,1)(0,1) 12 | E SAME x (0,0) 13 | E SAME aaaaaa (0,6)(0,6) 14 | E SAME aaaaaax (0,6)(0,6) 15 | E (a+)+ a (0,1)(0,1) 16 | E SAME x NOMATCH 17 | E SAME aaaaaa (0,6)(0,6) 18 | E SAME aaaaaax (0,6)(0,6) 19 | 20 | E ([a]*)* a (0,1)(0,1) 21 | E SAME x (0,0)(0,0) 22 | E SAME aaaaaa (0,6)(0,6) 23 | E SAME aaaaaax (0,6)(0,6) 24 | E ([a]*)+ a (0,1)(0,1) 25 | E SAME x (0,0)(0,0) 26 | E SAME aaaaaa (0,6)(0,6) 27 | E SAME aaaaaax (0,6)(0,6) 28 | E ([^b]*)* a (0,1)(0,1) 29 | E SAME b (0,0)(0,0) 30 | E SAME aaaaaa 
(0,6)(0,6) 31 | E SAME aaaaaab (0,6)(0,6) 32 | E ([ab]*)* a (0,1)(0,1) 33 | E SAME aaaaaa (0,6)(0,6) 34 | E SAME ababab (0,6)(0,6) 35 | E SAME bababa (0,6)(0,6) 36 | E SAME b (0,1)(0,1) 37 | E SAME bbbbbb (0,6)(0,6) 38 | E SAME aaaabcde (0,5)(0,5) 39 | E ([^a]*)* b (0,1)(0,1) 40 | E SAME bbbbbb (0,6)(0,6) 41 | E SAME aaaaaa (0,0)(0,0) 42 | E ([^ab]*)* ccccxx (0,6)(0,6) 43 | E SAME ababab (0,0)(0,0) 44 | 45 | E ((z)+|a)* zabcde (0,2)(1,2) 46 | 47 | {E a+? aaaaaa (0,1) no *? +? mimimal match ops 48 | E (a) aaa (0,1)(0,1) 49 | E (a*?) aaa (0,0)(0,0) 50 | E (a)*? aaa (0,0) 51 | E (a*?)*? aaa (0,0) 52 | } 53 | 54 | B \(a*\)*\(x\) x (0,1)(0,0)(0,1) 55 | B \(a*\)*\(x\) ax (0,2)(0,1)(1,2) 56 | B \(a*\)*\(x\) axa (0,2)(0,1)(1,2) 57 | B \(a*\)*\(x\)\(\1\) x (0,1)(0,0)(0,1)(1,1) 58 | B \(a*\)*\(x\)\(\1\) ax (0,2)(1,1)(1,2)(2,2) 59 | B \(a*\)*\(x\)\(\1\) axa (0,3)(0,1)(1,2)(2,3) 60 | B \(a*\)*\(x\)\(\1\)\(x\) axax (0,4)(0,1)(1,2)(2,3)(3,4) 61 | B \(a*\)*\(x\)\(\1\)\(x\) axxa (0,3)(1,1)(1,2)(2,2)(2,3) 62 | 63 | E (a*)*(x) x (0,1)(0,0)(0,1) 64 | E (a*)*(x) ax (0,2)(0,1)(1,2) 65 | E (a*)*(x) axa (0,2)(0,1)(1,2) 66 | 67 | E (a*)+(x) x (0,1)(0,0)(0,1) 68 | E (a*)+(x) ax (0,2)(0,1)(1,2) 69 | E (a*)+(x) axa (0,2)(0,1)(1,2) 70 | 71 | E (a*){2}(x) x (0,1)(0,0)(0,1) 72 | E (a*){2}(x) ax (0,2)(1,1)(1,2) 73 | E (a*){2}(x) axa (0,2)(1,1)(1,2) 74 | -------------------------------------------------------------------------------- /re_gen/testregex/rightassoc.dat: -------------------------------------------------------------------------------- 1 | NOTE left-assoc:pass-none right-assoc:pass-all : 2002-04-29 2 | 3 | E (a|ab)(c|bcd)(d*) abcd (0,4)(0,2)(2,3)(3,4) 4 | E (a|ab)(bcd|c)(d*) abcd (0,4)(0,2)(2,3)(3,4) 5 | E (ab|a)(c|bcd)(d*) abcd (0,4)(0,2)(2,3)(3,4) 6 | E (ab|a)(bcd|c)(d*) abcd (0,4)(0,2)(2,3)(3,4) 7 | 8 | E (a*)(b|abc)(c*) abc (0,3)(0,1)(1,2)(2,3) 9 | E (a*)(abc|b)(c*) abc (0,3)(0,1)(1,2)(2,3) 10 | E (a*)(b|abc)(c*) abc (0,3)(0,1)(1,2)(2,3) 11 | E (a*)(abc|b)(c*) abc 
(0,3)(0,1)(1,2)(2,3) 12 | 13 | E (a|ab)(c|bcd)(d|.*) abcd (0,4)(0,2)(2,3)(3,4) 14 | E (a|ab)(bcd|c)(d|.*) abcd (0,4)(0,2)(2,3)(3,4) 15 | E (ab|a)(c|bcd)(d|.*) abcd (0,4)(0,2)(2,3)(3,4) 16 | E (ab|a)(bcd|c)(d|.*) abcd (0,4)(0,2)(2,3)(3,4) 17 | -------------------------------------------------------------------------------- /re_gen/tests/dot-star: -------------------------------------------------------------------------------- 1 | 00000 -------------------------------------------------------------------------------- /re_gen/tests/dot-star_patterns: -------------------------------------------------------------------------------- 1 | .* 2 | 0.* 3 | 0.*0 4 | -------------------------------------------------------------------------------- /re_gen/unparser.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "basic.h" 4 | #include "parsetree.h" 5 | 6 | #include <string> 7 | 8 | std::string byteToCharacterString(uint32 i); 9 | std::string byteSetToCharacterClass(const ByteSet& bs); 10 | 11 | std::string unparse(const ParseTree& tree); 12 | 13 | -------------------------------------------------------------------------------- /re_gen/utf8-gen.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import re 4 | import sys 5 | 6 | def main(): 7 | for cp in xrange(0x01, 0x09): 8 | print_test(cp) 9 | 10 | # skip \t, we use it as a separator between patterns 11 | # and the text so it can't appear in the text 12 | 13 | for cp in xrange(0x10,0x110000): 14 | print_test(cp) 15 | 16 | def print_test(cp): 17 | bytes = unichr(cp).encode('UTF-8') 18 | # pat = re.escape(bytes) 19 | pat = escape(bytes) 20 | text = '%s%s' % ((bytes, 'x') if cp % 2 else ('x', bytes)) 21 | print('%s\t0\t0\tUTF-8\t%s' % (pat, text)) 22 | 23 | def escape(pat): 24 | return re.sub('([.*+?[\\\\|()])', '\\\\\g<1>', pat) 25 | 26 | if __name__ == "__main__": 27 | sys.exit(main()) 28 | 
-------------------------------------------------------------------------------- /re_gen/utf8-singles.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/re_gen/utf8-singles.bz2 -------------------------------------------------------------------------------- /re_gen/valid.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/re_gen/valid.bz2 -------------------------------------------------------------------------------- /src/lib/ascii.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see <http://www.gnu.org/licenses/>. 
17 | */ 18 | 19 | #include "encoders/ascii.h" 20 | 21 | uint32_t ASCII::write(int32_t cp, byte buf[]) const { // encode: stores cp in buf[0] and returns 1 for 0 <= cp < 0x80, otherwise writes nothing and returns 0 22 | if (cp < 0) { 23 | return 0; 24 | } 25 | else if (cp < 0x80) { 26 | buf[0] = cp; 27 | return 1; 28 | } 29 | else { 30 | return 0; 31 | } 32 | } 33 | 34 | void ASCII::write(const UnicodeSet& uset, std::vector<std::vector<ByteSet>>& v) const { // appends one single-ByteSet entry to v holding the part of uset below 0x80 35 | v.emplace_back(1); 36 | for (const UnicodeSet::range& r : uset) { 37 | if (r.first > 0x7F) { 38 | break; 39 | } 40 | else if (r.second > 0x7F) { 41 | v.back()[0].set(r.first, 0x80, true); // fill the entry just appended (not v[0], which is only the same when v was empty); range clipped at 0x80 42 | break; 43 | } 44 | else { 45 | v.back()[0].set(r.first, r.second, true); 46 | } 47 | } 48 | } 49 | 50 | uint32_t ASCII::write(const byte buf[], int32_t& cp) const { // decode: cp = buf[0] and returns 1 when buf[0] < 0x80, otherwise cp = -1 and returns 0 51 | if (buf[0] < 0x80) { 52 | cp = buf[0]; 53 | return 1; 54 | } 55 | else { 56 | cp = -1; 57 | return 0; 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/lib/automata.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see <http://www.gnu.org/licenses/>. 
17 | */ 18 | 19 | #include "automata.h" 20 | 21 | #include <limits> 22 | 23 | const uint32_t Glushkov::NOLABEL = std::numeric_limits<uint32_t>::max(); /* sentinel Label value; label() omits the "/<Label>" suffix when Label equals it */ 24 | 25 | std::string Glushkov::label() const { /* returns the transition's label, followed by "/" and Label unless Label is NOLABEL; returns an empty string when Trans is null */ 26 | std::ostringstream buf; 27 | if (Trans) { 28 | buf << Trans->label(); 29 | if (Label != NOLABEL) { 30 | buf << "/" << Label; 31 | } 32 | } 33 | return buf.str(); 34 | } 35 | -------------------------------------------------------------------------------- /src/lib/byteencoder.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see <http://www.gnu.org/licenses/>. 
17 | */ 18 | 19 | #include <memory> 20 | #include <sstream> 21 | 22 | #include "encoders/byteencoder.h" 23 | 24 | uint32_t ByteEncoder::maxByteLength() const { 25 | return BaseEnc->maxByteLength(); 26 | } 27 | 28 | std::string ByteEncoder::name() const { 29 | std::ostringstream ss; 30 | ss << BaseEnc->name() << '|' << Name; 31 | return ss.str(); 32 | } 33 | 34 | const UnicodeSet& ByteEncoder::validCodePoints() const { 35 | return BaseEnc->validCodePoints(); 36 | } 37 | 38 | uint32_t ByteEncoder::write(int32_t cp, byte buf[]) const { 39 | const uint32_t ret = BaseEnc->write(cp, buf); 40 | byteTransform(buf, ret); 41 | return ret; 42 | } 43 | 44 | uint32_t ByteEncoder::write(const byte buf[], int32_t& cp) const { 45 | // Decode: buf is const, so undo the byte transform on a private copy 46 | // of the input and let the base encoder decode the restored bytes. 47 | // Previously the scratch buffer was never filled from buf, so the 48 | // result was computed from uninitialised memory. 49 | // NOTE(review): assumes buf has at least maxByteLength() readable 50 | // bytes -- confirm against callers. 51 | const uint32_t len = maxByteLength(); 52 | std::unique_ptr<byte[]> tmp(new byte[len]); 53 | for (uint32_t i = 0; i < len; ++i) { 54 | tmp[i] = buf[i]; 55 | } 56 | byteUntransform(tmp.get(), len); 57 | return BaseEnc->write(tmp.get(), cp); 58 | } 59 | -------------------------------------------------------------------------------- /src/lib/byteset.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see <http://www.gnu.org/licenses/>. 
17 | */ 18 | 19 | #include "byteset.h" 20 | 21 | #include <iomanip> 22 | 23 | std::ostream& operator<<(std::ostream& out, const ByteSet& bs) { 24 | out << '[' << std::hex; 25 | 26 | int low = -1; 27 | bool first = true; 28 | for (int i = 0; i < 256; ++i) { 29 | if (bs.test(i)) { 30 | if (low < 0) { 31 | if (!first) { 32 | out << ','; 33 | } 34 | out << std::setfill('0') << std::setw(2) << i; 35 | low = i; 36 | first = false; 37 | } 38 | } 39 | else if (low >= 0) { 40 | if ((i-1) > low) { 41 | out << '-' << std::setfill('0') << std::setw(2) << (i-1); 42 | } 43 | low = -1; 44 | } 45 | } 46 | 47 | if (0 <= low && low < 255) { 48 | out << '-' << std::setfill('0') << std::setw(2) << 255; 49 | } 50 | 51 | out << ']' << std::dec; 52 | return out; 53 | } 54 | -------------------------------------------------------------------------------- /src/lib/c_api_util.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see <http://www.gnu.org/licenses/>. 17 | */ 18 | 19 | #include <cstring> 20 | #include <new> 21 | 22 | #include "c_api_util.h" 23 | 24 | namespace { 25 | // Our own strdup, using new. 
26 | char* dup(const char* s) { 27 | return std::strcpy(new char[std::strlen(s)+1], s); 28 | } 29 | } 30 | 31 | LG_Error* makeError( 32 | const char* msg, 33 | const char* pattern, 34 | const char* encodingChain, 35 | const char* source, 36 | int index 37 | ) { 38 | // Builds an LG_Error owning copies of the given strings (msg must not 39 | // be null; the others may be). Returns nullptr on failure. The copies 40 | // are held in locals until the LG_Error exists, so that a failure 41 | // part-way through does not leak the strings already copied. 42 | char* msgCopy = nullptr; 43 | char* patCopy = nullptr; 44 | char* chainCopy = nullptr; 45 | char* srcCopy = nullptr; 46 | 47 | try { 48 | msgCopy = dup(msg); // don't make messageless errors 49 | patCopy = pattern ? dup(pattern) : nullptr; 50 | chainCopy = encodingChain ? dup(encodingChain) : nullptr; 51 | srcCopy = source ? dup(source) : nullptr; 52 | return new LG_Error{msgCopy, patCopy, chainCopy, srcCopy, index, nullptr}; 53 | } 54 | catch (const std::bad_alloc&) { 55 | // Insufficient memory to copy one of the strings. Everything is hosed. 56 | } 57 | catch (...) { 58 | // Should be impossible. 59 | } 60 | 61 | // Only reached on failure: release whatever was copied (delete[] on a 62 | // null pointer is a no-op). 63 | delete[] msgCopy; 64 | delete[] patCopy; 65 | delete[] chainCopy; 66 | delete[] srcCopy; 67 | return nullptr; 68 | } 69 | -------------------------------------------------------------------------------- /src/lib/charencoder.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see <http://www.gnu.org/licenses/>. 
17 | */ 18 | 19 | #include <sstream> 20 | 21 | #include "encoders/charencoder.h" 22 | 23 | std::string CharEncoder::name() const { /* returns "<Name>|<base encoder name>" */ 24 | std::ostringstream ss; 25 | ss << Name << '|' << BaseEnc->name(); 26 | return ss.str(); 27 | } 28 | 29 | uint32_t CharEncoder::write(int32_t cp, byte buf[]) const { /* encode: applies charTransform to cp, then delegates to the base encoder and returns its result */ 30 | return BaseEnc->write(charTransform(cp), buf); 31 | } 32 | 33 | uint32_t CharEncoder::write(const byte buf[], int32_t& cp) const { /* decode: base encoder decodes buf into cp, then charUntransform is applied to cp; returns the base encoder's result */ 34 | const uint32_t ret = BaseEnc->write(buf, cp); 35 | cp = charUntransform(cp); /* NOTE(review): applied even if the base decode failed -- confirm charUntransform tolerates the failure value */ 36 | return ret; 37 | } 38 | -------------------------------------------------------------------------------- /src/lib/decoders/decoder.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see <http://www.gnu.org/licenses/>. 
17 | */ 18 | 19 | #include "lightgrep/util.h" 20 | #include "decoders/decoder.h" 21 | 22 | const int32_t Decoder::END = LG_WINDOW_END; 23 | -------------------------------------------------------------------------------- /src/lib/fsmthingy.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see <http://www.gnu.org/licenses/>. 
17 | */ 18 | 19 | #include "fsmthingy.h" 20 | #include "encoders/encoder.h" 21 | 22 | #include <memory> 23 | #include <string> 24 | #include <vector> 25 | 26 | FSMThingy::FSMThingy(uint32_t sizeHint): Fsm(new NFA(1, sizeHint)) { /* creates the combined NFA and makes it share the builder's transition factory */ 27 | Fsm->TransFac = Nfab.getTransFac(); 28 | } 29 | 30 | void FSMThingy::addPattern(const ParseTree& tree, const char* chain, uint32_t label) { /* builds an NFA for one parsed pattern under the given encoding chain and label and merges it into Fsm; raises via THROW_RUNTIME_ERROR_WITH_CLEAN_OUTPUT when the builder rejects the tree */ 31 | // prepare the NFA builder 32 | Nfab.reset(); 33 | Nfab.setCurLabel(label); 34 | 35 | // set the character encoding 36 | Nfab.setEncoder(EncFac.get(chain)); 37 | 38 | // build the NFA for this pattern 39 | if (Nfab.build(tree)) { 40 | // and merge it into the greater NFA 41 | Comp.pruneBranches(*Nfab.getFsm()); 42 | Comp.mergeIntoFSM(*Fsm, *Nfab.getFsm()); 43 | } 44 | else { 45 | THROW_RUNTIME_ERROR_WITH_CLEAN_OUTPUT("Empty matches"); 46 | } 47 | } 48 | 49 | void FSMThingy::finalizeGraph(bool determinize) { /* throws std::runtime_error if Fsm has fewer than two vertices; optionally replaces Fsm with the result of subsetDFA; then labels guard states */ 50 | if (Fsm->verticesSize() < 2) { 51 | throw std::runtime_error("No valid patterns were parsed"); 52 | } 53 | 54 | if (determinize && !Fsm->Deterministic) { 55 | NFAPtr dfa(new NFA(1, 2 * Fsm->verticesSize(), Fsm->edgesSize())); /* NOTE(review): the size arguments look like capacity hints -- confirm against the NFA constructor */ 56 | dfa->TransFac = Fsm->TransFac; 57 | Comp.subsetDFA(*dfa, *Fsm); 58 | Fsm = dfa; 59 | } 60 | 61 | Comp.labelGuardStates(*Fsm); 62 | } 63 | -------------------------------------------------------------------------------- /src/lib/icuencoder.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 
9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see <http://www.gnu.org/licenses/>. 17 | */ 18 | 19 | #include "encoders/icuencoder.h" 20 | 21 | ICUEncoder::ICUEncoder(const std::string& name): /* constructs the converter from name and caches its valid code points in Valid */ 22 | EncoderBase(), 23 | Conv(name) 24 | { 25 | Valid = Conv.validCodePoints(); 26 | } 27 | 28 | uint32_t ICUEncoder::write(int32_t cp, byte buf[]) const { /* encode: delegates to the converter's cp_to_bytes and returns its result */ 29 | return Conv.cp_to_bytes(cp, buf); 30 | } 31 | 32 | uint32_t ICUEncoder::write(const byte [], int32_t&) const { /* decode: not implemented; ignores both arguments and always returns 0 */ 33 | // TODO: fill this in 34 | return 0; 35 | } 36 | -------------------------------------------------------------------------------- /src/lib/lightgrep.pc.in: -------------------------------------------------------------------------------- 1 | prefix=@prefix@ 2 | exec_prefix=@exec_prefix@ 3 | includedir=@includedir@ 4 | libdir=@libdir@ 5 | 6 | Name: lightgrep 7 | Description: Not the worst forensics regexp engine 8 | URL: https://github.com/LightboxTech/liblightgrep 9 | Version: @PACKAGE_VERSION@ 10 | 11 | Cflags: -I${includedir} 12 | Libs: -L${libdir} -llightgrep 13 | Requires.private: icu-uc 14 | -------------------------------------------------------------------------------- /src/lib/parser.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 
9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see <http://www.gnu.org/licenses/>. 17 | */ 18 | 19 | #include "basic.h" 20 | #include "parser.h" 21 | #include "rewriter.h" 22 | 23 | #include <string> 24 | 25 | namespace { 26 | bool containsPossibleNongreedy(const std::string& pattern) { 27 | // The trailing '?' of a nongreedy operator must have at least 28 | // two characters preceding it. 29 | return pattern.find('?', 2) != std::string::npos; 30 | } 31 | 32 | bool containsPossibleCountedRepetition(const std::string& pattern) { 33 | // The '{' of a counted repetition operator must have at least one 34 | // character preceding it and two characters following it, so a 35 | // pattern shorter than four characters cannot contain one. Checking 36 | // the length first also keeps length()-3 from wrapping around, which 37 | // would make rfind() search the whole string. 38 | if (pattern.length() < 4) { 39 | return false; 40 | } 41 | const std::string::size_type cr = pattern.rfind('{', pattern.length()-3); 42 | return cr > 0 && cr != std::string::npos; 43 | } 44 | } 45 | 46 | void parseAndReduce(const Pattern& pattern, ParseTree& tree) { 47 | // parse the pattern 48 | if (!parse(pattern, tree)) { 49 | THROW_RUNTIME_ERROR_WITH_CLEAN_OUTPUT("Could not parse"); 50 | } 51 | reduce(pattern.Expression, tree); 52 | } 53 | 54 | void reduce(const std::string& text, ParseTree& tree) { 55 | // rewrite the parse tree, if necessary 56 | bool rewrite = makeBinopsRightAssociative(tree.Root); 57 | rewrite |= combineConsecutiveRepetitions(tree.Root); 58 | 59 | if (containsPossibleNongreedy(text)) { 60 | rewrite |= reduceTrailingNongreedyThenEmpty(tree.Root); 61 | rewrite |= reduceTrailingNongreedyThenGreedy(tree.Root); 62 | } 63 | 64 | if (rewrite || containsPossibleCountedRepetition(text)) { 65 | reduceEmptySubtrees(tree.Root); 66 | reduceUselessRepetitions(tree.Root); 67 | } 68 | } 69 | 
-------------------------------------------------------------------------------- /src/lib/parsetree.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see <http://www.gnu.org/licenses/>. 17 | */ 18 | 19 | #include "parsetree.h" 20 | #include <ostream> 21 | 22 | void ParseTree::init(uint32_t len) { /* resets the tree: clears Root and Store, then reserves room for 2*len nodes */ 23 | Root = nullptr; 24 | Store.clear(); 25 | Store.reserve(2*len); 26 | } 27 | 28 | void printTree(std::ostream& out, const ParseNode& n) { /* prints the subtree rooted at n, one node per line, children before their parent and the right child before the left */ 29 | switch (n.Type) { 30 | case ParseNode::ALTERNATION: 31 | case ParseNode::CONCATENATION: 32 | if (n.Child.Right) { 33 | printTree(out, *n.Child.Right); 34 | } /* no break: binary nodes fall through so their left child is printed too */ 35 | case ParseNode::REGEXP: 36 | case ParseNode::REPETITION: 37 | case ParseNode::REPETITION_NG: 38 | if (n.Child.Left) { 39 | printTree(out, *n.Child.Left); 40 | } 41 | break; 42 | default: 43 | break; 44 | } 45 | 46 | out << n << '\n'; 47 | } 48 | 49 | std::ostream& operator<<(std::ostream& out, const ParseTree& tree) { /* prints the whole tree via printTree; writes nothing when Root is null */ 50 | if (tree.Root) { 51 | printTree(out, *tree.Root); 52 | } 53 | return out; 54 | } 55 | -------------------------------------------------------------------------------- /src/lib/pattern.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | 
liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see <http://www.gnu.org/licenses/>. 17 | */ 18 | 19 | #include "pattern.h" 20 | 21 | std::ostream& operator<<(std::ostream& out, const Pattern& p) { /* prints "<Expression>, fixed|grep, no case|case, <Encoding>" and returns out */ 22 | out << p.Expression << ", " 23 | << (p.FixedString ? "fixed": "grep") << ", " 24 | << (p.CaseInsensitive ? "no case": "case") << ", " 25 | << p.Encoding; 26 | 27 | return out; 28 | } 29 | -------------------------------------------------------------------------------- /src/lib/thread.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. 
// Out-of-class definitions of Thread's static constants.

// NOLABEL is the largest value representable in a uint32_t.
// NOTE(review): presumably a "no label assigned" sentinel -- confirm in thread.h.
const uint32_t Thread::NOLABEL = std::numeric_limits<uint32_t>::max();

// NONE is the largest value representable in a uint64_t.
// NOTE(review): presumably a "no offset" sentinel -- confirm in thread.h.
const uint64_t Thread::NONE = std::numeric_limits<uint64_t>::max();
// Emits byte-set sequences for the code points in [l, min(h, blimit)),
// advancing through the ranges of a UnicodeSet as each one is exhausted.
//
//   va      output; one vector of len ByteSets is appended per iteration
//   i/iend  current position in, and end of, the range sequence; i is advanced
//   l, h    bounds of the range currently being processed; both are updated
//   cur     scratch buffer that write() fills with the encoding of l
//   len     number of bytes per encoded code point handled by this call
//   blimit  upper bound on code points handled by this call
//
// NOTE(review): ranges look half-open ([first, second)) -- confirm against
// UnicodeSet. writeRangeBlock is assumed to advance l by reference, since l
// is re-tested right after the call -- confirm.
void UTFBase::writeRange(std::vector<std::vector<ByteSet>>& va, UnicodeSet::const_iterator& i, const UnicodeSet::const_iterator& iend, uint32_t& l, uint32_t& h, byte* cur, uint32_t len, uint32_t blimit) const {
  while (l < std::min(h, blimit)) {
    // write the encoding for the next code point
    write(l, cur);
    va.emplace_back(len);
    std::vector<ByteSet>& v = va.back();

    // seed each byte position with the byte just written for l
    for (uint32_t j = 0; j < len; ++j) {
      v[j].set(cur[j]);
    }

    // write the encoding for all code points with the same initial len bytes
    // (l is pre-incremented: the code point just written is already in v)
    writeRangeBlock(v, ++l, h, len, blimit);

    // figure out where to look for the next code point
    if (l < h) {
      // still inside the current range; stop only if we hit this call's limit
      if (l >= blimit) {
        return;
      }
    }
    else if (i == iend) {
      // current range exhausted and there are no more ranges
      return;
    }
    else {
      // current range exhausted; move on to the next one, if any
      ++i;
      if (i == iend) {
        return;
      }
      l = i->first;
      h = i->second;
    }
  }
}

// If l is below ubound, advances i to the first following range whose upper
// bound exceeds ubound and loads that range into l and h, raising l to ubound
// when the range straddles it. Does nothing when l >= ubound or i is already
// at iend. If no such range exists, i ends at iend and l, h hold the bounds of
// the last range visited.
void UTFBase::skipRange(UnicodeSet::const_iterator& i, const UnicodeSet::const_iterator& iend, uint32_t& l, uint32_t& h, uint32_t ubound) const {
  if (l < ubound) {
    if (i == iend) {
      return;
    }

    for (++i; i != iend; ++i) {
      l = i->first;
      h = i->second;
      if (h > ubound) {
        // range reaches past ubound; clip its start if it begins below it
        if (l < ubound) {
          l = ubound;
        }
        return;
      }
    }
  }
}
16 | #endif 17 | FILEOS VOS_NT_WINDOWS32 // ok for 64-bit, apparently there's nothing newer 18 | FILETYPE VFT_DLL 19 | FILESUBTYPE VFT2_UNKNOWN 20 | BEGIN 21 | BLOCK "StringFileInfo" 22 | BEGIN 23 | BLOCK "040904B0" // US English (0409) and Unicode (04B0) 24 | BEGIN 25 | VALUE "CompanyName", PACKAGE_COMPANY 26 | VALUE "FileDescription", PACKAGE_DESCRIPTION 27 | VALUE "FileVersion", PACKAGE_VERSION 28 | VALUE "InternalName", PACKAGE_NAME 29 | VALUE "LegalCopyright", "©" STR(PACKAGE_YEAR) " " PACKAGE_COMPANY 30 | VALUE "OriginalFilename", PACKAGE_NAME ".dll" 31 | VALUE "ProductName", PACKAGE_NAME 32 | VALUE "ProductVersion", PACKAGE_VERSION 33 | END 34 | END 35 | BLOCK "VarFileInfo" 36 | BEGIN 37 | // NB: charset is in decimal here, way to be consitent MS 38 | VALUE "Translation", 0x0409, 1200 // US English, Unicode 39 | END 40 | END 41 | -------------------------------------------------------------------------------- /test/data/hectotest: -------------------------------------------------------------------------------- 1 | a|a??.|.|a? a?|aa|a??a+? a*(a)|a*?|.+ a??a+.+?(a) a+(.).*?a. a??|.|.|aa|aa a+?|.?aa|a* a|a|.|.|.+.+ a?|.?|.??.|a a+a+?|.+|.*? a+?|.+..|.a a?a+|a?|.|. a|a|aa.*|a+? a*a?a??|.+ a+|a|.|.|(a) aa|.?.+?|.+? aa+?a*(a) a?|a|.a|a. (a)|(.)a|.a*? a*?|a*?|a*?|.|a a|.+?|a|a.+ a*.aa*?|.+ a??|(.).*.+? a+?|.?|.?a* a?.*..|a? a*?|.+(.) a*|aa|aa|.a a+?...a|.+ a|.(.)aa|.+? a+?|a|.|a|.a. a?.?|.|.|a* a|a.??(a.) a.|(a)|(.*?) a.aaa??a. a??.*.??a+? a.(.)(a|.) a+.?|.|.(.) a*?a??a*?|a? a*?a.|.?a? a+aa..|a+ a?|a+?|a|.(.) a|a.+|a*?a|a a+?|a|aa??|(.) a|.??a*|.a a??|.??..|.* a+?|a?.?a. aa(a).|a|. a|.|.+a|. aa.*.a.+ a*.+?|.*|.* a+?a?|a|a|.*? aaaa|a|.|a|. aaaa*.a a?|a+?|a|a|aa aa*|.*|.+? a*.+?(a)* a+?(a)|a.a? a.a?a+(a) a+|(a)..|. a+?|a*?a|.|a a+?|a|.|a|(a) (a)|(a)|.a*? a|aa?a|.|.*? a|a|(a).*a+? a?|..|a.*? a+a+?|.|.|a*? a?|.+|(.)+ a+..|.*|a*? a*?|.|a|a+?|.|. a*?a*?..|. a??|a*|a.+? a?a+.|.(a) a+.+?|a*?aa a?.*aa(.) a*?|a*|a.|a|a a?|(.).?|a+? a|.|.*|.aa*? a|a|.*|.?a?? 
a*?.*|aa|a a+a|a.|.|a|. a.a|..|(.) a|.aaa|.a+ a*?|a?..|(a) a*a|.|a*?|.*? a*?.+?.?(a) a|.(.).. a|..+|.*.a a+|.|a*?|.|a a|a(a).a|.*? a*?|a*.|.|.+ a?|.?.*?. a??a.|a?aa a|..|..|a|(a) a*?.|.|.?.|. a??a*|a+|a+ a..a|..|a?? a??|a.|.??|.|a a??a?a*.? a.|.*a*?|.|a a*|..|.|..*? 2 | -------------------------------------------------------------------------------- /test/data/hectotest.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/test/data/hectotest.dat -------------------------------------------------------------------------------- /test/data/kilotest.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strozfriedberg/liblightgrep/3eed03a5b6698a09f5d08e2b686d9d03d96fa4c2/test/data/kilotest.dat -------------------------------------------------------------------------------- /test/data_reader.cpp: -------------------------------------------------------------------------------- 1 | #include <istream> 2 | 3 | #include "data_reader.h" 4 | 5 | bool readTestData( 6 | std::istream& in, 7 | std::vector<Pattern>& patterns, 8 | std::string& text, 9 | std::vector<SearchHit>& expected) 10 | { 11 | uint32_t len, patcount; 12 | 13 | // read number of patterns 14 | in.read(reinterpret_cast<char*>(&patcount), sizeof(patcount)); 15 | if (!in) { 16 | return false; 17 | } 18 | patterns.reserve(patcount); 19 | 20 | for (uint32_t i = 0; i < patcount; ++i) { 21 | // read pattern 22 | in.read(reinterpret_cast<char*>(&len), sizeof(len)); 23 | if (!in) return false; 24 | std::string pattern(len, '\0'); 25 | in.read(&pattern[0], len); 26 | if (!in) return false; 27 | 28 | // read fixed 29 | bool fixed; 30 | in.read(reinterpret_cast<char*>(&fixed), 1); 31 | if (!in) return false; 32 | 33 | // read case-insensitive 34 | bool case_insensitive; 35 | in.read(reinterpret_cast<char*>(&case_insensitive), 
1); 36 | if (!in) return false; 37 | 38 | // read encoding 39 | in.read(reinterpret_cast<char*>(&len), sizeof(len)); 40 | if (!in) return false; 41 | std::string encoding(len, '\0'); 42 | in.read(&encoding[0], len); 43 | if (!in) return false; 44 | 45 | patterns.emplace_back(pattern, fixed, case_insensitive, encoding); 46 | } 47 | 48 | // read text 49 | in.read(reinterpret_cast<char*>(&len), sizeof(len)); 50 | if (!in) return false; 51 | text.assign(len, '\0'); 52 | in.read(&text[0], len); 53 | if (!in) return false; 54 | 55 | // read hits 56 | in.read(reinterpret_cast<char*>(&len), sizeof(len)); 57 | if (!in) return false; 58 | expected.resize(len); 59 | 60 | if (sizeof(SearchHit) == 24) { 61 | // data can be read in as-is, in one shot 62 | in.read(reinterpret_cast<char*>(&expected[0]), len*sizeof(SearchHit)); 63 | } 64 | else { 65 | // we have to do it the hard way, one at a time, skipping the padding 66 | char buf[24]; 67 | for (uint32_t i = 0; i < len; ++i) { 68 | in.read(buf, sizeof(buf)); 69 | expected[i] = *reinterpret_cast<const SearchHit*>(buf); 70 | } 71 | } 72 | 73 | if (!in) return false; 74 | 75 | return true; 76 | } 77 | -------------------------------------------------------------------------------- /test/data_reader.h: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 
// Reads one serialized test case from in: the patterns, the text to search,
// and the expected hits. Returns true if all reads succeed and false as soon
// as one fails.
bool readTestData(
  std::istream& in,
  std::vector<Pattern>& patterns,
  std::string& text,
  std::vector<SearchHit>& expected);
// Test fixture whose constructor calls run() on the given path.
// run() is declared here and defined elsewhere.
class DTest {
public:
  // Runs the test for path immediately, during construction.
  DTest(const char* const path) {
    run(path);
  }

  // Always true, so a check such as SCOPE_ASSERT(fixture) passes whenever
  // construction completed.
  // NOTE(review): failures are presumably reported from inside run() -- confirm.
  operator bool() const { return true; }

private:
  void run(const char* const path);
};
17 | */ 18 | 19 | #pragma once 20 | 21 | #ifdef HAVE_CONFIG_H 22 | #include "config.h" 23 | #endif /* HAVE_CONFIG_H */ 24 | 25 | #include <thread> 26 | #include <vector> 27 | 28 | #include <boost/asio.hpp> 29 | 30 | class Executor { 31 | public: 32 | Executor(size_t n = std::thread::hardware_concurrency()): 33 | service_(n), work_(new boost::asio::io_service::work(service_)) 34 | { 35 | for (size_t i = 0; i < n; ++i) { 36 | pool_.emplace_back([this](){ service_.run(); }); 37 | } 38 | } 39 | 40 | ~Executor() { 41 | delete work_; 42 | for (std::thread& t : pool_) { t.join(); } 43 | } 44 | 45 | template <typename F> 46 | void submit(F task) { 47 | service_.post(task); 48 | } 49 | 50 | protected: 51 | std::vector<std::thread> pool_; 52 | boost::asio::io_service service_; 53 | boost::asio::io_service::work* work_; 54 | }; 55 | -------------------------------------------------------------------------------- /test/mockcallback.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see <http://www.gnu.org/licenses/>. 
17 | */ 18 | 19 | #include "searchhit.h" 20 | #include "mockcallback.h" 21 | 22 | #include <vector> 23 | 24 | void mockCallback(void* userData, const LG_SearchHit* const hit) { 25 | std::vector<SearchHit>& hits(*static_cast<std::vector<SearchHit>*>(userData)); 26 | hits.push_back(*static_cast<const SearchHit* const>(hit)); 27 | } 28 | -------------------------------------------------------------------------------- /test/mockcallback.h: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see <http://www.gnu.org/licenses/>. 
// Hit callback used by the tests.
// NOTE(review): the definition casts userData to std::vector<SearchHit>* and
// appends a copy of *hit -- callers must pass such a vector.
void mockCallback(void* userData, const LG_SearchHit* const hit);
// Checks the ASCII encoder: it reports a maximum length of one byte, writes
// exactly the value itself for every input in [0, 0x80), and writes nothing
// (returns 0) for inputs outside that range.
SCOPE_TEST(testASCII) {
  ASCII enc;
  SCOPE_ASSERT_EQUAL(1u, enc.maxByteLength());

  byte buf[1];
  uint32_t len;

  // too low
  SCOPE_ASSERT_EQUAL(0u, enc.write(-1, buf));

  // just right: every value below 0x80 encodes as itself in one byte
  for (uint32_t i = 0; i < 0x80; ++i) {
    len = enc.write(i, buf);
    SCOPE_ASSERT_EQUAL(1u, len);
    SCOPE_ASSERT_EQUAL(i, buf[0]);
  }

  // too high
  SCOPE_ASSERT_EQUAL(0u, enc.write(0x80, buf));
}
// Verifies that the basic integer types have the sizes, in bytes, that the
// assertions below expect.
SCOPE_TEST(basicTypeSizes) {
  SCOPE_ASSERT_EQUAL(1u, sizeof(byte));
  SCOPE_ASSERT_EQUAL(4u, sizeof(uint32_t));
  SCOPE_ASSERT_EQUAL(8u, sizeof(uint64_t));
  SCOPE_ASSERT_EQUAL(8u, sizeof(int64_t));
}
// A ByteSource reports an empty name.
SCOPE_TEST(byteSourceName) {
  const byte buf[] = "x";
  // begin == end: the source is constructed over an empty range
  ByteSource bs(buf, buf);
  SCOPE_ASSERT_EQUAL("", bs.name());
}

// next() yields each byte in order, paired with a pointer to that byte, and
// then Decoder::END paired with the end pointer.
SCOPE_TEST(byteSourceNext) {
  const byte buf[] = "abcdefghijklmnopqrstuvwxyz\n\t";
  // sizeof(buf) counts the literal's terminating NUL, so that byte is part of
  // the input as well
  ByteSource bs(buf, buf + sizeof(buf));
  for (size_t i = 0; i < sizeof(buf); ++i) {
    SCOPE_ASSERT_EQUAL(std::make_pair((int32_t) buf[i], buf+i), bs.next());
  }
  // past the last byte
  SCOPE_ASSERT_EQUAL(std::make_pair(Decoder::END, buf+sizeof(buf)), bs.next());
}
// Helpers for building and comparing NFAs in tests. Only the declarations
// live in this header.

// NOTE(review): presumably adds an edge from source to target in fsm carrying
// trans -- confirm against the definition.
void edge(NFA::VertexDescriptor source, NFA::VertexDescriptor target, NFA& fsm, Transition* trans);

// NOTE(review): presumably true iff g has an edge from source to target --
// confirm against the definition.
bool edgeExists(const NFA& g, const NFA::VertexDescriptor source, const NFA::VertexDescriptor target);

// Assertion helpers comparing two NFAs.
// NOTE(review): the exact relation each one checks (and which argument is the
// expected one) is not visible here -- see the definitions.
void ASSERT_SUPERGRAPH(const NFA& a, const NFA& b);

void ASSERT_EQUAL_GRAPHS(const NFA& a, const NFA& b);

void ASSERT_EQUAL_LABELS(const NFA& a, const NFA& b);

void ASSERT_EQUAL_MATCHES(const NFA& a, const NFA& b);

// NOTE(review): presumably builds an NFA for pats, determinizing it when
// requested -- confirm against the definition.
NFAPtr createGraph(const std::vector<Pattern>& pats, bool determinize);
17 | */ 18 | 19 | #include "basic.h" 20 | #include "container_out.h" 21 | 22 | #include <scope/test.h> 23 | 24 | #include <set> 25 | 26 | #include <unicode/ucnv.h> 27 | 28 | SCOPE_TEST(testICUStandards) { 29 | // check that ICU is defining exactly the standards we expect 30 | 31 | std::set<std::string> expected{ 32 | "UTR22", "IANA", "MIME", "IBM", "WINDOWS", "JAVA", "" 33 | }; 34 | 35 | std::set<std::string> actual; 36 | UErrorCode err = U_ZERO_ERROR; 37 | const uint32_t slen = ucnv_countStandards(); 38 | for (uint32_t i = 0; i < slen; ++i) { 39 | actual.insert(ucnv_getStandard(i, &err)); 40 | SCOPE_ASSERT(!U_FAILURE(err)); 41 | } 42 | 43 | SCOPE_ASSERT_EQUAL(expected, actual); 44 | } 45 | -------------------------------------------------------------------------------- /test/test_icuutil.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see <http://www.gnu.org/licenses/>. 
// Converting an ICU USet into an empty UnicodeSet.
// NOTE(review): the expected range ends at 0x40 while the USet is opened with
// 0x3F, presumably because ICU's end is inclusive and UnicodeSet's is
// exclusive -- confirm.
SCOPE_TEST(testUnicodeSetICUToLGDstEmpty) {
  std::unique_ptr<USet, void(*)(USet*)> src(uset_open(0x27, 0x3F), uset_close);
  UnicodeSet exp{{0x27,0x40}}, act;

  convUnicodeSet(act, src.get());
  SCOPE_ASSERT_EQUAL(exp, act);
}

// Converting an ICU USet into a UnicodeSet that already has contents.
SCOPE_TEST(testUnicodeSetICUToLGDstNonEmpty) {
  // test that convUnicodeSet clears dst
  std::unique_ptr<USet, void(*)(USet*)> src(uset_open(0x27, 0x3F), uset_close);
  UnicodeSet exp{{0x27,0x40}}, act{0xBEEF};

  convUnicodeSet(act, src.get());
  SCOPE_ASSERT_EQUAL(exp, act);
}

// Converting a UnicodeSet into an empty ICU USet.
SCOPE_TEST(testUnicodeSetLGToICUDstEmpty) {
  UnicodeSet src{{0x27,0x40}};
  std::unique_ptr<USet, void(*)(USet*)> exp(uset_open(0x27, 0x3F), uset_close);
  std::unique_ptr<USet, void(*)(USet*)> act(uset_openEmpty(), uset_close);

  convUnicodeSet(act.get(), src);
  SCOPE_ASSERT(uset_equals(exp.get(), act.get()));
}

// Converting a UnicodeSet into an ICU USet that already has contents.
SCOPE_TEST(testUnicodeSetLGToICUDstNonEmpty) {
  // test that convUnicodeSet clears dst
  UnicodeSet src{{0x27,0x40}};
  std::unique_ptr<USet, void(*)(USet*)> exp(uset_open(0x27, 0x3F), uset_close);
  std::unique_ptr<USet, void(*)(USet*)> act(uset_open(0xBEEF, 0xBEEF), uset_close);

  convUnicodeSet(act.get(), src);
  SCOPE_ASSERT(uset_equals(exp.get(), act.get()));
}
// Copying an empty range through the join iterator produces no output.
SCOPE_TEST(joinEmpty) {
  // null begin == end: an empty range with no backing array
  const int *a = 0;
  std::ostringstream ss;
  std::copy(a, a, ostream_join_iterator<int>(ss, ", "));
  SCOPE_ASSERT_EQUAL("", ss.str());
}

// A single element is written with no separator.
SCOPE_TEST(joinSingleton) {
  const int a[] = { 1 };
  std::ostringstream ss;
  std::copy(a, a + 1, ostream_join_iterator<int>(ss, ", "));
  SCOPE_ASSERT_EQUAL("1", ss.str());
}

// The separator appears between elements only, not after the last one.
SCOPE_TEST(joinMultiple) {
  const int a[] = { 1, 2, 3 };
  std::ostringstream ss;
  std::copy(a, a + 3, ostream_join_iterator<int>(ss, ", "));
  SCOPE_ASSERT_EQUAL("1, 2, 3", ss.str());
}
// Builds the three-instruction program shared by the tests below: a byte
// instruction for 'a', a label instruction for 0, and a match instruction.
// FilterOff is set to 0 and Filter has bit (i << 8) | 'a' set for every i in
// [0, 256).
ProgramPtr makeProgram() {
  ProgramPtr p1(new Program(3));
  (*p1)[0] = Instruction::makeByte('a');
  (*p1)[1] = Instruction::makeLabel(0);
  (*p1)[2] = Instruction::makeMatch();

  p1->FilterOff = 0;
  // low byte fixed at 'a', high byte ranging over all 256 values
  for (uint32_t i = 0; i < 256; ++i) {
    p1->Filter.set((i << 8) | 'a');
  }

  return p1;
}

// size() reports the number of instructions.
SCOPE_TEST(testProgramSize) {
  ProgramPtr p1(makeProgram());
  SCOPE_ASSERT_EQUAL(3u, p1->size());
}

// bufSize() equals the Program object plus its instructions.
SCOPE_TEST(testProgramBufSize) {
  ProgramPtr p1(makeProgram());
  SCOPE_ASSERT_EQUAL(
    sizeof(Program) + p1->size()*sizeof(Instruction), p1->bufSize()
  );
}

// marshall() followed by unmarshall() yields a program equal to the original,
// and the marshalled buffer is bufSize() bytes long.
SCOPE_TEST(testProgramSerialization) {
  ProgramPtr p1(makeProgram());

  std::vector<char> buf = p1->marshall();
  SCOPE_ASSERT_EQUAL(p1->bufSize(), buf.size());

  ProgramPtr p2 = Program::unmarshall(buf.data(), buf.size());
  SCOPE_ASSERT(p2);
  SCOPE_ASSERT(*p1 == *p2);
}
// Data-driven search tests. Each fixture is a DTest constructed from a data
// file under TDATDIR; the body only asserts on the fixture itself.
// NOTE(review): the real checking presumably happens while the DTest is being
// constructed -- confirm in DTest::run.
SCOPE_FIXTURE_CTOR(hundredPatternSearch, DTest, DTest(TDATDIR "/hectotest.dat")) {
  SCOPE_ASSERT(fixture);
}

SCOPE_FIXTURE_CTOR(thousandPatternSearch, DTest, DTest(TDATDIR "/kilotest.dat")) {
  SCOPE_ASSERT(fixture);
}
// test/test_search_data_driver.h
// Runs a test read from in.
// NOTE(review): the meaning of the bool result is not visible here --
// presumably true on success; confirm against the definition.
bool longTest(std::istream& in);

// test/test_sequences.h
// After removeRightDuplicates, {7,9,7,9} becomes {7,9}: the first occurrence
// of each value is kept and the later repeats are dropped.
SCOPE_TEST(removeRightDuplicatesTest) {
  std::vector<uint32_t> v{7,9,7,9};
  removeRightDuplicates(v);
  std::vector<uint32_t> exp{7,9};
  SCOPE_ASSERT_EQUAL(exp, v);
}
9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see <http://www.gnu.org/licenses/>. 17 | */ 18 | 19 | #include <scope/test.h> 20 | 21 | #include "sparseset.h" 22 | 23 | SCOPE_TEST(basicSparseTest) { 24 | SparseSet s(5); 25 | SCOPE_ASSERT_EQUAL(0u, s.size()); 26 | for (uint32_t i = 0; i < 5; ++i) { 27 | SCOPE_ASSERT(!s.find(i)); 28 | } 29 | s.insert(3); 30 | SCOPE_ASSERT_EQUAL(1u, s.size()); 31 | SCOPE_ASSERT(s.find(3)); 32 | SCOPE_ASSERT(!s.find(0)); 33 | SCOPE_ASSERT(!s.find(1)); 34 | SCOPE_ASSERT(!s.find(2)); 35 | SCOPE_ASSERT(!s.find(4)); 36 | } 37 | 38 | SCOPE_TEST(sparseClear) { 39 | SparseSet s(5); 40 | SCOPE_ASSERT_EQUAL(0u, s.size()); 41 | s.insert(4); 42 | s.insert(2); 43 | SCOPE_ASSERT_EQUAL(2u, s.size()); 44 | s.clear(); 45 | SCOPE_ASSERT_EQUAL(0u, s.size()); 46 | for (uint32_t i = 0; i < 5; ++i) { 47 | SCOPE_ASSERT(!s.find(i)); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /test/test_starts_with.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see <http://www.gnu.org/licenses/>. 17 | */ 18 | 19 | #include <scope/test.h> 20 | 21 | #include "stest.h" 22 | 23 | SCOPE_FIXTURE_CTOR(startsWithTest, STest, STest({"ab..ef", "c[a-z][a-z]", "[aA][bc][bc]"})) { 24 | const char text[] = "abcdefghijklmnop"; 25 | fixture.startsWith(text, text + 16, 0); 26 | SCOPE_ASSERT_EQUAL(2u, fixture.Hits.size()); 27 | SCOPE_ASSERT_EQUAL(SearchHit(0, 3, 2), fixture.Hits[0]); 28 | SCOPE_ASSERT_EQUAL(SearchHit(0, 6, 0), fixture.Hits[1]); 29 | } 30 | 31 | SCOPE_FIXTURE_CTOR(startsWithShortTest, STest, STest({"a+"})) { 32 | const char text[] = "a"; 33 | fixture.startsWith(text, text + 1, 0); 34 | SCOPE_ASSERT_EQUAL(1u, fixture.Hits.size()); 35 | SCOPE_ASSERT_EQUAL(SearchHit(0, 1, 0), fixture.Hits[0]); 36 | } 37 | -------------------------------------------------------------------------------- /test/test_transitionfactory.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see <http://www.gnu.org/licenses/>. 
17 | */ 18 | 19 | #include <scope/test.h> 20 | 21 | #include "states.h" 22 | #include "transition.h" 23 | #include "transitionfactory.h" 24 | 25 | template <class StateType> 26 | void smallestTester(const ByteSet& ebs) { 27 | TransitionFactory tfac; 28 | StateType* state = dynamic_cast<StateType*>(tfac.getSmallest(ebs)); 29 | SCOPE_ASSERT(state); 30 | ByteSet abs; 31 | state->getBytes(abs); 32 | SCOPE_ASSERT_EQUAL(ebs, abs); 33 | } 34 | 35 | SCOPE_TEST(getSmallestNoneTest) { 36 | const ByteSet ebs; 37 | smallestTester<ByteSetState>(ebs); 38 | } 39 | 40 | SCOPE_TEST(getSmallestOneTest) { 41 | const ByteSet ebs('z'); 42 | smallestTester<ByteState>(ebs); 43 | } 44 | 45 | SCOPE_TEST(getSmallestTwoTest) { 46 | ByteSet ebs; 47 | ebs.set('a'); 48 | ebs.set('z'); 49 | smallestTester<EitherState>(ebs); 50 | } 51 | 52 | SCOPE_TEST(getSmallestRangeTest) { 53 | ByteSet ebs; 54 | ebs.set('a', 'z' + 1, true); 55 | smallestTester<RangeState>(ebs); 56 | } 57 | 58 | SCOPE_TEST(getSmallestManyTest) { 59 | ByteSet ebs; 60 | ebs.set('A'); 61 | ebs.set('a'); 62 | ebs.set('b'); 63 | smallestTester<ByteSetState>(ebs); 64 | } 65 | -------------------------------------------------------------------------------- /test/test_utf8decoder.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | liblightgrep: not the worst forensics regexp engine 3 | Copyright (C) 2013, Lightbox Technologies, Inc 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 
14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program. If not, see <http://www.gnu.org/licenses/>. 17 | */ 18 | 19 | #include <scope/test.h> 20 | 21 | #include <vector> 22 | 23 | #include "decoders/bytesource.h" 24 | #include "decoders/utf8decoder.h" 25 | 26 | SCOPE_TEST(utf8DecoderName) { 27 | const byte buf[] = "x"; 28 | UTF8Decoder d(std::unique_ptr<Decoder>(new ByteSource(buf, buf))); 29 | SCOPE_ASSERT_EQUAL("UTF-8", d.name()); 30 | } 31 | 32 | SCOPE_TEST(utf8DecoderNext) { 33 | const byte buf[] = { 34 | 'a', 'b', 'c', 0x80, 0x81, 0xF0, 0x9F, 0x92, 0xA9 35 | }; 36 | // a b c | invalid | PILE_OF_POO 37 | 38 | UTF8Decoder d(std::unique_ptr<Decoder>( 39 | new ByteSource(buf, buf + sizeof(buf)) 40 | )); 41 | 42 | const std::vector<std::pair<int32_t,const byte*>> exp{ 43 | {'a', buf}, {'b', buf+1}, {'c', buf+2}, 44 | {-0x81, buf+3}, {-0x82, buf+4}, 45 | {0x1F4A9, buf+5}, 46 | {Decoder::END, buf+9} 47 | }; 48 | 49 | for (const std::pair<int32_t,const byte*>& cp : exp) { 50 | SCOPE_ASSERT_EQUAL(cp, d.next()); 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /tools/macify.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Package a MacOS X build so that it works on other Macs. 4 | # 5 | # By default, Mach executables include absolute paths to shared libraries they depend on. 6 | # This is obviously stupid. The solution is to use "@loader_path" and rejigger the 7 | # executable _and_ the libraries to state their dependencies relative to @loader_path. 8 | # @loader_path works both with an executable and with a requesting library (as opposed 9 | # to @executable_path). Modern versions of MacOS X also support @rpath, but I couldn't 10 | # get this to work, sadly. 11 | # 12 | # Obviously, this script will only work on my Mac. But it's at least a repeatable process. 
BUILDDIR=./mac_build

LIBCPP=libstdc++.6.dylib
LIBGCC=libgcc_s.1.dylib
LIBLG=liblightgrep.dylib

LIBCPPPATH=/Users/jon/my_gcc/lib/$LIBCPP
LIBGCCPATH=/Users/jon/my_gcc/lib/$LIBGCC
LIBLGPATH=bin/src/lib/$LIBLG

# Start from a clean output directory. The original test was a bare
# non-empty-string check (always true); test for existence explicitly.
if [ -e "$BUILDDIR" ]
then
  rm -Rf "$BUILDDIR"
fi

mkdir "$BUILDDIR" || exit 1
cp lightgrep.exe "$BUILDDIR/lightgrep"
cp "$LIBCPPPATH" "$BUILDDIR/"
cp "$LIBGCCPATH" "$BUILDDIR/"
cp "$LIBLGPATH" "$BUILDDIR/"

# Everything below edits files by relative name, so stop if we did not
# actually end up inside the build directory.
cd "$BUILDDIR" || exit 1

# libgcc: set its own install name.
install_name_tool -id "@loader_path/$LIBGCC" "$LIBGCC"

# libstdc++: set its install name and repoint its libgcc dependency.
install_name_tool -id "@loader_path/$LIBCPP" "$LIBCPP"
install_name_tool -change "$LIBGCCPATH" "@loader_path/$LIBGCC" "$LIBCPP"

# liblightgrep: set its install name and repoint both runtime dependencies.
install_name_tool -id "@loader_path/$LIBLG" "$LIBLG"
install_name_tool -change "$LIBGCCPATH" "@loader_path/$LIBGCC" "$LIBLG"
install_name_tool -change "$LIBCPPPATH" "@loader_path/$LIBCPP" "$LIBLG"

# The executable: repoint all three library dependencies.
install_name_tool -change "$LIBCPPPATH" "@loader_path/$LIBCPP" lightgrep
install_name_tool -change "$LIBGCCPATH" "@loader_path/$LIBGCC" lightgrep
install_name_tool -change "$LIBLGPATH" "@loader_path/$LIBLG" lightgrep
--------------------------------------------------------------------------------
/tools/thread_dump.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl -w

# Pretty-print a thread trace.
#
# Input is read from the files named on the command line, or from standard
# input when none are given. Every line beginning with '{' is decoded as one
# JSON "frame"; all other lines are ignored. A frame is expected to carry the
# keys 'offset', 'byte' and 'list', where 'list' holds one record per thread
# with the keys 'state', 'Id', 'PC', 'Label', 'Start' and 'End'.
# NOTE(review): presumably this is the trace emitted by a lightgrep debug
# build -- confirm against the producer.

use strict;
use warnings;

use JSON;

# Bit flags tested against a thread record's 'state' field.
use constant {
  BORN    => 1,
  PRERUN  => 2,
  POSTRUN => 4,
  DIED    => 8
};

# Column headings for the per-thread rows.
printf(
  "\n %-16s %-8s %-8s %-16s %-16s\n",
  'id',
  'pc',
  'label',
  'start',
  'end'
);

while (my $line = <>) {
  next unless $line =~ /^\{/;   # ignore lines which are not JSON
  print_frame($line);
}

print "\n";

# readabilify($byte)
#
# Render a byte value for display: the quoted character when it is
# printable, otherwise a two-digit hexadecimal literal such as 0x0a.
sub readabilify {
  my ($byte) = @_;
  my $c = chr($byte);
  return ($c =~ /[[:print:]]/) ? "'$c'" : sprintf('0x%02x', $byte);
}

# print_frame($json_text)
#
# Decode one JSON frame and print it: a header line with the frame's offset
# and byte, followed by one row per thread in the frame's 'list'. Rows for
# threads flagged BORN are printed in green and rows for threads flagged
# DIED in red; each row is prefixed with '-' when PRERUN is set and with '+'
# otherwise. decode_json dies on malformed input, and that is not caught.
sub print_frame {
  my ($json_text) = @_;
  my $frame = decode_json($json_text);

  printf(
    "\n%016x <- %s\n\n",
    $frame->{'offset'},
    readabilify($frame->{'byte'})
  );

  foreach my $thread (@{ $frame->{'list'} }) {
    my $state = $thread->{'state'};

    if ($state & BORN) {
      # thread birth is green
      print "\33[1;32m";
    }
    elsif ($state & DIED) {
      # thread death is red
      print "\33[1;31m";
    }

    # '-' marks a pre-run record, '+' a post-run record
    print(($state & PRERUN) ? '-' : '+');

    printf(
      " %016x %08x %08x [%016x,%016x]\n",
      $thread->{'Id'},
      $thread->{'PC'} & 0xFFFFFFFF,   # show only the low 32 bits of the PC
      $thread->{'Label'},
      $thread->{'Start'},
      $thread->{'End'}
    );

    if ($state & (BORN | DIED)) {
      # switch back to regular text color
      print "\33[0m";
    }
  }

  return;
}
--------------------------------------------------------------------------------