├── COPYING ├── etc ├── .gitignore ├── coverage_report.sh ├── CONFIGURE_WINDOWS_LATEST.bash └── install_autotools.sh ├── tests ├── .gitignore ├── README.md ├── random.dat ├── unilang.htm ├── run_each.sh ├── regex_demo.cpp └── unilang8.htm ├── demos ├── .gitignore ├── README.md ├── thread_demo.cpp └── regex_demo.cpp ├── AUTHORS ├── NEWS ├── make_debug ├── INSTALL ├── thread-pool ├── 2105.00613.pdf └── LICENSE.md ├── abstract_image_reader.cpp ├── scan_sha1_test.h ├── pos0.cpp ├── .gitmodules ├── codecov.yml ├── bootstrap.sh ├── doc └── unit-tests.txt ├── .make-codecov ├── abstract_image_reader.h ├── test_image_reader.h ├── test_be20_api_malloc_debug ├── scanner_config.cpp ├── feature_recorder_mhist.h.broken ├── test_image_reader.cpp ├── .gitignore ├── formatter.h ├── .clang-format ├── ChangeLog ├── char_class.h ├── README_WIN.md ├── m4 ├── slg_noopt.m4 ├── slg_address_sanitizer.m4 └── slg_gcc_all_warnings.m4 ├── feature_recorder_sql.h ├── CODING_STANDARDS.txt ├── Makefile.am ├── .github └── workflows │ ├── build-windows.yml │ └── build-ubuntu-macos.yml ├── test_be20_threadpool.cpp ├── sbuf_stream.h ├── configure.ac ├── scanner_params.cpp ├── be20_configure.m4 ├── path_printer.h ├── scan_sha1_test.cpp ├── atomic_set.h ├── histogram_def.cpp ├── Makefile.defs ├── pcap_fake.h ├── regex_vector.h ├── LICENSE.md ├── net_ethernet.h ├── feature_recorder_file.h ├── TODO.md ├── regex_vector.cpp ├── machine_stats.h ├── utils.cpp ├── threadpool.h ├── atomic_unicode_histogram.h ├── unicode_escape.h ├── word_and_context_list.cpp ├── README.md ├── word_and_context_list.h ├── feature_recorder_mhist.cpp.broken ├── sbuf_stream.cpp ├── aftimer.h ├── histogram_def.h ├── scanner_config.h ├── atomic_unicode_histogram.cpp ├── pos0.h ├── feature_recorder_sql.cpp ├── pcap_fake.cpp ├── atomic_map.h ├── utils.h ├── threadpool.cpp └── feature_recorder_set.h /COPYING: -------------------------------------------------------------------------------- 1 | Go for it. 
2 | -------------------------------------------------------------------------------- /etc/.gitignore: -------------------------------------------------------------------------------- 1 | *.secret 2 | -------------------------------------------------------------------------------- /tests/.gitignore: -------------------------------------------------------------------------------- 1 | regex_demo 2 | -------------------------------------------------------------------------------- /demos/.gitignore: -------------------------------------------------------------------------------- 1 | a.out 2 | a.out.dSYM 3 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | Simson L. Garfinkel 2 | -------------------------------------------------------------------------------- /NEWS: -------------------------------------------------------------------------------- 1 | Sept 17, 2025 - Removed support for pcre and std::regex_match 2 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | unilang from http://www.humancomp.org/unichtm/unichtm.htm 2 | -------------------------------------------------------------------------------- /tests/random.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simsong/be20_api/HEAD/tests/random.dat -------------------------------------------------------------------------------- /tests/unilang.htm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simsong/be20_api/HEAD/tests/unilang.htm -------------------------------------------------------------------------------- /make_debug: -------------------------------------------------------------------------------- 1 | make clean 2 | 
./configure CFLAGS="-g -O0" CXXFLAGS="-g -O0" 3 | make test_be20_api 4 | -------------------------------------------------------------------------------- /INSTALL: -------------------------------------------------------------------------------- 1 | Typically you don't install this. It's meant to be a submodule for bulk_extractor and tcptrans 2 | -------------------------------------------------------------------------------- /thread-pool/2105.00613.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simsong/be20_api/HEAD/thread-pool/2105.00613.pdf -------------------------------------------------------------------------------- /demos/README.md: -------------------------------------------------------------------------------- 1 | This directory contains small test and demo programs used by the author to learn the ins and outs of C++17 2 | -------------------------------------------------------------------------------- /abstract_image_reader.cpp: -------------------------------------------------------------------------------- 1 | #include "abstract_image_reader.h" 2 | 3 | abstract_image_reader::~abstract_image_reader() 4 | { 5 | } 6 | -------------------------------------------------------------------------------- /scan_sha1_test.h: -------------------------------------------------------------------------------- 1 | #ifndef SCAN_SHA1_H 2 | #define SCAN_SHA1_H 3 | 4 | #include "scanner_params.h" 5 | 6 | scanner_t scan_sha1_test; 7 | #endif 8 | -------------------------------------------------------------------------------- /pos0.cpp: -------------------------------------------------------------------------------- 1 | #include "pos0.h" 2 | 3 | /** 4 | * Definition of the static pos0_t::map_file_delimiter string, initialized from pos0_t::U10001C 5 | */ 6 | std::string pos0_t::map_file_delimiter(pos0_t::U10001C); 7 | -------------------------------------------------------------------------------- /.gitmodules: 
-------------------------------------------------------------------------------- 1 | [submodule "dfxml_cpp"] 2 | path = dfxml_cpp 3 | url = https://github.com/dfxml-working-group/dfxml_cpp.git 4 | [submodule "utfcpp"] 5 | path = utfcpp 6 | url = https://github.com/nemtrif/utfcpp.git 7 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | range: 40..60 3 | round: nearest 4 | precision: 2 5 | 6 | ignore: 7 | - "catch.hpp" 8 | - "utf8.h" 9 | - "utf8/*" 10 | - "tests/regex_demo.cpp" 11 | - "test_be20_api.cpp" 12 | -------------------------------------------------------------------------------- /tests/run_each.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # run each test once 3 | TEST=test_be20_api 4 | PATH=$PATH:.:.. 5 | tests=$($TEST -l | egrep -v 'All available|test cases|\[') 6 | for test in $tests ; do 7 | echo ========== $test =========== 8 | echo '$' test_be $test 9 | $TEST $test 10 | echo 11 | echo 12 | done 13 | -------------------------------------------------------------------------------- /bootstrap.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | mkdir -p build-aux 4 | 5 | # have automake do an initial population if necessary 6 | autoheader -f 7 | touch NEWS README AUTHORS ChangeLog 8 | touch stamp-h 9 | aclocal -I m4 10 | autoconf -f 11 | automake --add-missing --copy 12 | # bootstrap is complete 13 | echo 14 | echo The bootstrap.sh is complete. Be sure to run ./configure. 15 | echo 16 | -------------------------------------------------------------------------------- /doc/unit-tests.txt: -------------------------------------------------------------------------------- 1 | the following unit test frameworks were considered in order: 2 | 3 | 1. https://github.com/exoticlibraries/libcester 4 | 2. 
https://github.com/catchorg/Catch2/blob/master/docs/assertions.md#top 5 | 3. https://github.com/cpputest/cpputest 6 | 4. https://github.com/unittest-cpp/unittest-cpp 7 | 8 | Currently we are using libcester due to excellent support from the author! 9 | -------------------------------------------------------------------------------- /.make-codecov: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # 2020-10-29 - slg - compile for codecov, run self-test, and upload results. 4 | # 5 | bash bootstrap.sh 6 | ./configure CFLAGS='-g -O0 -fprofile-arcs -ftest-coverage' \ 7 | CXXFLAGS='-g -O0 -fprofile-arcs -ftest-coverage' \ 8 | LIBS='-lgcov' 9 | make clean \ 10 | && make test_be20_api \ 11 | && ./test_be20_api \ 12 | && gcov-9 -n -o . *cpp \ 13 | && bash <(curl -s https://codecov.io/bash) 14 | make distclean 15 | -------------------------------------------------------------------------------- /abstract_image_reader.h: -------------------------------------------------------------------------------- 1 | #ifndef ABSTRACT_IMAGE_READER 2 | #define ABSTRACT_IMAGE_READER 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | class abstract_image_reader { 9 | public: 10 | abstract_image_reader() {}; 11 | virtual ssize_t pread(void *buf, size_t bufsize, uint64_t offset) const = 0; 12 | virtual int64_t image_size() const=0; 13 | virtual std::filesystem::path image_fname() const = 0; 14 | virtual ~abstract_image_reader(); 15 | }; 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /test_image_reader.h: -------------------------------------------------------------------------------- 1 | #ifndef TEST_IMAGE_READER 2 | #define TEST_IMAGE_READER 3 | 4 | 5 | #include "abstract_image_reader.h" 6 | 7 | class test_image_reader : public abstract_image_reader { 8 | public: 9 | test_image_reader(); 10 | virtual ~test_image_reader(); 11 | virtual ssize_t pread(void *buf, size_t bufsize, 
uint64_t offset) const; 12 | virtual int64_t image_size() const; 13 | virtual std::filesystem::path image_fname() const { return std::filesystem::path("/"); } 14 | }; 15 | 16 | #endif 17 | -------------------------------------------------------------------------------- /test_be20_api_malloc_debug: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # This is a shell script that runs the test with malloc debug enabled on MacOS. 3 | # https://developer.apple.com/library/archive/documentation/Performance/Conceptual/ManagingMemory/Articles/MallocDebug.html 4 | export MallocStackLogging=1 5 | export MallocStackLoggingNoCompact=1 6 | export MallocScribble=1 7 | export MallocPreScribble=1 8 | export MallocGuardEdges=1 9 | ./test_be20_api $* || exit 1 10 | 11 | export MallocCheckHeapStart=1000 12 | export MallocCheckHeapEach=100 13 | ./test_be20_api $* || exit 1 14 | -------------------------------------------------------------------------------- /scanner_config.cpp: -------------------------------------------------------------------------------- 1 | #include "config.h" 2 | 3 | #include 4 | #include 5 | 6 | #include "scanner_config.h" 7 | 8 | /************************************ 9 | *** HELP and option processing *** 10 | ************************************/ 11 | 12 | //void scanner_config::set_config(const std::string& name, const std::string& val) { namevals[name] = val; } 13 | //void scanner_config::push_scanner_command(const std::string& scannerName, scanner_command::command_t c) { 14 | // scanner_commands.push_back(scanner_command(scannerName, c)); 15 | //} 16 | -------------------------------------------------------------------------------- /etc/coverage_report.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Create a code-coverage report locally and upload one to codecov 4 | # Should be run from the root directory 5 | 6 | if [ -r coverage_report.sh ]; then 
7 | echo "coverage_report.sh run in /etc directory. moving to .." 8 | cd .. 9 | fi 10 | 11 | #make distclean 12 | #CFLAGS="--coverage" CXXFLAGS="--coverage" LDFLAGS="--coverage" ./configure 13 | make check 14 | lcov --capture --directory . --output-file main_coverage.info 15 | genhtml main_coverage.info --output-directory out 16 | 17 | # Upload the coverage report 18 | bash <(curl -s https://codecov.io/bash) 19 | 20 | /bin/rm -f *.gcov *.gcda *.gcno 21 | 22 | -------------------------------------------------------------------------------- /feature_recorder_mhist.h.broken: -------------------------------------------------------------------------------- 1 | /** 2 | * histogram support. 3 | * We can ask the feature recorder to generate a histogram. 4 | * The file feature recorder re-reads the file. 5 | * The in-memory histogram feature recorder just records the features and outputs 6 | * them as a histogram when it shuts down (and whenever it runs out of memory!) 7 | * The SQL recorder uses an SQL query to make the histogram. So it never runs out of memory, 8 | * but it may run slow. 
9 | */ 10 | 11 | /* in-memory histograms */ 12 | typedef atomic_histogram mhistogram_t; // memory histogram 13 | typedef std::map mhistograms_t; 14 | -------------------------------------------------------------------------------- /test_image_reader.cpp: -------------------------------------------------------------------------------- 1 | #include "test_image_reader.h" 2 | 3 | test_image_reader::test_image_reader() 4 | { 5 | } 6 | 7 | test_image_reader::~test_image_reader() 8 | { 9 | } 10 | 11 | /* 12 | * Virtual data is 0..255 in positions 0..255 13 | */ 14 | 15 | ssize_t test_image_reader::pread(void *buf, size_t bufsize, uint64_t offset) const 16 | { 17 | if ( offset>=256) return 0; 18 | if ( offset+bufsize > 256) bufsize = 256-offset; 19 | for ( size_t i=0;i(buf))[i] = i+offset; 21 | } 22 | return bufsize; 23 | } 24 | 25 | int64_t test_image_reader::image_size() const 26 | { 27 | return 256; 28 | } 29 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.d 2 | *.exe 3 | *.gcda 4 | *.gcno 5 | *.gcov 6 | *.info 7 | *.log 8 | *.o 9 | *.so 10 | *.sql3 11 | *.swo 12 | *.swp 13 | *.tar.gz 14 | *.tmp 15 | *.trs 16 | *.zip 17 | *~ 18 | .DS_Store 19 | .deps 20 | .dirstamp 21 | Makefile 22 | Makefile.in 23 | TAGS 24 | _deps 25 | a.out 26 | aclocal.m4 27 | ar-lib 28 | autom4te.cache 29 | be20_api-*-*/ 30 | build-aux/ 31 | compile 32 | config.guess 33 | config.h 34 | config.h.in 35 | config.log 36 | config.status 37 | config.sub 38 | configure 39 | depcomp 40 | install-sh 41 | missing 42 | out/ 43 | stamp-h1 44 | stand 45 | test-driver 46 | test-program.cpp 47 | test_be20_api 48 | test_be20api 49 | test_be20api_catch2 50 | tests/Makefile 51 | x.cpp 52 | test_be20_api 53 | README 54 | stamp-h 55 | *.old 56 | -------------------------------------------------------------------------------- /demos/thread_demo.cpp: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Example of C++11 threads and atomic variables 3 | */ 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | std::atomic v(0); 11 | int x(0); 12 | 13 | void adder() 14 | { 15 | for(int i=0;i<1000000;i++){ 16 | v += 1; 17 | x += 1; 18 | } 19 | } 20 | 21 | int main(int argc, char **argv) 22 | { 23 | std::thread *t[10]; 24 | for(int i=0;i<10;i++){ 25 | std::cout << "i=" << i << std::endl; 26 | t[i] = new std::thread(adder); 27 | } 28 | for(int i=0;i<10;i++){ 29 | t[i]->join(); 30 | } 31 | std::cout << "v=" << v << std::endl; 32 | std::cout << "x=" << x << std::endl; 33 | return(0); 34 | } 35 | 36 | -------------------------------------------------------------------------------- /tests/regex_demo.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Small test program to show how to use C++17 regular expressions. 3 | */ 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | int main(int argc,char **argv) 10 | { 11 | std::string s("abc123def"); 12 | std::regex r("([0-9]+)"); 13 | std::smatch m; 14 | if (std::regex_search(s, m, r)){ 15 | std::cout << "Matches '" << m.str() << "'\n"; 16 | } 17 | 18 | /* Try 32-bit vectors */ 19 | std::u32string s32(U"Hello"); 20 | std::cout << "len(s32)=" << s32.size() << "\n"; 21 | std::basic_regex r8("([0-9]+)"); 22 | std::basic_regex r16(L"([0-9]+)"); 23 | 24 | // this doesn't work: 25 | //std::basic_regex r32(U"([0-9]+)"); 26 | return(0); 27 | } 28 | -------------------------------------------------------------------------------- /etc/CONFIGURE_WINDOWS_LATEST.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # etc/CONFIGURE_WINDOWS_MSYS2.bash 3 | # Configure MSYS2/MinGW environment for be20_api build 4 | # See: https://www.msys2.org/ 5 | 6 | OS_NAME=msys 7 | MAKE_CONCURRENCY=-j2 8 | MPKGS="" 9 | 10 | SCRIPT_DIR="$( cd "$( dirname 
"${BASH_SOURCE[0]}" )" && pwd )" 11 | cd "$SCRIPT_DIR" 12 | 13 | . ./paths.bash 2>/dev/null || true 14 | 15 | echo "******************************************************************" 16 | echo "Configuring Windows/MSYS2 environment to compile be20_api" 17 | echo "******************************************************************" 18 | 19 | # Ensure MSYS2 is updated 20 | 21 | # Install required packages 22 | if [ $? != 0 ]; then 23 | echo "Could not install some of the packages. Will not proceed." 24 | exit 1 25 | fi 26 | -------------------------------------------------------------------------------- /formatter.h: -------------------------------------------------------------------------------- 1 | // https://stackoverflow.com/questions/12261915/how-to-throw-stdexceptions-with-variable-messages 2 | 3 | #ifndef FORMATTER_H 4 | #define FORMATTER_H 5 | 6 | #include 7 | #include 8 | 9 | class Formatter { 10 | public: 11 | Formatter() {}; 12 | ~Formatter() {}; 13 | 14 | template Formatter& operator<<(const Type& value) { 15 | stream_ << value; 16 | return *this; 17 | } 18 | 19 | std::string str() const { return stream_.str(); } 20 | operator std::string() const { return stream_.str(); } 21 | 22 | enum ConvertToString { to_str }; 23 | std::string operator>>(ConvertToString) { return stream_.str(); } 24 | 25 | private: 26 | std::stringstream stream_{}; 27 | Formatter(const Formatter&); 28 | Formatter& operator=(Formatter&); 29 | }; 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | # We'll use defaults from the LLVM style, but with 4 columns indentation. 3 | BasedOnStyle: LLVM 4 | IndentWidth: 4 5 | UseTab: Never 6 | --- 7 | Language: Cpp 8 | # Force pointers to the type for C++. 
9 | DerivePointerAlignment: false 10 | PointerAlignment: Left 11 | ColumnLimit: 120 12 | AccessModifierOffset: -4 13 | AllowShortBlocksOnASingleLine: true 14 | AllowShortIfStatementsOnASingleLine: true 15 | AllowShortCaseLabelsOnASingleLine: true 16 | AllowShortFunctionsOnASingleLine: true 17 | AllowShortLambdasOnASingleLine: true 18 | AllowShortLoopsOnASingleLine: true 19 | SpaceBeforeCtorInitializerColon: true 20 | Standard: Cpp17 21 | MaxEmptyLinesToKeep: 2 22 | --- 23 | Language: JavaScript 24 | # Use 100 columns for JS. 25 | ColumnLimit: 100 26 | --- 27 | Language: Proto 28 | # Don't format .proto files. 29 | DisableFormat: true 30 | --- 31 | Language: CSharp 32 | # Use 100 columns for C#. 33 | ColumnLimit: 100 34 | -------------------------------------------------------------------------------- /ChangeLog: -------------------------------------------------------------------------------- 1 | 2020-07-10 Simson Garfinkel 2 | 3 | * removed main_thread detection. 4 | 5 | 2020-06-13 Simson Garfinkel 6 | * updated license to MIT License, copyright Simson L. Garfinkel, consistent with the fact that this is no longer an official US Government work product. 7 | 8 | 2019-11-11 Simson Garfinkel 9 | 10 | * Tagged v1.6.0 11 | 12 | 2019-11-10 user 13 | 14 | * bulk_extractor_i.h (safe_utf16to8): fixed catching of polymorphic type value value. Exceptions need to be caught by reference. 15 | (safe_utf8to16): fixed catching of polymorphic type value value. Exceptions need to be caught by reference. 16 | 17 | 2019-11-10 Simson Garfinkel 18 | 19 | * sbuf.h (class sbuf_t): Really want to make *buf private, but it's still used too many places. 
20 | 21 | 2021-06-01 Simson Garfinkel 22 | * Complete rewrite for 2.0 23 | -------------------------------------------------------------------------------- /char_class.h: -------------------------------------------------------------------------------- 1 | /** 2 | * \class CharClass 3 | * Examine a block of text and count the number of characters 4 | * in various ranges. This is useful for determining if a block of 5 | * bytes is coded in BASE16, BASE64, etc. 6 | */ 7 | 8 | #ifndef CHAR_CLASS_H 9 | #define CHAR_CLASS_H 10 | struct CharClass { 11 | uint32_t range_0_9{0}; // a range_0_9 character 12 | uint32_t range_A_Fi{0}; // a-f or A-F 13 | uint32_t range_g_z{0}; // g-z 14 | uint32_t range_G_Z{0}; // G-Z 15 | CharClass() {} 16 | void add(const uint8_t ch) { 17 | if (ch >= 'a' && ch <= 'f') range_A_Fi++; 18 | if (ch >= 'A' && ch <= 'F') range_A_Fi++; 19 | if (ch >= 'g' && ch <= 'z') range_g_z++; 20 | if (ch >= 'G' && ch <= 'Z') range_G_Z++; 21 | if (ch >= '0' && ch <= '9') range_0_9++; 22 | } 23 | void add(const uint8_t* buf, size_t len) { 24 | for (size_t i = 0; i < len; i++) { add(buf[i]); } 25 | } 26 | }; 27 | 28 | #endif 29 | -------------------------------------------------------------------------------- /README_WIN.md: -------------------------------------------------------------------------------- 1 | ## The joy of building with Make on Windows 2 | 3 | * There are sub-modules here which require specific hydration 4 | * `git clone --recurse-submodules https://github.com/simsong/be20_api` 5 | * If you've already cloned - `git submodule update --init --recursive` 6 | * Install MSYS2 - https://www.msys2.org/ 7 | * Create toolchain using `pacman` 8 | 9 | ```sh 10 | pacman -S \ 11 | base-devel \ 12 | mingw-w64-ucrt-x86_64-gcc \ 13 | mingw-w64-ucrt-x86_64-make \ 14 | mingw-w64-ucrt-x86_64-re2 \ 15 | mingw-w64-ucrt-x86_64-abseil-cpp \ 16 | mingw-w64-ucrt-x86_64-sqlite3 \ 17 | mingw-w64-ucrt-x86_64-openssl \ 18 | mingw-w64-ucrt-x86_64-expat 19 | ``` 20 | * Generate 
the `config.h` 21 | ``` 22 | ./bootstrap.sh 23 | ``` 24 | * Configure 25 | ``` 26 | ./configure 27 | ``` 28 | * Then make the executable 29 | ``` 30 | make 31 | ``` 32 | * Time for tests 33 | ```shell 34 | ./test_be20_api.exe 35 | 36 | make check || (for fn in test*.log ; do echo ""; echo $fn ; cat $fn ; done; exit 1) 37 | ``` 38 | Done! 39 | -------------------------------------------------------------------------------- /thread-pool/LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Barak Shoshany 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /m4/slg_noopt.m4: -------------------------------------------------------------------------------- 1 | ################################################################ 2 | ############## drop optimization flags and add -g if requested ################ 3 | # Should we disable optimization? 4 | AC_ARG_ENABLE([opt], 5 | [AS_HELP_STRING([--disable-opt],[Drop all -O C flags])], 6 | [with_opt=no], 7 | [with_opt=yes]) 8 | 9 | # Or maybe just tone it down a bit? 10 | AC_ARG_ENABLE([o3], 11 | [AS_HELP_STRING([--disable-o3],[Do not force O3 optimization; use default level])], 12 | [with_o3=no], 13 | [with_o3=yes]) 14 | 15 | if test "${with_opt}" = "no" ; then 16 | CFLAGS=`echo -g "$CFLAGS" | sed s/-O[[0-9]]//` # note the double quoting! 17 | CXXFLAGS=`echo -g "$CXXFLAGS" | sed s/-O[[0-9]]//` 18 | else 19 | # If we are not stripping the optimizer, 20 | # increase optimizer from -O2 to -O3 if not explicitly forbidden 21 | if test "${with_o3}" != "no" ; then 22 | AC_MSG_NOTICE([adding -O3 to CFLAGS and CXXFLAGS]) 23 | CFLAGS=`echo -g "$CFLAGS" | sed 's/-O[123]//'` # note the double quoting! 
24 | CFLAGS="$CFLAGS -O3" 25 | 26 | CXXFLAGS=`echo -g "$CXXFLAGS" | sed 's/-O[123]//'` 27 | CXXFLAGS="$CXXFLAGS -O3" 28 | fi 29 | fi 30 | -------------------------------------------------------------------------------- /feature_recorder_sql.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "feature_recorder.h" 11 | #include "pos0.h" 12 | 13 | #ifdef HAVE_SQLITE3_H 14 | #include 15 | 16 | class feature_recorder_sql : public feature_recorder { 17 | struct besql_stmt { 18 | besql_stmt(const besql_stmt&) = delete; 19 | besql_stmt& operator=(const besql_stmt&) = delete; 20 | std::mutex Mstmt{}; 21 | sqlite3_stmt* stmt{}; // the prepared statement 22 | besql_stmt(sqlite3* db3, const char* sql); 23 | virtual ~besql_stmt(); 24 | void insert_feature(const pos0_t& pos, // insert it into this table! 25 | const std::string& feature, const std::string& feature8, const std::string& context); 26 | }; 27 | #if defined(HAVE_SQLITE3_H) and defined(HAVE_LIBSQLITE3) 28 | // virtual void dump_histogram_sqlite3(const histogram_def &def,void *user,feature_recorder::dump_callback_t cb) 29 | // const; 30 | #endif 31 | public: 32 | feature_recorder_sql(class feature_recorder_set& fs, feature_recorder_def def); 33 | virtual ~feature_recorder_sql(); 34 | //virtual void histogram_write(AtomicUnicodeHistogram& h) override; // flush a specific histogram 35 | }; 36 | #endif 37 | -------------------------------------------------------------------------------- /CODING_STANDARDS.txt: -------------------------------------------------------------------------------- 1 | Coding Standards v1.0 2 | Simson L. Garfinkel 3 | December 3, 2013 4 | 5 | All standards are based on compromise. These standards seem to be a 6 | good compromise between a variety of coding styles and existing 7 | standards. 8 | 9 | Executive summary: 10 | 11 | * No tabs in source code. 
12 | 13 | Legacy code has tabs at 8 characters; they can be freely converted 14 | to spaces as necessary. 15 | 16 | * Indent at 4 spaces. 17 | 18 | * Open braces start on the SAME LINE for: 19 | - if statements 20 | - inline functions in .h headers 21 | - Java function declarations 22 | 23 | * Open braces start on NEXT LINE for: 24 | - C function declarations 25 | 26 | * We use the following lines/configuration variables to try to enforce 27 | the above: 28 | 29 | For EMACS at the top of c programs: 30 | /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ 31 | 32 | In .emacs files: 33 | (setq-default indent-tabs-mode nil) 34 | (setq c-basic-offset 4) 35 | 36 | 37 | * In general, do not use pointers in structures if nullptr is undefined. Always use references in these cases. 38 | 39 | References: 40 | =========== 41 | 42 | * http://www.emacswiki.org/emacs/NoTabs 43 | 44 | * http://www.jwz.org/doc/tabs-vs-spaces.html 45 | 46 | * http://slashdot.org/pollBooth.pl?qid=395&aid=-1 47 | 48 | * http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml 49 | 50 | * http://www.python.org/dev/peps/pep-0008/#maximum-line-length 51 | -------------------------------------------------------------------------------- /tests/unilang8.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | "Language Learning and Teaching" (more or less) in 16 or more languages 5 | 6 | 7 |

8 | "Language Learning and Teaching" (more or less) in 16 or more languages 9 |

10 |
11 | 外国語の学習と教授 12 |

13 | Language Learning and Teaching 14 |

15 | Изучение и обучение иностранных языков 16 |

17 | Tere Daaheng Aneng Karimah 18 |

19 | 語文教學・语文教学 20 |

21 | Enseñanza y estudio de idiomas 22 |

23 | Изучаване и Преподаване на Чужди Езици 24 |

25 | ქართული ენის შესწავლა და სწავლება 26 |

27 | 'læŋɡwidʒ 'lɘr:niŋ ænd 'ti:tʃiŋ 28 |

29 | Lus kawm thaib qhia 30 |

31 | Ngôn Ngữ, Sự học, 32 |

33 | ‭‫ללמוד וללמד את השֵפה 34 |

35 | L'enseignement et l'étude des langues 36 |

37 | 말배우기와 가르치기 38 |

39 | Nauka języków obcych 40 |

41 | Γλωσσική Εκμὰθηση και Διδασκαλία 42 |

43 | ‭‫ﺗﺪﺭﯾﺲ ﻭ ﯾﺎﺩﮔﯿﺮﯼ ﺯﺑﺎﻥ 44 |

45 | Sprachlernen und -lehren 46 |

47 | ‭‫ﺗﻌﻠﻢ ﻭﺗﺪﺭﻳﺲ ﺍﻟﻌﺮﺑﻴﺔ 48 |

49 | เรียนและสอนภาษา 50 |

51 |


Home | Site Map 52 | | Services | New 53 | | WinCALIS | UniEdit 54 |
55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /etc/install_autotools.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Originally from https://gist.github.com/GraemeConradie/49d2f5962fa72952bc6c64ac093db2d5 4 | # Install gnu autotools for running under github actions 5 | 6 | ## 7 | # Install autoconf, automake and libtool smoothly on Mac OS X. 8 | # Newer versions of these libraries are available and may work better on OS X 9 | ## 10 | 11 | export build=~/devtools # or wherever you'd like to build 12 | export PATH=$PATH:/usr/local/bin 13 | mkdir -p $build 14 | 15 | ## 16 | # Autoconf 17 | # https://ftpmirror.gnu.org/autoconf 18 | 19 | AUTOCONF="autoconf-2.69" 20 | cd $build 21 | curl -k -OL https://ftpmirror.gnu.org/autoconf/$AUTOCONF.tar.gz || exit 1 22 | ls -l $AUTOCONF.tar.gz 23 | tar xzf $AUTOCONF.tar.gz || xxd $AUTOCONF.tar.gz || exit 1 24 | cd $AUTOCONF 25 | ./configure --prefix=/usr/local || exit 1 26 | make || exit 1 27 | sudo make install || exit 1 28 | 29 | ## 30 | # Automake 31 | # https://ftpmirror.gnu.org/automake 32 | 33 | AUTOMAKE="automake-1.16.3" 34 | cd $build 35 | curl -k -OL https://ftpmirror.gnu.org/automake/$AUTOMAKE.tar.gz || exit 1 36 | ls -l $AUTOMAKE.tar.gz 37 | tar xzf $AUTOMAKE.tar.gz 38 | cd $AUTOMAKE 39 | ./configure --prefix=/usr/local 40 | make 41 | sudo make install 42 | 43 | ## 44 | # Libtool 45 | # https://ftpmirror.gnu.org/libtool 46 | 47 | LIBTOOL=libtool-2.4.6 48 | cd $build 49 | curl -k -OL https://ftpmirror.gnu.org/libtool/$LIBTOOL.tar.gz || exit 1 50 | ls -l $LIBTOOL.tar.gz 51 | tar xzf $LIBTOOL.tar.gz 52 | cd $LIBTOOL 53 | ./configure --prefix=/usr/local 54 | make 55 | sudo make install 56 | 57 | echo "Installation complete." 
58 | -------------------------------------------------------------------------------- /m4/slg_address_sanitizer.m4: -------------------------------------------------------------------------------- 1 | ################################################################ 2 | ## AddressSanitizer support 3 | # https://github.com/libMesh/libmesh/issues/1396 4 | AC_ARG_ENABLE([address-sanitizer], 5 | [AS_HELP_STRING([--enable-address-sanitizer], 6 | [enabled AddressSanitizer support for detecting a wide variety of 7 | memory allocation and deallocation errors])], 8 | [AC_DEFINE(HAVE_ADDRESS_SANITIZER, 1, [enable AddressSanitizer]) 9 | address_sanitizer="yes" 10 | CXXFLAGS="$CXXFLAGS -fsanitize=address -fsanitize-address-use-after-scope" 11 | ], 12 | []) 13 | 14 | AC_ARG_ENABLE([thread-sanitizer], 15 | [AS_HELP_STRING([--enable-thread-sanitizer], 16 | [enabled ThreadSanitizer support for detecting a wide variety of 17 | thread interlocking errors])], 18 | [AC_DEFINE(HAVE_THREAD_SANITIZER, 1, [enable ThreadSanitizer]) 19 | thread_sanitizer="yes" 20 | CXXFLAGS="$CXXFLAGS -fsanitize=thread " 21 | ], 22 | []) 23 | 24 | AC_ARG_ENABLE([undefined-sanitizer], 25 | [AS_HELP_STRING([--enable-undefined-sanitizer], 26 | [enabled UndefinedSanitizer support for detecting a wide variety of undefined])], 27 | [AC_DEFINE(HAVE_UNDEFINED_SANITIZER, 1, [enable UndefinedSanitizer]) 28 | undefined_sanitizer="yes" 29 | CXXFLAGS="$CXXFLAGS -fsanitize=undefined " 30 | ], 31 | []) 32 | -------------------------------------------------------------------------------- /Makefile.am: -------------------------------------------------------------------------------- 1 | # be20_api Makefile.am 2 | # This file is compiled with automake to create Makefile.in. 3 | # Makefile.in is transformed by "configure" to create Makefile 4 | # 5 | # (C) 2020-2022 Simson L. Garfinkel 6 | # (C) 2020-2023 BasisTech LLC 7 | # https://www.gnu.org/licenses/lgpl-3.0.en.html 8 | 9 | # don't include bootstrap. 
name: BE20_API CI Windows
on:
  pull_request:
    branches: [ main ]
  push:
    branches: [ main ]


jobs:
  build:
    runs-on: 'windows-latest'
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          submodules: recursive

      # A problem matcher only annotates output produced by steps that run
      # after it is registered, so it has to come before configure and make.
      - uses: ammaraskar/gcc-problem-matcher@master
        name: GCC Problem Matcher

      # ----------------------------
      # Windows ( mSYS2 UCRT64)
      # ----------------------------
      - name: Setup MSYS2
        uses: msys2/setup-msys2@v2
        with:
          update: true
          msystem: ucrt64
          path-type: inherit

      - name: Install Windows dependencies and bootstrap
        shell: msys2 {0}
        env:
          WANT_AUTOCONF: "2.71"
        run: |
          pacman -Syu --noconfirm
          pacman -S --needed --noconfirm base-devel automake autoconf pkgconf mingw-w64-ucrt-x86_64-gcc mingw-w64-ucrt-x86_64-make \
            mingw-w64-ucrt-x86_64-re2 mingw-w64-ucrt-x86_64-abseil-cpp mingw-w64-ucrt-x86_64-sqlite3 mingw-w64-ucrt-x86_64-openssl \
            mingw-w64-ucrt-x86_64-expat
          bash bootstrap.sh

      - name: configure for windows (ucrt64)
        shell: msys2 {0}
        run: |
          ./configure --prefix=/ucrt64

      # ----------------------------
      # build + test windows
      # ----------------------------
      - name: make check
        shell: msys2 {0}
        run: |
          make
          ./test_be20_api.exe
          make check || (for fn in test*.log ; do echo ""; echo $fn ; cat $fn ; done; exit 1)
#include <iostream>
#include <regex>
#include <string>

/**
 * Small demonstration of range-based for and of std::regex matching,
 * sub-match extraction and multi-group extraction on a list of file names.
 *
 * The demonstration used to call exit(0) right after the first loop, which
 * left every regex example below it unreachable; all sections now run.
 */
int main() {
    // Range-based for over the characters of a string.
    const std::string s = "my mother mary.";
    for (const auto& it : s) { std::cout << "it=" << it << "\n"; }

    // Simple regular expression matching
    const std::string fnames[] = {"foo.txt", "bar.txt", "baz.dat", "zoidberg"};
    const std::regex txt_regex("[a-z]+\\.txt");

    for (const auto& fname : fnames) {
        std::cout << fname << ": " << std::regex_match(fname, txt_regex) << '\n';
    }

    // Extraction of a sub-match
    const std::regex base_regex("([a-z]+)\\.txt");
    std::smatch base_match;

    for (const auto& fname : fnames) {
        if (!std::regex_match(fname, base_match, base_regex)) { continue; }
        // The first sub_match is the whole string; the next
        // sub_match is the first parenthesized expression.
        if (base_match.size() == 2) {
            const std::string base = base_match[1].str();
            std::cout << fname << " has a base of " << base << '\n';
        }
    }

    // Extraction of several sub-matches
    const std::regex pieces_regex("([a-z]+)\\.([a-z]+)");
    std::smatch pieces_match;

    for (const auto& fname : fnames) {
        if (!std::regex_match(fname, pieces_match, pieces_regex)) { continue; }
        std::cout << fname << '\n';
        for (size_t i = 0; i < pieces_match.size(); ++i) {
            const std::string piece = pieces_match[i].str();
            std::cout << "  submatch " << i << ": " << piece << '\n';
        }
    }
    return 0;
}
3 | * The goal is to have complete test coverage of the v2 API 4 | * 5 | */ 6 | 7 | // https://github.com/catchorg/Catch2/blob/master/docs/tutorial.md#top 8 | 9 | #define CATCH_CONFIG_CONSOLE_WIDTH 120 10 | 11 | #include "config.h" 12 | #include "catch.hpp" 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | 28 | #include "dfxml_cpp/src/hash_t.h" 29 | #include "dfxml_cpp/src/dfxml_writer.h" 30 | 31 | #include "atomic_unicode_histogram.h" 32 | #include "sbuf.h" 33 | #include "sbuf_stream.h" 34 | #include "scanner_set.h" 35 | #include "threadpool.h" 36 | #include "utils.h" 37 | 38 | #ifndef O_BINARY 39 | #define O_BINARY 0 40 | #endif 41 | 42 | [[noreturn]] void alarm_handler(int signal) 43 | { 44 | std::cerr << "alarm\n"; 45 | throw std::runtime_error("scanner_set_mt timeout"); 46 | } 47 | 48 | // This will give an error unless run with MallocNanoZone=0 49 | TEST_CASE("scanner_set_mt", "[thread_pool]") { 50 | std::cout << std::endl << "This will take at least 60 seconds. Don't give up..." 
<< std::endl; 51 | INFO("scanner_set_mt test start"); 52 | std::atomic done{false}; 53 | 54 | std::thread watchdog([&] { 55 | using namespace std::chrono_literals; 56 | std::this_thread::sleep_for(60s); 57 | if (!done) { 58 | FAIL("scanner_set_mt test timed out"); 59 | } 60 | }); 61 | 62 | scanner_config sc; 63 | feature_recorder_set::flags_t f; 64 | scanner_set ss(sc, f, nullptr); 65 | ss.launch_workers(12); 66 | ss.set_spin_poll_time(1); 67 | ss.join(); 68 | 69 | done = true; 70 | watchdog.join(); 71 | } 72 | -------------------------------------------------------------------------------- /sbuf_stream.h: -------------------------------------------------------------------------------- 1 | /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ 2 | #ifndef SBUF_STREAM_H 3 | #define SBUF_STREAM_H 4 | 5 | #include "sbuf.h" 6 | 7 | /** \addtogroup bulk_extractor_APIs 8 | * @{ 9 | */ 10 | 11 | /** \file */ 12 | /** 13 | * sbuf_stream provides the get services of sbuf_t but wrapped in a Stream interface. 14 | * Note that sbuf_stream is not particularly optimized; it is simply a wrapper. 15 | * Right now this is only used by scan_winprefetch. It could become a general iterator. 
16 | */ 17 | class sbuf_stream { 18 | private: 19 | const sbuf_t& sbuf; 20 | size_t offset {}; 21 | 22 | public: 23 | sbuf_stream(const sbuf_t& sbuf_); 24 | ~sbuf_stream(); 25 | void seek(size_t offset); 26 | size_t tell(); 27 | 28 | /** 29 | * \name integer-based stream readers 30 | * @{ */ 31 | uint8_t get8u(); 32 | uint16_t get16u(); 33 | uint32_t get32u(); 34 | uint64_t get64u(); 35 | 36 | uint8_t get8uBE(); 37 | uint16_t get16uBE(); 38 | uint32_t get32uBE(); 39 | uint64_t get64uBE(); 40 | 41 | uint8_t get8u(sbuf_t::byte_order_t bo); 42 | uint16_t get16u(sbuf_t::byte_order_t bo); 43 | uint32_t get32u(sbuf_t::byte_order_t bo); 44 | uint64_t get64u(sbuf_t::byte_order_t bo); 45 | 46 | int8_t get8i(); 47 | int16_t get16i(); 48 | int32_t get32i(); 49 | int64_t get64i(); 50 | 51 | int8_t get8iBE(); 52 | int16_t get16iBE(); 53 | int32_t get32iBE(); 54 | int64_t get64iBE(); 55 | 56 | int8_t get8i(sbuf_t::byte_order_t bo); 57 | int16_t get16i(sbuf_t::byte_order_t bo); 58 | int32_t get32i(sbuf_t::byte_order_t bo); 59 | int64_t get64i(sbuf_t::byte_order_t bo); 60 | /** @} */ 61 | 62 | /** 63 | * \name string and wstring stream readers 64 | * @{ */ 65 | std::string getUTF8(); 66 | std::string getUTF8(size_t num_octets_requested ); 67 | std::wstring getUTF16(); 68 | std::wstring getUTF16(size_t num_code_units_requested); 69 | /** @} */ 70 | }; 71 | 72 | #endif 73 | -------------------------------------------------------------------------------- /configure.ac: -------------------------------------------------------------------------------- 1 | AC_PREREQ([2.71]) 2 | AC_INIT([BE20_API],[2.1.0],[bugs@digitalcorpora.org]) 3 | AC_CONFIG_SRCDIR([Makefile.am]) dnl or src/main.c / src/... 
AC_CONFIG_AUX_DIR([build-aux])
AC_USE_SYSTEM_EXTENSIONS
AM_INIT_AUTOMAKE

dnl Project mix-ins: header/function/library checks, dfxml checks, sanitizer
dnl and optimization switches.
m4_include([be20_configure.m4])
m4_include([dfxml_cpp/src/dfxml_configure.m4])
m4_include([m4/slg_address_sanitizer.m4])
m4_include([m4/slg_noopt.m4])

AC_LANG([C++])
AC_PROG_CC
AC_PROG_CXX

AC_CONFIG_HEADERS([config.h])
AC_CONFIG_FILES([Makefile ])

dnl Require a compiler that supports C++17, with no GNU extensions; configure
dnl fails if none is found. The standard requested here is 17, not 20.
AX_CXX_COMPILE_STDCXX([17], [noext], [mandatory])

dnl Optional: pkg-config and deps
PKG_PROG_PKG_CONFIG
dnl PKG_CHECK_MODULES([DEPS], [foo >= 1.2 bar])

dnl Optional feature toggles (asan example)
dnl NOTE(review): m4/slg_address_sanitizer.m4 also provides
dnl --enable-address-sanitizer; the two options overlap.
AC_ARG_ENABLE([asan],
  [AS_HELP_STRING([--enable-asan], [Build with AddressSanitizer])],
  [], [enable_asan=no])
AS_IF([test "x$enable_asan" = "xyes"], [
  dnl Append rather than overwrite; keep user flags intact
  CXXFLAGS="$CXXFLAGS -fsanitize=address -fno-omit-frame-pointer"
  LDFLAGS="$LDFLAGS -fsanitize=address"
])


################################################################
# Take out duplicate flags
# Each variable is split into one word per line, passed through sort -u and
# re-joined, so duplicates are dropped and the words end up in sorted order.
# NOTE(review): the last line stores the de-duplicated LDFLAGS in AM_LDFLAGS,
# not LDFLAGS, and no AC_SUBST of AM_LDFLAGS is visible in this file - confirm
# whether the assignment has any effect.
CFLAGS=$(echo $CFLAGS | tr ' ' '\n' | sort -u | tr '\n' ' ')
CPPFLAGS=$(echo $CPPFLAGS | tr ' ' '\n' | sort -u | tr '\n' ' ')
CXXFLAGS=$(echo $CXXFLAGS | tr ' ' '\n' | sort -u | tr '\n' ' ')
AM_LDFLAGS=$(echo $LDFLAGS | tr ' ' '\n' | sort -u | tr '\n' ' ')

dnl Configuration summary.
AC_MSG_NOTICE([*************************************])
AC_MSG_NOTICE([*************************************])
AC_MSG_NOTICE([ PACKAGE_NAME: $PACKAGE_NAME])
AC_MSG_NOTICE([ PACKAGE_VERSION: $PACKAGE_VERSION])
AC_MSG_NOTICE([ CC: $CC])
AC_MSG_NOTICE([ CXX: $CXX])
AC_MSG_NOTICE([ CPPFLAGS: $CPPFLAGS])
AC_MSG_NOTICE([ CFLAGS: $CFLAGS])
AC_MSG_NOTICE([ CXXFLAGS: $CXXFLAGS])
AC_MSG_NOTICE([ LIBS: $LIBS])
AC_MSG_NOTICE([ LDFLAGS: $LDFLAGS])


AC_OUTPUT
/scanner_params.cpp: -------------------------------------------------------------------------------- 1 | #include "config.h" 2 | #include "scanner_params.h" 3 | #include "feature_recorder.h" 4 | #include "feature_recorder_set.h" 5 | #include "scanner_set.h" 6 | #include "path_printer.h" 7 | 8 | scanner_params::scanner_params(struct scanner_config &sc_, class scanner_set *ss_, 9 | const path_printer *pp_, phase_t phase_, const sbuf_t* sbuf_) 10 | : sc(sc_), ss(ss_), pp(pp_), phase(phase_), sbuf(sbuf_) 11 | { 12 | } 13 | 14 | scanner_params::scanner_params(const scanner_params& sp_existing, const sbuf_t* sbuf_, std::string pp_path_) 15 | : sc(sp_existing.sc), ss(sp_existing.ss), pp(sp_existing.pp), phase(sp_existing.phase), sbuf(sbuf_), 16 | pp_path(pp_path_), pp_po(sp_existing.pp_po) 17 | { 18 | } 19 | 20 | 21 | /* This interface creates if we are in init phase, doesn't if we are in scan phase */ 22 | feature_recorder& scanner_params::named_feature_recorder(const std::string feature_recorder_name) const 23 | { 24 | assert(ss!=nullptr); 25 | return ss->named_feature_recorder(feature_recorder_name); 26 | } 27 | 28 | /* 29 | * Allow call by scanners using the sp. Currently used in scan_zip 30 | */ 31 | bool scanner_params::check_previously_processed(const sbuf_t &s) const 32 | { 33 | assert(ss!=nullptr); 34 | return ss->previously_processed_count(s)==0; 35 | } 36 | 37 | void scanner_params::recurse(const sbuf_t* new_sbuf) const { 38 | if (pp!=nullptr) { // we have a path printer; call that instead 39 | scanner_params sp_new(*this, new_sbuf, this->pp_path); 40 | try { 41 | pp->process_sp( sp_new ); // where do we keep the path being processed? In scanner_params... 42 | } 43 | catch (path_printer::path_printer_finished &e) { 44 | delete new_sbuf; // make sure it gets deleted 45 | throw; // and re-throw 46 | } 47 | delete new_sbuf; // and now we are done with it. 
48 | return; 49 | } 50 | 51 | assert(ss!=nullptr); // make sure there is a scanner set if we are descending 52 | // In normal operations we recurse. However, in unit testing recursion is sometimes intentionally disabled. 53 | // In such a situation, the sbuf is just deleted. 54 | if (ss->allow_recurse()) { 55 | ss->schedule_sbuf(new_sbuf); /* sbuf will be deleted after it is processed */ 56 | } else { 57 | delete new_sbuf; // just delete it 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /be20_configure.m4: -------------------------------------------------------------------------------- 1 | # 2 | # mix-ins for be20_api 3 | # 4 | 5 | AC_MSG_NOTICE([be20_api/be20_configure.m4 start]) 6 | AC_DEFINE(BE20_CONFIGURE_APPLIED, 1, [be20_configure.m4 was included by autoconf.ac]) 7 | 8 | ################################################################ 9 | ## Endian check. Used for sbuf code. 10 | AC_C_BIGENDIAN([AC_DEFINE(BE20_API_BIGENDIAN, 1, [Big Endian aarchitecutre - like M68K])], 11 | AC_DEFINE(BE20_API_LITTLEENDIAN, 1, [Little Endian aarchitecutre - like x86])) 12 | 13 | ################################################################ 14 | ## Headers 15 | AC_CHECK_HEADERS([ dlfcn.h fcntl.h limits.h limits/limits.h linux/if_ether.h net/ethernet.h netinet/if_ether.h netinet/in.h pcap.h pcap/pcap.h sqlite3.h sys/cdefs.h sys/mman.h sys/stat.h sys/time.h sys/types.h sys/vmmeter.h unistd.h windows.h windows.h windowsx.h winsock2.h wpcap/pcap.h mach/mach.h mach-o/dyld.h]) 16 | 17 | AC_CHECK_FUNCS([gmtime_r ishexnumber isxdigit localtime_r unistd.h mmap err errx warn warnx pread64 pread strptime _lseeki64 task_info utimes host_statistics64]) 18 | 19 | ################################################################ 20 | ## Libraries 21 | ## Note that we now require pkg-config 22 | 23 | AC_CHECK_LIB([sqlite3],[sqlite3_libversion]) 24 | AC_CHECK_FUNCS([sqlite3_create_function_v2 sysctlbyname]) 25 | 26 | 
AC_MSG_NOTICE([be20_configure: CPPFLAGS are now $CPPFLAGS])

# re2
# Located through pkg-config, which sets RE2_CFLAGS and RE2_LIBS on success.
# HAVE_RE2 is defined once; a second identical AC_DEFINE added nothing.
# A missing library is reported with AC_MSG_WARN so that it stands out in the
# configure output; configure still continues, as before.
AC_LANG_PUSH(C++)
AC_CHECK_HEADERS([re2/re2.h])
PKG_CHECK_MODULES([RE2], [re2],
  [
    AC_MSG_NOTICE([re2 detected])
    AC_DEFINE([HAVE_RE2], [1], [Define if you have the RE2 library]) ],
  [AC_MSG_WARN([Could not find RE2 library. Please install libre2-dev or equivalent.])]
)
AC_LANG_POP()

################################################################
## Check on two annoying warnings
## Each test compiles a file containing the pragma and defines the matching
## HAVE_DIAGNOSTIC_* macro when the compile succeeds.
AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
  [[#pragma GCC diagnostic ignored "-Wredundant-decls"
    int a=3;
  ]])],
  [AC_DEFINE(HAVE_DIAGNOSTIC_REDUNDANT_DECLS,1,[define 1 if GCC supports -Wredundant-decls])]
)

AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
  [[#pragma GCC diagnostic ignored "-Wcast-align"
    int a=3;
  ]])],
  [AC_DEFINE(HAVE_DIAGNOSTIC_CAST_ALIGN,1,[define 1 if GCC supports -Wcast-align])]
)
AC_MSG_NOTICE([be20_api/be20_configure.m4 end])

# Take out duplicate flags
# NOTE(review): sort -u also reorders the words; link order in RE2_LIBS can
# matter for static libraries - confirm this is acceptable.
RE2_CFLAGS=$(echo $RE2_CFLAGS | tr ' ' '\n' | sort -u | tr '\n' ' ')
RE2_LIBS=$(echo $RE2_LIBS | tr ' ' '\n' | sort -u | tr '\n' ' ')
12 | // https://stackoverflow.com/questions/951234/forward-declaration-of-nested-types-classes-in-c 13 | 14 | struct PrintOptions : public std::map { 15 | static inline const std::string HTTP_EOL {"\r\n"}; // stdout is in binary form 16 | static inline const size_t DEFAULT_BUFSIZE = 16384; 17 | enum print_mode_t { MODE_NONE = 0, MODE_HEX, MODE_RAW, MODE_HTTP }; 18 | print_mode_t print_mode {MODE_NONE}; 19 | size_t process_path_bufsize {DEFAULT_BUFSIZE}; 20 | bool http_mode {false}; 21 | std::string get(std::string key, std::string default_) const; 22 | void add_rfc822_header(std::ostream &os, std::string line); 23 | size_t content_length {0}; 24 | }; 25 | 26 | 27 | class path_printer { 28 | class scanner_set &ss; 29 | abstract_image_reader *reader {nullptr}; 30 | mutable std::stringstream os {}; // for temp creation 31 | std::ostream &out; // for output 32 | path_printer(const path_printer &) = delete; 33 | path_printer &operator=(const path_printer &) = delete; 34 | 35 | public:; 36 | class path_printer_finished: public std::exception { 37 | public: 38 | virtual const char *what() const throw() { 39 | return "path printer finished."; 40 | } 41 | }; 42 | 43 | 44 | 45 | path_printer(scanner_set &ss_, abstract_image_reader *reader_, std::ostream &out); 46 | static inline const std::string PRINT {"PRINT"}; 47 | static inline const std::string CONTENT_LENGTH {"Content-Length"}; 48 | static inline const std::string DEFAULT_CONTENT_LENGTH {"4096"}; 49 | 50 | static std::string lowerstr(const std::string str); 51 | static std::string get_and_remove_token(std::string &path); 52 | 53 | void process_sp( const scanner_params &sp ) const; // called recursively by sp.recurse() 54 | void display_path( std::string path, const PrintOptions &po) const; // entry point for process() command 55 | 56 | void process_path(std::string path) ; // main entrance point to display a path, output to os 57 | void process_interactive(std::istream &is) ; // run an interactive server on is 58 | 
void process_http(std::istream &is); // read an HTTP command from is and send result to os 59 | }; 60 | 61 | #endif 62 | -------------------------------------------------------------------------------- /scan_sha1_test.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * scan_sha1: 4 | * plug-in demonstration that shows how to write a simple plug-in scanner that calculates 5 | * the SHA1 of each sbuf. The hash is written to both the XML file and to the sha1 feature file. 6 | * 7 | * Don't use this in production systems! It has a histogram that isn't useful for most applications. 8 | */ 9 | 10 | #include "config.h" // needed for hash_t 11 | 12 | #include 13 | #include 14 | 15 | #include "dfxml_cpp/src/hash_t.h" 16 | #include "dfxml_cpp/src/dfxml_writer.h" 17 | #include "scan_sha1_test.h" 18 | #include "scanner_params.h" 19 | #include "scanner_set.h" 20 | 21 | feature_recorder *sha1_recorder = nullptr; 22 | void scan_sha1_test(struct scanner_params& sp) { 23 | if (sp.phase == scanner_params::PHASE_INIT) { 24 | /* Create a scanner_info block to register this scanner */ 25 | sp.info->set_name("sha1_test"); 26 | sp.info->author = "Simson L. Garfinkel"; 27 | sp.info->description = "Compute the SHA1 of every sbuf."; 28 | sp.info->url = "https://digitalcorpora.org/bulk_extractor"; 29 | sp.info->scanner_version = "1.0.0"; 30 | sp.info->pathPrefix = "SHA1"; // just use SHA1 31 | sp.info->min_sbuf_size = 1; // we can hash a single byte 32 | 33 | // specify the feature_records that the scanner wants. 34 | // Note that the feature recorder does not need to be the same name as the scanner 35 | // scanners may specify any number of feature recorders. 36 | sp.info->feature_defs.push_back( feature_recorder_def("sha1_bufs") ); 37 | 38 | // Note that histogram_defs is a set, so it's okay if this initialization routine is called twice, 39 | // the histogram only gets inserted once. 
/**
 * A std::set guarded by a mutex: every public member takes the lock for the
 * duration of the call, so individual operations are safe to call from
 * several threads. The combined check-and-modify members perform their test
 * and their update under a single lock acquisition.
 *
 * note: do not use const TYPE &s for signatures; it caused deadlocks.
 */
template <class TYPE> class atomic_set {
    // Mutex M protects myset.
    // It is mutable to allow modification in const methods
    mutable std::mutex M{};
    std::set<TYPE> myset{};

public:
    atomic_set() {}
    ~atomic_set() {
        const std::lock_guard<std::mutex> lock(M);
        myset.clear();
    }

    /* Remove every element. */
    void clear() {
        const std::lock_guard<std::mutex> lock(M);
        myset.clear();
    }

    /* Returns true if s is in the set. */
    bool contains(const TYPE& s) const {
        const std::lock_guard<std::mutex> lock(M);
        return myset.find(s) != myset.end();
    }

    /* Add s; a value already present is left unchanged. */
    void insert(const TYPE s) {
        const std::lock_guard<std::mutex> lock(M);
        myset.insert(s);
    }

    /* Remove s; removing an absent value is a no-op. */
    void erase(const TYPE s) {
        const std::lock_guard<std::mutex> lock(M);
        myset.erase(s);
    }

    /* Returns true if s was already in the set, false if it was not.
     * After return, s is in the set.
     */
    bool check_for_presence_and_insert(const TYPE s) {
        const std::lock_guard<std::mutex> lock(M);
        // insert() reports through .second whether a new element was added,
        // so one lookup answers the question and performs the insertion.
        return !myset.insert(s).second;
    }

    /* Returns true if s was in the set, false if it was not.
     * After return, s is not in the set.
     */
    bool check_for_presence_and_erase(const TYPE s) {
        const std::lock_guard<std::mutex> lock(M);
        // erase(key) returns the number of elements removed (0 or 1 for a set).
        return myset.erase(s) > 0;
    }

    /* returns the count, not the bytes */
    size_t size() const {
        const std::lock_guard<std::mutex> lock(M);
        return myset.size();
    }

    /* like python .keys(): a snapshot copy of the elements, in set order */
    std::vector<TYPE> keys() const {
        const std::lock_guard<std::mutex> lock(M);
        return std::vector<TYPE>(myset.begin(), myset.end());
    }
};
24 | * We don't have that, so do the rest in utf8 25 | */ 26 | 27 | /* Convert match string to u8key */ 28 | std::string u8key = convert_utf32_to_utf8(u32key); 29 | 30 | if (require.size() > 0 ){ 31 | 32 | /* If a string is required and it is not present, return */ 33 | if (flags.require_feature && u8key.find(require) == std::string::npos) { 34 | return false; 35 | } 36 | 37 | if (flags.require_context && context.find(require) == std::string::npos) { 38 | return false; 39 | } 40 | } 41 | 42 | /* Check for pattern */ 43 | if (pattern.size() > 0) { 44 | std::smatch m{}; 45 | std::regex_search(u8key, m, this->reg); 46 | if (m.empty() == true) { // match does not exist 47 | return false; // regex not found 48 | } 49 | u8key = m.str(); 50 | } 51 | 52 | if (displayString) { *displayString = u8key; } 53 | return true; 54 | } 55 | 56 | bool histogram_def::match(std::string u32key, std::string* displayString, const std::string &context) const { 57 | return match(convert_utf8_to_utf32(u32key), displayString, context); 58 | } 59 | 60 | std::ostream& operator<<(std::ostream& os, const histogram_def::flags_t& f) { 61 | os << " "; 67 | return os; 68 | } 69 | 70 | 71 | std::ostream& operator<<(std::ostream& os, const histogram_def& hd) { 72 | os << ""; 74 | return os; 75 | } 76 | -------------------------------------------------------------------------------- /Makefile.defs: -------------------------------------------------------------------------------- 1 | # including be20_api/Makefile.defs 2 | BE20_API_SRC= \ 3 | $(BE20_API_DIR)/aftimer.h \ 4 | $(BE20_API_DIR)/abstract_image_reader.h \ 5 | $(BE20_API_DIR)/abstract_image_reader.cpp \ 6 | $(BE20_API_DIR)/atomic_map.h \ 7 | $(BE20_API_DIR)/atomic_set.h \ 8 | $(BE20_API_DIR)/atomic_unicode_histogram.cpp \ 9 | $(BE20_API_DIR)/atomic_unicode_histogram.h \ 10 | $(BE20_API_DIR)/char_class.h \ 11 | $(BE20_API_DIR)/feature_recorder.cpp \ 12 | $(BE20_API_DIR)/feature_recorder.h \ 13 | $(BE20_API_DIR)/feature_recorder_file.cpp \ 14 | 
$(BE20_API_DIR)/feature_recorder_file.h \ 15 | $(BE20_API_DIR)/feature_recorder_set.cpp \ 16 | $(BE20_API_DIR)/feature_recorder_set.h \ 17 | $(BE20_API_DIR)/feature_recorder_sql.cpp \ 18 | $(BE20_API_DIR)/feature_recorder_sql.h \ 19 | $(BE20_API_DIR)/formatter.h \ 20 | $(BE20_API_DIR)/histogram_def.cpp \ 21 | $(BE20_API_DIR)/histogram_def.h \ 22 | $(BE20_API_DIR)/machine_stats.h \ 23 | $(BE20_API_DIR)/net_ethernet.h \ 24 | $(BE20_API_DIR)/packet_info.h \ 25 | $(BE20_API_DIR)/path_printer.h \ 26 | $(BE20_API_DIR)/path_printer.cpp \ 27 | $(BE20_API_DIR)/pcap_fake.cpp \ 28 | $(BE20_API_DIR)/pcap_fake.h \ 29 | $(BE20_API_DIR)/pos0.cpp \ 30 | $(BE20_API_DIR)/pos0.h \ 31 | $(BE20_API_DIR)/regex_vector.cpp \ 32 | $(BE20_API_DIR)/regex_vector.h \ 33 | $(BE20_API_DIR)/sbuf.cpp \ 34 | $(BE20_API_DIR)/sbuf.h \ 35 | $(BE20_API_DIR)/sbuf_stream.h \ 36 | $(BE20_API_DIR)/sbuf_stream.cpp \ 37 | $(BE20_API_DIR)/scan_sha1_test.cpp \ 38 | $(BE20_API_DIR)/scan_sha1_test.h \ 39 | $(BE20_API_DIR)/scanner_config.cpp \ 40 | $(BE20_API_DIR)/scanner_config.h \ 41 | $(BE20_API_DIR)/scanner_params.cpp \ 42 | $(BE20_API_DIR)/scanner_params.h \ 43 | $(BE20_API_DIR)/scanner_set.cpp \ 44 | $(BE20_API_DIR)/scanner_set.h \ 45 | $(BE20_API_DIR)/thread-pool/thread_pool.hpp \ 46 | $(BE20_API_DIR)/threadpool.h \ 47 | $(BE20_API_DIR)/threadpool.cpp \ 48 | $(BE20_API_DIR)/unicode_escape.cpp \ 49 | $(BE20_API_DIR)/unicode_escape.h \ 50 | $(BE20_API_DIR)/utfcpp/source/utf8.h \ 51 | $(BE20_API_DIR)/utfcpp/source/utf8/checked.h \ 52 | $(BE20_API_DIR)/utfcpp/source/utf8/core.h \ 53 | $(BE20_API_DIR)/utfcpp/source/utf8/cpp11.h \ 54 | $(BE20_API_DIR)/utfcpp/source/utf8/cpp17.h \ 55 | $(BE20_API_DIR)/utfcpp/source/utf8/unchecked.h \ 56 | $(BE20_API_DIR)/utils.cpp \ 57 | $(BE20_API_DIR)/utils.h \ 58 | $(BE20_API_DIR)/word_and_context_list.cpp \ 59 | $(BE20_API_DIR)/word_and_context_list.h \ 60 | $(BE20_API_DIR)/dfxml_cpp/src/dfxml_writer.h \ 61 | $(BE20_API_DIR)/dfxml_cpp/src/hash_t.h \ 62 | 
name: BE20_API CI Ubuntu and Mac
on:
  pull_request:
    branches: [ main ]
  push:
    branches: [ main ]


jobs:
  build:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: ['macos-latest','ubuntu-latest']

    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          submodules: recursive

      # A problem matcher only annotates output produced by steps that run
      # after it is registered, so it has to come before configure and make.
      - uses: ammaraskar/gcc-problem-matcher@master
        name: GCC Problem Matcher

      # ----------------------------
      # MacOS
      # ----------------------------
      - name: Install MacOS dependencies and run bootstrap
        if: startsWith( matrix.os, 'macos')
        run: |
          brew install autoconf automake libtool abseil pkg-config pcre re2
          autoreconf -fi

      # ----------------------------
      # Ubuntu
      # ----------------------------
      - name: Install Ubuntu dependencies and run bootstrap
        if: startsWith( matrix.os, 'ubuntu')
        run: |
          sudo apt update -y
          sudo apt install -y autoconf automake g++ lcov libtool libssl-dev libabsl-dev libre2-dev make pkg-config zlib1g-dev
          autoreconf -fi

      # ----------------------------
      # Configure for each OS
      # ----------------------------
      - name: configure for ubuntu with codecov
        if: startsWith( matrix.os, 'ubuntu')
        run: |
          ./configure --disable-opt --enable-address-sanitizer \
             CFLAGS='-g -O0 -fprofile-arcs -ftest-coverage' \
             CXXFLAGS='-g -O0 -fprofile-arcs -ftest-coverage' \
             LIBS='-lgcov'

      - name: configure for macOS
        if: startsWith( matrix.os, 'macos')
        run: |
          ./configure --enable-maintainer-mode --enable-address-sanitizer --disable-opt --enable-silent-rules

      # ----------------------------
      # Common build + test - not windows
      # ----------------------------
      - name: make check
        run: |
          make test_be20_api
          ./test_be20_api
          make check || (for fn in test*.log ; do echo ""; echo $fn ; cat $fn ; done; exit 1)

      - name: list files
        run: |
          find . -ls

      - name: Generate coverage report for ubuntu
        if: startsWith( matrix.os, 'ubuntu')
        run: |
          lcov --capture --directory . --output-file coverage.info
          #lcov --remove linux-coverage.info '/usr/*' --output-file linux-coverage.info
          lcov --list coverage.info

      - name: Upload coverage to Codecov
        if: startsWith( matrix.os, 'ubuntu')
        uses: codecov/codecov-action@v5
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          fail_ci_if_error: false
          files: coverage.info
          flags: unittests
          name: be20_api-codecov
20 | */ 21 | #define PCAP_VERSION_MAJOR 2 22 | #define PCAP_VERSION_MINOR 4 23 | #define PCAP_ERRBUF_SIZE 256 24 | 25 | struct pcap_file_header { 26 | uint32_t magic; // d4 c3 b2 a1 27 | uint16_t version_major; // 02 00 28 | uint16_t version_minor; // 04 00 29 | int32_t thiszone; /* gmt to local correction - 00 00 00 00*/ 30 | uint32_t sigfigs; /* accuracy of timestamps */ 31 | uint32_t snaplen; /* max length saved portion of each pkt */ 32 | uint32_t linktype; /* data link type (LINKTYPE_*) */ 33 | } __attribute__((packed)); 34 | struct pcap_pkthdr { 35 | struct timeval ts; /* time stamp; native */ 36 | uint32_t caplen; /* length of portion present */ 37 | uint32_t len; /* length this packet (off wire) */ 38 | } __attribute__((packed)); 39 | 40 | /* What we need after opening the file to process each next packet */ 41 | typedef struct pcap pcap_t; 42 | 43 | /* 44 | * Taken from pcap-int.h 45 | */ 46 | // typedef int (*setfilter_op_t)(pcap_t *, struct bpf_program *); 47 | typedef void (*pcap_handler)(uint8_t*, const struct pcap_pkthdr*, const uint8_t*); 48 | 49 | struct bpf_program { 50 | int valid; // set true if filter is valid 51 | }; 52 | 53 | char* pcap_lookupdev(char*); // not implemented 54 | pcap_t* pcap_open_live(const char*, int, int, int, char*); // not implemented 55 | pcap_t* pcap_open_offline(const char*, char*); // open the file; set f 56 | pcap_t* pcap_fopen_offline(FILE* fp, char* errbuf); 57 | void pcap_close(pcap_t*); // close the file 58 | int pcap_loop(pcap_t*, int, pcap_handler, uint8_t*); // read the file and call loopback on each packet 59 | int pcap_datalink(pcap_t*); // noop 60 | int pcap_setfilter(pcap_t*, struct bpf_program*); // noop 61 | int pcap_compile(pcap_t*, struct bpf_program*, const char*, int, uint32_t); // generate error if filter provided 62 | char* pcap_geterr(pcap_t*); 63 | /* 64 | * These are the types that are the same on all platforms, and that 65 | * have been defined by for ages. 
66 | */ 67 | #define DLT_NULL 0 /* BSD loopback encapsulation */ 68 | #define DLT_EN10MB 1 /* Ethernet (10Mb) */ 69 | #define DLT_EN3MB 2 /* Experimental Ethernet (3Mb) */ 70 | #define DLT_AX25 3 /* Amateur Radio AX.25 */ 71 | #define DLT_PRONET 4 /* Proteon ProNET Token Ring */ 72 | #define DLT_CHAOS 5 /* Chaos */ 73 | #define DLT_IEEE802 6 /* 802.5 Token Ring */ 74 | #define DLT_ARCNET 7 /* ARCNET, with BSD-style header */ 75 | #define DLT_SLIP 8 /* Serial Line IP */ 76 | #define DLT_PPP 9 /* Point-to-point Protocol */ 77 | #define DLT_FDDI 10 /* FDDI */ 78 | #define DLT_RAW 101 /* just packets */ 79 | 80 | __END_DECLS 81 | -------------------------------------------------------------------------------- /regex_vector.h: -------------------------------------------------------------------------------- 1 | /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ 2 | /* 3 | * regex_vector.h: 4 | * 5 | * Now this covers Google's RE2 library. 6 | * Note: 7 | * 1 - RE2 and the objects are not move insertable, so we need to manually manage creating and deleting them. 8 | * 2 - RE2's PartialMatch function wont' return the position of a match unless it is wrapped in a group " () ", 9 | so we do that. 10 | */ 11 | 12 | #ifndef REGEX_VECTOR_H 13 | #define REGEX_VECTOR_H 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #include "config.h" 25 | 26 | #ifdef HAVE_RE2 27 | #include // it's always here. 28 | #endif 29 | 30 | /** 31 | * The regex_vector is a vector of character regexes with a few additional convenience functions. 32 | * We might want to change this to handle ASCII, UTF-16 and UTF-8 characters simultaneously. 33 | * Only RE2 is supported because it is the only regular expression library that doesn't die on large segments. 
34 | * See: https://swtch.com/~rsc/regexp/regexp3.html#caveats 35 | */ 36 | 37 | class regex_vector { 38 | std::vector regex_strings; // the original regex strings 39 | #ifdef HAVE_RE2 40 | std::vector re2_regex_comps; // the compiled regular expressions 41 | #endif 42 | regex_vector(const regex_vector&) = delete; 43 | regex_vector& operator=(const regex_vector&) = delete; 44 | static const std::string RE_ENGINE; 45 | 46 | public: 47 | static bool engine_enabled(const std::string engine) { 48 | /** each engine is enabled if it is the first to check, or if it is specified */ 49 | return std::getenv(RE_ENGINE.c_str()) == nullptr || 50 | std::getenv(RE_ENGINE.c_str())==engine; 51 | } 52 | regex_vector() : regex_strings() 53 | #ifdef HAVE_RE2 54 | , re2_regex_comps() 55 | #endif 56 | {}; 57 | ~regex_vector(); 58 | 59 | // is this a regular expression with meta characters? 60 | static bool has_metachars(const std::string& str); 61 | const std::string regex_engine(); // which engine is in use 62 | 63 | /* Add a string */ 64 | #ifndef HAVE_RE2 65 | [[noreturn]] 66 | #endif 67 | void push_back(const std::string& val); 68 | // Empty the vectors. For the compiled, be sure to delete them 69 | void clear(); 70 | size_t size() const; // the number of regular expressions in the vector 71 | 72 | /** 73 | * Read regular expressions from a file: returns 0 if successful, -1 if failure. 74 | * @param fname - the file to read. 75 | */ 76 | int readfile(const std::string& fname); // read a file of regexes, one per line 77 | 78 | /** Run Return true if any of the regexes match. 79 | * search_all() is threadsafe. 80 | * @param probe - the string we are searching. 81 | * *found - set to the found string if something is found. 
82 | */ 83 | 84 | bool search_all(const std::string& probe, 85 | std::string* found, 86 | size_t* offset = nullptr, 87 | size_t* len = nullptr) const; 88 | void dump(std::ostream& os) const; 89 | }; 90 | 91 | std::ostream& operator<<(std::ostream& os, const class regex_vector& rv); 92 | 93 | #endif 94 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | ## Copyright/Non-Copyright Statements 2 | 3 | **bulk_extractor** was originally developed by Simson Garfinkel while at 4 | the Naval Postgraduate School. As a work of the US Government this 5 | work is not subject to copyright law. 6 | 7 | Simson Garfinkel left the Naval Postgraduate School in January 2015 8 | and continued to work on **bulk_extractor** in his personal 9 | capacity. Those modifications are covered under the MIT license. Other 10 | components are licensed as noted. 11 | 12 | ## MIT License. 13 | 14 | Copyright (c) 2020, Simson L. Garfinkel {{ organization }} 15 | 16 | Permission is hereby granted, free of charge, to any person obtaining a copy 17 | of this software and associated documentation files (the "Software"), to deal 18 | in the Software without restriction, including without limitation the rights 19 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 20 | copies of the Software, and to permit persons to whom the Software is 21 | furnished to do so, subject to the following conditions: 22 | 23 | The above copyright notice and this permission notice shall be included in all 24 | copies or substantial portions of the Software. 25 | 26 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 27 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 28 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
29 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, 30 | DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 31 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 32 | OR OTHER DEALINGS IN THE SOFTWARE. 33 | 34 | ## CC0 Original Summary 35 | 36 | Except as otherwise noted, bulk_extractor source code files are public domain 37 | software. 38 | 39 | That software provided here is released by the Naval Postgraduate 40 | School, an agency of the U.S. Department of Navy. The software bears 41 | no warranty, either expressed or implied. NPS does not assume legal 42 | liability nor responsibility for a User's use of the software or the 43 | results of such use. 44 | 45 | Please note that within the United States, copyright protection, under 46 | Section 105 of the United States Code, Title 17, is not available for 47 | any work of the United States Government and/or for any works created 48 | by United States Government employees. 49 | 50 | However, because some bulk_extractor source modules (e.g. pyxpress.c) 51 | are covered under the GNU Public License, the compiled bulk_extractor 52 | executable is covered under the GPL copyright. This means that binary 53 | distributions of bulk_extractor must include the full source code (or 54 | have the source code be made easily available.) 55 | 56 | ## Other materials 57 | 58 | bulk_extractor includes the following materials: 59 | 60 | * uses some SleuthKit 3 include files. These are present 61 | in the directory src/tsk3. 62 | 63 | * src/tsk3/ includes SleuthKit 3 include files that are part of 64 | SleuthKit 3. These files are Copyright (C) 2010 Brian Carrier and covered under 65 | the Common Public License 1.0 66 | 67 | * src/be20_api/utf8.h is Copyright 2006 Nemanja Trifunovic 68 | 69 | * src/base64_forensic.cpp is Copyright (C) 1996-1999 by Internet Software Consortium, with 70 | portions Copyright (c) 1995 by International Business Machines, Inc.
71 | 72 | * src/scan_ascii85.cpp is Copyright (C) 2011 Remy Oukaour 73 | 74 | * src/scan_json.cpp is Copyright (c) 2005 JSON.org 75 | 76 | * src/pyxpress.c is Copyright 2008 (c) Matthieu Suiche. 77 | -------------------------------------------------------------------------------- /net_ethernet.h: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 1997, 1999, 2001, 2008 Free Software Foundation, Inc. 2 | This file is part of the GNU C Library. 3 | 4 | The GNU C Library is free software; you can redistribute it and/or 5 | modify it under the terms of the GNU Lesser General Public 6 | License as published by the Free Software Foundation; either 7 | version 2.1 of the License, or (at your option) any later version. 8 | 9 | The GNU C Library is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | Lesser General Public License for more details. 13 | 14 | You should have received a copy of the GNU Lesser General Public 15 | License along with the GNU C Library; if not, write to the Free 16 | Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 17 | 02111-1307 USA. */ 18 | 19 | /* Based on the FreeBSD version of this file. Curiously, that file 20 | lacks a copyright in the header. */ 21 | 22 | #ifndef __NET_ETHERNET_H 23 | #define __NET_ETHERNET_H 1 24 | 25 | #include 26 | #include 27 | //#include /* IEEE 802.3 Ethernet constants */ 28 | 29 | __BEGIN_DECLS 30 | 31 | /* This is a name for the 48 bit ethernet address available on many 32 | systems. 
*/ 33 | struct ether_addr { 34 | u_int8_t ether_addr_octet[ETH_ALEN]; 35 | } __attribute__((__packed__)); 36 | 37 | /* 10Mb/s ethernet header */ 38 | struct ether_header { 39 | u_int8_t ether_dhost[ETH_ALEN]; /* destination eth addr */ 40 | u_int8_t ether_shost[ETH_ALEN]; /* source ether addr */ 41 | u_int16_t ether_type; /* packet type ID field */ 42 | } __attribute__((__packed__)); 43 | 44 | /* Ethernet protocol ID's */ 45 | #define ETHERTYPE_PUP 0x0200 /* Xerox PUP */ 46 | #define ETHERTYPE_SPRITE 0x0500 /* Sprite */ 47 | #define ETHERTYPE_IP 0x0800 /* IP */ 48 | #define ETHERTYPE_ARP 0x0806 /* Address resolution */ 49 | #define ETHERTYPE_REVARP 0x8035 /* Reverse ARP */ 50 | #define ETHERTYPE_AT 0x809B /* AppleTalk protocol */ 51 | #define ETHERTYPE_AARP 0x80F3 /* AppleTalk ARP */ 52 | #define ETHERTYPE_VLAN 0x8100 /* IEEE 802.1Q VLAN tagging */ 53 | #define ETHERTYPE_IPX 0x8137 /* IPX */ 54 | #define ETHERTYPE_IPV6 0x86dd /* IP protocol version 6 */ 55 | #define ETHERTYPE_LOOPBACK 0x9000 /* used to test interfaces */ 56 | 57 | #define ETHER_ADDR_LEN ETH_ALEN /* size of ethernet addr */ 58 | #define ETHER_TYPE_LEN 2 /* bytes in type field */ 59 | #define ETHER_CRC_LEN 4 /* bytes in CRC field */ 60 | #define ETHER_HDR_LEN ETH_HLEN /* total octets in header */ 61 | #define ETHER_MIN_LEN (ETH_ZLEN + ETHER_CRC_LEN) /* min packet length */ 62 | #define ETHER_MAX_LEN (ETH_FRAME_LEN + ETHER_CRC_LEN) /* max packet length */ 63 | 64 | /* make sure ethenet length is valid */ 65 | #define ETHER_IS_VALID_LEN(foo) ((foo) >= ETHER_MIN_LEN && (foo) <= ETHER_MAX_LEN) 66 | 67 | /* 68 | * The ETHERTYPE_NTRAILER packet types starting at ETHERTYPE_TRAIL have 69 | * (type-ETHERTYPE_TRAIL)*512 bytes of data followed 70 | * by an ETHER type (as given above) and then the (variable-length) header. 
71 | */ 72 | #define ETHERTYPE_TRAIL 0x1000 /* Trailer packet */ 73 | #define ETHERTYPE_NTRAILER 16 74 | 75 | #define ETHERMTU ETH_DATA_LEN 76 | #define ETHERMIN (ETHER_MIN_LEN - ETHER_HDR_LEN - ETHER_CRC_LEN) 77 | 78 | __END_DECLS 79 | 80 | #endif /* net/ethernet.h */ 81 | -------------------------------------------------------------------------------- /feature_recorder_file.h: -------------------------------------------------------------------------------- 1 | #ifndef FEATURE_RECORDER_FILE_H 2 | #define FEATURE_RECORDER_FILE_H 3 | 4 | #include "config.h" 5 | 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #include "feature_recorder.h" 20 | #include "pos0.h" 21 | #include "sbuf.h" 22 | 23 | class feature_recorder_file : public feature_recorder { 24 | public: 25 | inline static const std::string feature_file_header {"# Feature-File-Version: 1.1\n"}; 26 | inline static const std::string histogram_file_header {"# Histogram-File-Version: 1.1\n"}; 27 | inline static const std::string bulk_extractor_version_header { 28 | "# " PACKAGE_NAME "-Version: " PACKAGE_VERSION "\n"}; 29 | 30 | static std::string unquote_string(const std::string& s); 31 | 32 | feature_recorder_file(class feature_recorder_set& fs, const feature_recorder_def def); 33 | virtual ~feature_recorder_file(); 34 | virtual void flush() override; 35 | static bool extract_feature_context(const std::string& line, std::string &feature, std::string &context); // extract feature and context, return true if successful 36 | static bool isodigit(uint8_t ch){ 37 | return ch>='0' && ch<='7'; 38 | } 39 | 40 | private: 41 | std::mutex Mios{}; // mutex for IOS 42 | std::fstream ios{}; // where features are written 43 | 44 | void banner_stamp(std::ostream& os, const std::string& header) const; // stamp banner, and header 45 | 46 | //static const std::string histogram_file_header; 47 | //static const 
std::string feature_file_header; 48 | //static const std::string bulk_extractor_version_header; 49 | 50 | virtual void shutdown() override; 51 | 52 | public: 53 | /* these are not threadsafe and should only be called in startup */ 54 | // void set_carve_ignore_encoding( const std::string &encoding ){ MAINTHREAD();ignore_encoding = encoding;} 55 | /* End non-threadsafe */ 56 | 57 | // add i to file_number and return the result 58 | // fetch_add() returns the original number 59 | 60 | /* where stopped items (on stop_list or context_stop_list) get recorded: 61 | * Cannot be made inline becuase it accesses fs. 62 | */ 63 | virtual void write0(const std::string& str) override; 64 | virtual void write0(const pos0_t& pos0, const std::string& feature, const std::string& context) override; 65 | 66 | /* histogram support. 67 | * The file based feature recorder can store the histogram incrementally in memory or it can make it at the end in a second pass. 68 | */ 69 | static const inline int MAX_HISTOGRAM_FILES = 10; // don't make more than 10 files in low-memory conditions 70 | 71 | // the histograms are made in memory with the AtomicUnicodeHistogram object. 72 | // Each one contains the histogram_def. 
73 | std::vector> histograms{}; 74 | 75 | virtual size_t histogram_count() override; // how many histograms it has 76 | virtual void histogram_add(const struct histogram_def& def) override; // add a new histogram 77 | 78 | // Adding features to the histogram 79 | 80 | virtual void histogram_write_from_memory(AtomicUnicodeHistogram& h); // actually write this histogram 81 | virtual void histogram_write_from_file(AtomicUnicodeHistogram& h); // actually write this histogram 82 | virtual void histogram_write(AtomicUnicodeHistogram& h); // write this histogram 83 | virtual void histograms_incremental_add_feature_context(const std::string& feature, const std::string& context) override; 84 | virtual bool histograms_write_largest() override; 85 | virtual void histograms_write_all() override; 86 | }; 87 | 88 | /** @} */ 89 | 90 | #endif 91 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | # Log of work on 2 | ## 2021-04-23 3 | - Got `TEST_CASE("run", "[scanner_set]")` mostly working. 4 | - After it runs, the output directory looks like this: 5 | ``` 6 | (base) simsong@nimi be13_api % ls -l /var/folders/09/8v4pdnys627fqqh3vjbvsnq40000gn/T/ISmG9qlC/ 7 | total 4 8 | -rw-r--r-- 1 simsong staff 0 Apr 23 21:20 alerts.txt 9 | -rw-r--r-- 1 simsong staff 172 Apr 23 21:20 sha1_bufs.txt 10 | -rw------- 1 simsong staff 0 Apr 23 21:20 sha1_bufs_Az?? 
11 | (base) simsong@nimi be13_api % cat /var/folders/09/8v4pdnys627fqqh3vjbvsnq40000gn/T/ISmG9qlC/sha1_bufs.txt 12 | # BANNER FILE NOT PROVIDED (-b option) 13 | # BE13_API-Version: 1.0.0 14 | # Feature-Recorder: sha1_bufs 15 | # Feature-File-Version: 1.1 16 | hello-0 d3486ae9136e7856bc42212385ea797094475802 17 | ``` 18 | 19 | - [ ] Histogram is created with the wrong filename 20 | - [ ] Histogram file is empty 21 | 22 | ## 2021-04-24 23 | Current problems are the UTF-8 histograms that are extracted with 24 | regular expressions. Ideally we should do the regular expressions in 25 | Unicode, not in UTF-8 26 | 27 | Another option is to do everything as UTF-32 regex and convert the 28 | UTF-32 to UTF-8 when rendering into the files. 29 | - https://stackoverflow.com/questions/37989081/how-to-use-unicode-range-in-c-regex 30 | 31 | Another option is to add an ICU dependency: 32 | - https://unicode-org.github.io/icu/userguide/strings/regexp.html 33 | 34 | See also: 35 | - http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2015/p0169r0.html 36 | 37 | Oh, Boost has a unicode regular expressions too: 38 | - https://www.boost.org/doc/libs/1_46_1/libs/regex/doc/html/boost_regex/ref/non_std_strings/icu/unicode_algo.html 39 | 40 | But make a decision. What we currently have is a mess. 41 | 42 | ## 2021-04-25 43 | Found an error in which a value from the stack was being passed by 44 | reference, the reference was being retained, and then it was going 45 | bad. 46 | - [ ] Review every pass by reference and change to pass by value when 47 | possible. Note that pass by value may be more efficient than pass by 48 | reference with modern compilers. 49 | - [x] Looks like the Atomic Unicode Histogram is using an ASCII/UTF-8 50 | regular expression on a UTF32 value, which isn't working. Perhaps 51 | I'm wrong above, and all regular expressions should be done in UTF-8 52 | and not UTF-32? EDIT: Decided not to do this. 53 | - [x] Perhaps move to SRELL as the regex package? 
54 | http://www.akenotsuki.com/misc/srell/en/. EDIT: Decided not to do this. 55 | 56 | 57 | ## 2021-04-27 58 | All errors in histogram production seem to be fixed! 59 | - [ ] Need to decide if the first BE2.0 program will be bulk_extractor 60 | or tcpflow. Since tcpflow works, let's go with bulk_extractor. 61 | 62 | 63 | # Outstanding things to do 64 | 65 | - [ ] move histograms out of feature_recorder and feature_recorder_set. 66 | - [ ] Instead, histograms are made by the scanner set after the scanners have run, in the shutdown mode. 67 | - The feature recorders just need a way of reading the contents. 68 | - The feature_recorder can have any number of readers. It's just an open iostream. 69 | - [ ] Make histograms in-memory and throw them out if you run out of memory, going into low-memory mode for the second pass. 70 | - [ ] Merge of all outstanding histograms can be done single-threaded 71 | or multi-threaded. 72 | 73 | ## 2021-05-08 74 | - [ ] Get scanner commands moved from scanner_set to scanner_config. 75 | - [ ] Implement processing of scanner commands to scanner set. 76 | - [ ] Implement tests 77 | 78 | ## 2021-06-12 79 | - [ ] sbuf_stream and sbuf_private should both be factored into sbuf. 80 | 81 | - [ ] FrequencyReportHistogram should use unique_ptr<> rather than 82 | actually storing the report elements on the vector. 83 | 84 | ## 2021-11-16 85 | - [ ] Don't need get_scanner_by_name().
I just need a list of the 86 | enabled scanners and a map of scanner names to scanner info 87 | -------------------------------------------------------------------------------- /regex_vector.cpp: -------------------------------------------------------------------------------- 1 | /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ 2 | 3 | #include "regex_vector.h" 4 | 5 | /* rewritten to use C++11's regex */ 6 | const std::string regex_vector::RE_ENGINE {"RE_ENGINE"}; 7 | const std::string regex_vector::regex_engine() 8 | { 9 | #ifdef HAVE_RE2 10 | if (engine_enabled("RE2")) { 11 | return std::string("RE2"); 12 | } 13 | #endif 14 | return std::string("STD::REGEX"); 15 | } 16 | 17 | regex_vector::~regex_vector() 18 | { 19 | clear(); 20 | } 21 | 22 | 23 | /* Only certain characters are assumed to be a regular expression. These characters are 24 | * coincidently never in email addresses. 25 | */ 26 | bool regex_vector::has_metachars(const std::string& str) { 27 | for (auto& it : str) { 28 | switch (it) { 29 | case '*': 30 | case '[': 31 | case '(': 32 | case '?': return true; 33 | } 34 | } 35 | return false; 36 | } 37 | 38 | #ifndef HAVE_RE2 39 | [[noreturn]] 40 | #endif 41 | void regex_vector::push_back(const std::string& val) { 42 | #ifdef HAVE_RE2 43 | RE2::Options options; 44 | options.set_case_sensitive(false); 45 | if (engine_enabled("RE2")){ 46 | regex_strings.push_back(val); 47 | RE2 *re = new RE2(std::string("(") + val + std::string(")"), options); 48 | if (!re->ok()){ 49 | std::cerr << "RE2 compilation failed error: " << re->error() << " compiling: " << val << std::endl; 50 | throw std::runtime_error(std::string("RE2 compilation failed")); 51 | } 52 | re2_regex_comps.push_back( re ); 53 | return; 54 | } 55 | #else 56 | throw std::runtime_error(std::string("RE2 not compiled in")); 57 | #endif 58 | } 59 | 60 | void regex_vector::clear() { 61 | regex_strings.clear(); 62 | #ifdef HAVE_RE2 63 | for (RE2 *re: re2_regex_comps) { 64 | delete re; 65 | } 
66 | re2_regex_comps.clear(); 67 | #endif 68 | } 69 | 70 | size_t regex_vector::size() const { 71 | #ifdef HAVE_RE2 72 | return re2_regex_comps.size(); 73 | #else 74 | return 0; 75 | #endif 76 | } 77 | 78 | /** 79 | * perform a search for a single hit. If there is a group and something is found, 80 | * set *found to be what was found, *offset to be the starting offset, and *len to be 81 | * the length. Note that this only handles a single group. 82 | */ 83 | bool regex_vector::search_all(const std::string& probe, std::string* found, size_t* offset, size_t* len) const { 84 | #ifdef HAVE_RE2 85 | for (RE2 *re: re2_regex_comps) { 86 | re2::StringPiece sp; 87 | if (RE2::PartialMatch( probe, *re, &sp) ){ 88 | if (found) *found = std::string(sp.data(), sp.size()); 89 | if (offset) *offset = sp.data() - probe.data(); // this is so gross 90 | if (len) *len = sp.length(); 91 | return true; 92 | } 93 | } 94 | #endif 95 | return false; 96 | } 97 | 98 | int regex_vector::readfile(const std::string& fname) { 99 | std::ifstream f(fname.c_str()); 100 | if (f.is_open()) { 101 | while (!f.eof()) { 102 | std::string line; 103 | getline(f, line); 104 | 105 | /* remove the last character while it is a \n or \r */ 106 | if (line.size() > 0 && (((*line.end()) == '\r') || (*line.end()) == '\n')) { line.erase(line.end()); } 107 | 108 | /* Create a regular expression and add it */ 109 | push_back(line); 110 | } 111 | f.close(); 112 | return 0; 113 | } 114 | return -1; 115 | } 116 | 117 | void regex_vector::dump(std::ostream& os) const { 118 | for (auto const& it : regex_strings) { 119 | os << it << "\n"; 120 | } 121 | } 122 | 123 | std::ostream& operator<<(std::ostream& os, const class regex_vector& rv) { 124 | rv.dump(os); 125 | return os; 126 | } 127 | -------------------------------------------------------------------------------- /machine_stats.h: -------------------------------------------------------------------------------- 1 | #ifndef MACHINE_STATS_H 2 | #define MACHINE_STATS_H 3 | 
/**
 * Best-effort process and host statistics: CPU percentage of the current
 * process, available system memory, and the process's virtual/resident sizes.
 * All members are static; the struct carries no state.
 */
struct machine_stats {
    /**
     * Return the CPU percentage used by the current process, obtained by
     * running `ps -O %cpu <pid>` through popen() and parsing its second line.
     * The popen implementation is not meant to be efficient.
     *
     * @return the parsed percentage; 0 if popen() fails; NaN if either output
     *         line cannot be read or the second line cannot be parsed.
     */
    static float get_cpu_percentage() {
        char buf[100];
        snprintf(buf,sizeof(buf),"ps -O %ccpu %d",'%',getpid());
        FILE *f = popen(buf,"r");
        if (f==nullptr) {
            perror("popen failed\n");
            return(0);
        }
        /* The first line is the header; the second carries the values. */
        const bool got_header = fgets(buf,sizeof(buf),f)!=nullptr;
        const bool got_values = got_header && fgets(buf,sizeof(buf),f)!=nullptr;
        pclose(f);              // always release the pipe, including on short output
        if (!got_header) return nan("error1");
        if (!got_values) return nan("error2");

        buf[sizeof(buf)-1] = 0; // in case it needs termination
        int pid=0;
        float ff = 0;
        const int count = sscanf(buf,"%d %f",&pid,&ff);
        return (count==2) ? ff : nan("get_cpu_percentage");
    }

    /**
     * Return the amount of memory available on the host, in bytes.
     *
     * Uses the "MemAvailable:" line of /proc/meminfo when that file can be
     * opened (the value is multiplied by 1024). Otherwise, when built with
     * HAVE_HOST_STATISTICS64, returns the free page count times the page size.
     *
     * @return available bytes, or 0 if no source is available.
     */
    static uint64_t get_available_memory() {
        // If there is a /proc/meminfo, use it
        std::ifstream meminfo("/proc/meminfo");
        if (meminfo.is_open()) {
            std::string line;
            while (std::getline(meminfo, line)) {
                if (line.compare(0, 13, "MemAvailable:") == 0) {
                    // substr(13) is valid even if nothing follows the label;
                    // stoll() skips the whitespace before the number.
                    return std::stoll(line.substr(13))*1024;
                }
            }
        }

#ifdef HAVE_HOST_STATISTICS64
        // on macs, use this
        // https://opensource.apple.com/source/system_cmds/system_cmds-496/vm_stat.tproj/vm_stat.c.auto.html

        vm_statistics64_data_t vm_stat;
        vm_size_t pageSize = 4096; /* Default */
        mach_port_t myHost = mach_host_self();
        if (host_page_size(myHost, &pageSize) != KERN_SUCCESS) {
            pageSize = 4096; // put the default back
        }
        vm_statistics64_t stat = &vm_stat;

        unsigned int count = HOST_VM_INFO64_COUNT;
        if (host_statistics64(myHost, HOST_VM_INFO64, (host_info64_t)stat, &count) != KERN_SUCCESS) {
            return 0;
        }
        return stat->free_count * pageSize;
#else
        return 0;
#endif
    }

    /**
     * Report the virtual and resident memory size of the current process.
     *
     * Both outputs are first set to 0. With HAVE_TASK_INFO the values come
     * from task_info(); otherwise (or if that call fails) they are read from
     * /proc/self/statm. If neither source works the outputs stay 0.
     *
     * @param virtual_size  - receives the virtual size in bytes (must not be null).
     * @param resident_size - receives the resident size in bytes (must not be null).
     */
    static void get_memory(uint64_t *virtual_size, uint64_t *resident_size) {
        *virtual_size = 0;
        *resident_size = 0;

#ifdef HAVE_TASK_INFO
        kern_return_t error;
        mach_msg_type_number_t outCount;
        mach_task_basic_info_data_t taskinfo;

        taskinfo.virtual_size = 0;
        outCount = MACH_TASK_BASIC_INFO_COUNT;
        error = task_info(mach_task_self(), MACH_TASK_BASIC_INFO, (task_info_t)&taskinfo, &outCount);
        if (error == KERN_SUCCESS) {
            *virtual_size = (uint64_t)taskinfo.virtual_size;
            *resident_size = (uint64_t)taskinfo.resident_size;
            return;
        }
#endif
        const char* statm_path = "/proc/self/statm";

        FILE *f = fopen(statm_path,"r");
        if (f == nullptr) {
            return;             // no /proc here; fclose(nullptr) must not be called
        }
        unsigned long size = 0, resident = 0, share = 0, text = 0, lib = 0, data = 0, dt = 0;
        if (fscanf(f,"%lu %lu %lu %lu %lu %lu %lu", &size,&resident,&share,&text,&lib,&data,&dt) == 7) {
            // statm counts pages; a 4096-byte page is assumed here.
            *virtual_size = size * 4096;
            *resident_size = resident * 4096;
        }
        fclose(f);
    }
};
/**
 * Return true if `buf` ends with the suffix `with`.
 *
 * A string is considered to end with itself, and every string ends with the
 * empty suffix (same contract as C++20 std::string::ends_with).
 */
bool ends_with(const std::string& buf, const std::string& with) {
    const size_t buflen = buf.size();
    const size_t withlen = with.size();
    // compare() checks the tail in place; no temporary substring is allocated.
    return buflen >= withlen && buf.compare(buflen - withlen, withlen, with) == 0;
}

/** Wide-string overload of ends_with(); same contract as the narrow version. */
bool ends_with(const std::wstring& buf, const std::wstring& with) {
    const size_t buflen = buf.size();
    const size_t withlen = with.size();
    return buflen >= withlen && buf.compare(buflen - withlen, withlen, with) == 0;
}
/**
 * Parse an unsigned integer with an optional binary scale letter.
 *
 * The leading digits are read as a base-10 value, which is then multiplied by
 * 2^10, 2^20, 2^30 or 2^40 if the string contains 'k', 'm', 'g' or 't'
 * respectively, in either case. As before, the letter may appear anywhere in
 * the string and several letters multiply cumulatively.
 *
 * @param str  text such as "64", "16K", "2m" or "1G"
 * @return the scaled value; 0 if no leading number can be parsed
 */
uint64_t scaled_stoi64(const std::string &str)
{
    uint64_t val = 0;
    std::stringstream ss(str);
    ss >> val;

    const auto has_any = [&str](const char *letters) {
        return str.find_first_of(letters) != std::string::npos;
    };
    if (has_any("kK")) val *= 1024ULL;
    if (has_any("mM")) val *= 1024ULL * 1024ULL;                      // 'M' was previously ignored
    if (has_any("gG")) val *= 1024ULL * 1024ULL * 1024ULL;            // 'G' was previously ignored
    if (has_any("tT")) val *= 1024ULL * 1024ULL * 1024ULL * 1024ULL;
    return val;
}
53 | #include 54 | #include 55 | #include // std::future, std::promise 56 | 57 | #include "aftimer.h" 58 | #include "scanner_params.h" 59 | 60 | // There is a single thread_pool object 61 | class worker; 62 | class thread_pool { 63 | /*** neither copying nor assignment is implemented ***/ 64 | thread_pool(const thread_pool &)=delete; 65 | thread_pool &operator=(const thread_pool &)=delete; 66 | std::thread::id main_thread {std::this_thread::get_id()}; 67 | 68 | public: 69 | struct work_unit { 70 | work_unit(){} 71 | work_unit(const sbuf_t *sbuf_):sbuf(sbuf_) {} 72 | work_unit(const sbuf_t *sbuf_, scanner_t *scanner_):sbuf(sbuf_),scanner(scanner_) {} 73 | const sbuf_t *sbuf {nullptr}; // sbuf to process 74 | scanner_t *scanner {nullptr}; // if set, use only this scanner, otherwise use all. 75 | }; 76 | 77 | typedef std::set worker_set_t; 78 | worker_set_t workers {}; 79 | std::set threads {}; 80 | mutable std::mutex M {}; 81 | std::condition_variable TO_MAIN {}; 82 | std::condition_variable TO_WORKER {}; 83 | std::atomic working_workers {0}; 84 | std::atomic freethreads {0}; 85 | std::atomic shutdown_spin_lock_poll_ms {100}; 86 | 87 | // bulk_extractor specialiations 88 | class scanner_set &ss; // one for all the threads; fs and fr are threadsafe 89 | std::queue work_queue {}; // work to be done - here it is just a list of sbufs. 90 | aftimer main_wait_timer {}; // time spend waiting 91 | std::atomic total_worker_wait_ns {0}; 92 | int mode {0}; // 0=running; 1 = waiting for workers to finish; 2=workers should die 93 | std::atomic debug {false}; // display debug messages? 
// there is a worker object for each thread
// Each worker keeps a reference to the thread_pool it was constructed with
// and the numeric id it was given.
class worker {
    thread_pool &tp;                   // my thread pool
    void *run();                       // run the worker
    aftimer worker_wait_timer {};      // time the worker spent
                                       // NOTE(review): presumably time spent waiting for work, per the name — confirm in threadpool.cpp
public:
    const uint32_t id;                 // identifier supplied to the constructor; never changes afterwards
    // Static entry point taking an opaque pointer.
    // NOTE(review): arg is presumably the worker* to run — confirm against the caller.
    static void * start_worker( void *arg );
    worker(class thread_pool &tp_, uint32_t id_): tp(tp_),id(id_){} // the worker
};
17 | * - regular expression are then run on the UTF-8. (Not the best, but it works for now.) 18 | */ 19 | 20 | #include "atomic_map.h" 21 | #include "histogram_def.h" 22 | #include "unicode_escape.h" 23 | #include 24 | 25 | struct AtomicUnicodeHistogram { 26 | static uint32_t debug_histogram_malloc_fail_frequency; // for debugging, make malloc fail sometimes 27 | struct HistogramTally { 28 | uint32_t count{0}; // total strings seen 29 | uint32_t count16{0}; // total utf16 strings seen 30 | HistogramTally(const HistogramTally& a) { 31 | this->count = a.count; 32 | this->count16 = a.count16; 33 | } 34 | HistogramTally& operator=(const HistogramTally& a) { 35 | this->count = a.count; 36 | this->count16 = a.count16; 37 | return *this; 38 | } 39 | 40 | HistogramTally(){}; 41 | virtual ~HistogramTally(){}; 42 | 43 | bool operator==(const HistogramTally& a) const { return this->count == a.count && this->count16 == a.count16; }; 44 | bool operator!=(const HistogramTally& a) const { return !(*this == a); } 45 | bool operator<(const HistogramTally& a) const { 46 | return (this->count < a.count) || ((this->count == a.count && (this->count16 < a.count16))); 47 | } 48 | size_t bytes() const { 49 | return sizeof(*this); 50 | } 51 | }; 52 | 53 | /* A FrequencyReportVector is a vector of report elements when the report is generated.*/ 54 | typedef atomic_map auh_t; 55 | typedef std::vector FrequencyReportVector; 56 | 57 | /* Returns true if acount > b.value->count) return true; 62 | if (a.value->count < b.value->count) return false; 63 | if (a.key < b.key) return true; 64 | return false; 65 | } 66 | 67 | AtomicUnicodeHistogram(const struct histogram_def& def_) : def(def_) {} 68 | virtual ~AtomicUnicodeHistogram(){}; 69 | 70 | // is it empty? 71 | bool empty() { 72 | const std::lock_guard lock(M); 73 | return h.size()==0; 74 | } 75 | void clear(); // empties the histogram 76 | // low-level add, directly to what we display, if the match function checks out. 
77 | void add0(const std::string& u8key, const std::string &context, bool found_utf16); 78 | 79 | // adds Unicode string to the histogram count. context is used for histogram_def 80 | void add_feature_context(const std::string& feature, const std::string&context); 81 | size_t size() const; // returns the number of entries in the historam 82 | size_t bytes() const; // returns the number of bytes used by the histogram 83 | 84 | /** makeReport() makes a report and returns a 85 | * FrequencyReportVector. 86 | */ 87 | std::vector makeReport(size_t topN=0); // returns items of 88 | const struct histogram_def def; // the definition we are making 89 | bool debug {false}; // set to enable debugging 90 | 91 | private: 92 | mutable std::mutex M {}; // mutex for the histogram, used to lock individual elements. 93 | auh_t h {}; // the histogram 94 | }; 95 | 96 | std::ostream& operator<<(std::ostream& os, const AtomicUnicodeHistogram::FrequencyReportVector& rep); 97 | std::ostream& operator<<(std::ostream& os, const AtomicUnicodeHistogram::auh_t::item& e); 98 | 99 | #endif 100 | -------------------------------------------------------------------------------- /unicode_escape.h: -------------------------------------------------------------------------------- 1 | /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ 2 | 3 | /* 4 | * Tools for working with Unicode 5 | */ 6 | 7 | #ifndef UNICODE_ESCAPE_H 8 | #define UNICODE_ESCAPE_H 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include "utf8.h" 18 | 19 | /** \addtogroup bulk_extractor_APIs 20 | * @{ 21 | */ 22 | /** \file */ 23 | 24 | /* Our standard escaping is \\ for backslash and \000 for null, \001 for control-a, etc. 
/* Exception carrying the offending text; what() returns that text unchanged. */
class BadUnicode : public std::exception {
public:
    BadUnicode(std::string_view bad) : bad_string(bad.data(), bad.size()) {}
    const char *what() const noexcept override {
        return bad_string.c_str();
    }

private:
    std::string bad_string{};
};
/**
 * Return a copy of str with every code point below 0xFFFF lower-cased.
 *
 * Code points at or above 0xFFFF are copied unchanged. The wide-character
 * towlower() is used because the <cctype> tolower() is only defined for
 * values representable as unsigned char (or EOF); feeding it arbitrary BMP
 * code points is undefined behaviour. The mapping follows the current C locale.
 */
inline const std::u32string utf32_lowercase(const std::u32string& str) {
    std::u32string output;
    output.reserve(str.size());
    for (const char32_t ch : str) {
        // Keep the strict '<': 0xFFFF equals WEOF on platforms with 16-bit wchar_t.
        output.push_back(ch < 0xffff ? static_cast<char32_t>(towlower(static_cast<wint_t>(ch))) : ch);
    }
    return output;
}
4 | */ 5 | 6 | #include "config.h" 7 | #include 8 | #include 9 | 10 | #include "word_and_context_list.h" 11 | 12 | void word_and_context_list::add_regex(const std::string& pat) { patterns.push_back(pat); } 13 | 14 | /** 15 | * Insert a feature and context, but only if not already present. 16 | * Returns true if added. 17 | */ 18 | bool word_and_context_list::add_fc(const std::string& f, const std::string& c) { 19 | context ctx(f, c); // ctx includes feature, before and after 20 | 21 | if (c.size() > 0 && context_set.find(c) != context_set.end()) return false; // already present 22 | context_set.insert(c); // now we've seen it. 23 | fcmap.insert(std::pair(f, ctx)); 24 | return true; 25 | } 26 | 27 | /** 28 | returns 0 if success, -1 if fail. */ 29 | int word_and_context_list::readfile(const std::filesystem::path path, std::ostream &os) { 30 | std::ifstream i( path ); 31 | if (!i.is_open()) return -1; 32 | os << "Reading context stop list " << path << "\n"; 33 | std::string line; 34 | uint64_t total_context = 0; 35 | uint64_t line_counter = 0; 36 | uint64_t features_read = 0; 37 | while (getline(i, line)) { 38 | line_counter++; 39 | if (line.size() == 0) continue; 40 | if (line[0] == '#') continue; // it's a comment 41 | if ((*line.end()) == '\r') { line.erase(line.end()); /* remove the last character if it is a \r */ } 42 | if (line.size() == 0) continue; // no line content 43 | ++features_read; 44 | 45 | // If there are two tabs, this is a line from a feature file 46 | size_t tab1 = line.find('\t'); 47 | if (tab1 != std::string::npos) { 48 | size_t tab2 = line.find('\t', tab1 + 1); 49 | if (tab2 != std::string::npos) { 50 | size_t tab3 = line.find('\t', tab2 + 1); 51 | if (tab3 == std::string::npos) tab3 = line.size(); 52 | std::string f = line.substr(tab1 + 1, (tab2 - 1) - tab1); 53 | std::string c = line.substr(tab2 + 1, (tab3 - 1) - tab2); 54 | if (add_fc(f, c)) { ++total_context; } 55 | } else { 56 | std::string f = line.substr(tab1 + 1); 57 | add_fc(f, ""); // 
Insert a feature with no context 58 | } 59 | continue; 60 | } 61 | 62 | // If there is no tab, then this must be a simple item to ignore. 63 | // If it is a regular expression, add it to the list of REs 64 | if (regex_vector::has_metachars(line)) { 65 | patterns.push_back(line); 66 | } else { 67 | // Otherwise, add it as a feature with no context 68 | fcmap.insert(std::pair(line, context(line))); 69 | } 70 | } 71 | os << "Stop list read.\n"; 72 | os << " Total features read: " << features_read << " in " << line_counter << " lines.\n"; 73 | os << " List Size: " << fcmap.size() << "\n"; 74 | os << " Context Strings: " << total_context << "\n"; 75 | os << " Regular Expressions: " << patterns.size() << "\n"; 76 | return 0; 77 | } 78 | 79 | /** check() is threadsafe. */ 80 | bool word_and_context_list::check(const std::string& probe, const std::string& before, const std::string& after) const { 81 | /* First check literals, because they are faster */ 82 | for (stopmap_t::const_iterator it = fcmap.find(probe); it != fcmap.end(); it++) { 83 | if ((rstrcmp((*it).second.before, before) == 0) && (rstrcmp((*it).second.after, after) == 0) && 84 | ((*it).second.feature == probe)) { 85 | return true; 86 | } 87 | } 88 | 89 | /* Now check the patterns; do this second because it is more expensive */ 90 | return patterns.search_all(probe, nullptr); 91 | }; 92 | 93 | bool word_and_context_list::check_feature_context(const std::string& probe, const std::string& context) const { 94 | std::string before; 95 | std::string after; 96 | context::extract_before_after(probe, context, before, after); 97 | return check(probe, before, after); 98 | } 99 | 100 | void word_and_context_list::dump(std::ostream &os) { 101 | os << "dump context list:\n"; 102 | for (auto const& it : fcmap) { os << it.first << " = " << it.second << "\n"; } 103 | os << "dump RE list:\n"; 104 | patterns.dump(os); 105 | } 106 | 107 | #ifdef STAND 108 | int main(int argc, char** argv) { 109 | cout << "testing contxt_list\n"; 
110 | word_and_context_list cl; 111 | while (--argc) { 112 | argv++; 113 | if (cl.readfile(*argv)) { err(1, "Cannot read %s", *argv); } 114 | } 115 | cl.dump(); 116 | exit(1); 117 | } 118 | #endif 119 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # be20_api 2 | 3 | |Branch|Coverage| 4 | |------|--------| 5 | |[main](https://github.com/simsong/be20_api/blob/main/README.md)|[![codecov](https://codecov.io/gh/simsong/be20_api/branch/slg-dev/graph/badge.svg?token=Nj8q8eo3Ji)](https://codecov.io/gh/simsong/be20_api)| 6 | |[slg-dev](https://github.com/simsong/be20_api/blob/slg-dev/README.md)| [![codecov](https://codecov.io/gh/simsong/be20_api/branch/slg-dev/graph/badge.svg?token=Nj8q8eo3Ji)](https://codecov.io/gh/simsong/be20_api)| 7 | 8 | 9 | This is the framework for the [bulk_extractor](https://github.com/simsong/bulk_extractor) plug-in API. 10 | It is called *be20_api* because the API was developed for Bulk_Extractor version 1.3. The API has been 11 | used without change in Bulk_Extractor versions 1.4 and 1.5, and will be used without change in Bulk_Extractor version 2.0 12 | 13 | The Bulk_Extractor API is a plug-in API for bulk_extractor "scanners." Scanners are implemented 14 | as `extern "C"` functions which are called from the bulk_extractor C++ framework. All bulk_extractor 15 | scanners are implemented using the API. Scanners can either be compiled into the bulk_extractor executable, or they can be loaded at run-time from the plug-ins directory. The directory contains zero or more shared libraries (on Unix/Linux/MacOS) or DLLs (on Windows). 16 | 17 | There is no differnece in functionality between scanners that are 18 | compiled into the program (e.g. bulk_extractor or tcpflow) and those that are loaded at runtime. 19 | 20 | ## Normal Usage 21 | 22 | The API defines functions for: 23 | 24 | 1. Creating a `scanner_set`. 
This creates the scanner_set's `feature_recorder_set`. 25 | 26 | 2. Loading scanners into a scanner set. When each scanner is loaded: 27 | 28 | 2.1 Any feature recorders that it specifies will be created and 29 | added to the `feature_recorder_set` if they do not already exist. 30 | 31 | 3. Entering the scanning phase. 32 | 33 | 4. Scanning one or more `sbuf`s, which may cause scanners to create child sbufs 34 | and recursively scan them. 35 | 36 | 5. Exiting the scanning phase and running the histogram phase, which 37 | causes the scanner_set to collect from the scanner all of the 38 | specified histograms (by `feature_recorder` name and regular 39 | expression). Each feature recorder is then asked to make its 40 | histograms (this process can be parallelized too, and will be 41 | parallelized in the future!) 42 | 43 | 6. Finally, the `scanner_set` shuts down and everything is de-allocated. 44 | 45 | ## Path Printing 46 | 47 | The API also defines functions for "path printing," which uses the scanners to decode and print a forensic path. 48 | 49 | |Path|Action| 50 | |----|------| 51 | |0-PRINT|Prints the contents of location 0| 52 | |0-PRINT/r|Raw dumps the length of the buffer in decimal, a \r\n, and then the contents of location 0| 53 | |0-PRINT/h|Hex dump the contents of location 0| 54 | 55 | 56 | ## Working with this repo. 57 | This repo can be used in three ways: 58 | 59 | 1. As a stand-alone repo for testing the API modules. 60 | 2. As a stand-alone repo for developing and testing scanners. 61 | 3. As a submodule repo to bulk_extractor or tcpflow 62 | 63 | The autotools implementation in this repo is designed to either be included in the parent's `configure.ac` file or to use its own `configure.ac` file. It makes a library called `be20_api.a` which can then be linked into the bulk_extractor program or the testing program. 64 | 65 | Use the `bootstrap.sh` program in *this* repo to compile the test programs.
66 | 67 | ### Help on git submodules 68 | 69 | Git submodules are complicated. Basically, the parent module is linked to a particular commit point, and not to a particular branch. This isolates parent modules from changes in the submodule until the parent module wants to accept the change. 70 | 71 | Update this repository to master: 72 | 73 | (cd be20_api; git pull origin master) 74 | 75 | # Major changes with BE20 v. 2.0: 76 | * `scanner_set` now controls the recursive scanning process. Scanner 77 | set holds the configuration information for the scan and the scanners. 78 | 79 | * sbuf now keeps track of the depth. 80 | * max_depth is now defined for the `scanner_set`, not per scanner. An 81 | individual scanner can just look at the depth in the sbuf and abort 82 | if the scanner thinks things have gone on too long. 83 | 84 | Scanner Activation 85 | ------------------ 86 | * scanner_commands is created from reading the command-line 87 | arguments. It contains enable and disable commands for each scanner. 88 | 89 | * For each scanner, we can then scan the scanner_commands to determine 90 | if the scanner should be initialized, and if we should, we 91 | initialize it. 92 | 93 | * The scanners are then sent 94 | 95 | BE20_API STATUS REPORT 96 | ====================== 97 | BE13_API has been renamed BE20_API and is largely complete. 98 | 99 | Next on the agenda is rewriting tcpflow to use be20_api from be13_api. 100 | -------------------------------------------------------------------------------- /word_and_context_list.h: -------------------------------------------------------------------------------- 1 | #ifndef WORD_AND_CONTEXT_LIST_H 2 | #define WORD_AND_CONTEXT_LIST_H 3 | 4 | /** 5 | * \addtogroup internal_interfaces 6 | * @{ 7 | * \file 8 | * word_and_context_list: 9 | * 10 | * A re-implementation of the basic stop list, regular expression 11 | * stop_list, and context-sensitive stop list.
/*
 * context records a feature together with the text immediately before and
 * after it. Typically this is used for stop lists and alert lists.
 */
class context {
public:
    /**
     * Split ctx around the first occurrence of feature.
     *
     * On success, before receives everything in ctx ahead of the feature and
     * after receives everything following it. If feature does not occur in ctx,
     * both outputs are cleared.
     *
     * @param feature  text to locate
     * @param ctx      surrounding text expected to contain feature
     * @param before   out: text preceding the feature
     * @param after    out: text following the feature
     */
    static void extract_before_after(const std::string& feature, const std::string& ctx, std::string& before,
                                     std::string& after) {
        // find() examines every start position, including the final one. The
        // previous hand-written window stopped one short and so missed a
        // feature located at the very end of ctx.
        const std::string::size_type pos = ctx.find(feature);
        if (pos != std::string::npos) {
            before = ctx.substr(0, pos);
            after = ctx.substr(pos + feature.size());
            return;
        }
        before.clear(); // can't be done
        after.clear();
    }

    // constructors to make a context with nothing before or after, with just a context, or with all three
    context(const std::string& f) : feature(f), before(), after() {}
    context(const std::string& f, const std::string& c) : feature(f), before(), after() {
        extract_before_after(f, c, before, after);
    }
    context(const std::string& f, const std::string& b, const std::string& a) : feature(f), before(b), after(a) {}
    std::string feature; // the feature itself
    std::string before;  // text preceding the feature
    std::string after;   // text following the feature
};
const class context& c) { 71 | os << "context[" << c.before << "|" << c.feature << "|" << c.after << "]"; 72 | return os; 73 | } 74 | inline bool operator==(const class context& a, const class context& b) { 75 | return (a.feature == b.feature) && (a.before == b.before) && (a.after == b.after); 76 | } 77 | 78 | /** 79 | * the object that holds the word and context list 80 | * They aren't atomic, but they are read-only. 81 | */ 82 | class word_and_context_list { 83 | private: 84 | typedef std::unordered_multimap stopmap_t; 85 | stopmap_t fcmap; // maps features to contexts; for finding them 86 | 87 | typedef std::unordered_set stopset_t; 88 | stopset_t context_set; // presence of a pair in fcmap 89 | 90 | regex_vector patterns; 91 | 92 | public: 93 | /** 94 | * rstrcmp is like strcmp, except it compares std::strings right-aligned 95 | * and only compares the minimum sized std::string of the two. 96 | */ 97 | static int rstrcmp(const std::string& a, const std::string& b); 98 | 99 | word_and_context_list() : fcmap(), context_set(), patterns() {} 100 | size_t size() { return fcmap.size() + patterns.size(); } 101 | void add_regex(const std::string& pat); // not threadsafe 102 | bool add_fc(const std::string& f, const std::string& c); // not threadsafe 103 | int readfile(const std::filesystem::path path, std::ostream& os = std::cout); // readfile with stats to os 104 | 105 | // return true if the probe with context is in the list or in the stopmap 106 | bool check(const std::string& probe, const std::string& before, const std::string& after) const; // threadsafe 107 | bool check_feature_context(const std::string& probe, const std::string& context) const; // threadsafe 108 | void dump(std::ostream &os = std::cout); 109 | }; 110 | 111 | /* like strcmp, but runs in reverse */ 112 | inline int word_and_context_list::rstrcmp(const std::string& a, const std::string& b) { 113 | size_t alen = a.size(); 114 | size_t blen = b.size(); 115 | size_t len = alen < blen ? 
alen : blen; 116 | for (size_t i = 0; i < len; i++) { 117 | size_t apos = alen - len + i; 118 | size_t bpos = blen - len + i; 119 | if (a[apos] < b[bpos]) return -1; 120 | if (a[apos] > b[bpos]) return 1; 121 | } 122 | return 0; 123 | } 124 | 125 | #endif 126 | -------------------------------------------------------------------------------- /feature_recorder_mhist.cpp.broken: -------------------------------------------------------------------------------- 1 | /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ 2 | 3 | #include "formatter.h" 4 | 5 | /** 6 | * write() is the main entry point for writing a feature at a given position with context. 7 | * write() checks the stoplist and escapes non-UTF8 characters, then calls write0(). 8 | */ 9 | void feature_recorder::write(const pos0_t& pos0, const std::string& feature_, const std::string& context_) { 10 | if (fs.flags.disabled) return; // disabled 11 | if (fs.flags.pedantic) { 12 | if (feature_.size() > def.max_feature_size) { 13 | throw std::runtime_error(Formatter() << "feature_recorder::write : feature_.size()=" << feature_.size()); 14 | } 15 | if (context_.size() > def.max_context_size) { 16 | throw std::runtime_error(Formatter() << "feature_recorder::write : context_.size()=" << context_.size()); 17 | } 18 | } 19 | 20 | std::string feature = feature_; 21 | std::string context = flags.no_context ? "" : context_; 22 | std::string* feature_utf8 = AtomicUnicodeHistogram::make_utf8(feature); // a utf8 feature 23 | 24 | quote_if_necessary(feature, context); 25 | 26 | if (feature.size() == 0 && fs.flags.pedantic) { 27 | throw std::runtime_error(Formatter() name << ": zero length feature at " << pos0); 28 | } 29 | 30 | /* First check to see if the feature is on the stop list. 31 | * Only do this if we have a stop_list_recorder (the stop list recorder itself 32 | * does not have a stop list recorder. If it did we would infinitely recurse. 
33 | */ 34 | if (flags.no_stoplist == false && fs.stop_list && fs.stop_list_recorder && 35 | fs.stop_list->check_feature_context(*feature_utf8, context)) { 36 | fs.stop_list_recorder->write(pos0, feature, context); 37 | delete feature_utf8; 38 | return; 39 | } 40 | 41 | /* The alert list is a special features that are called out. 42 | * If we have one of those, write it to the redlist. 43 | */ 44 | #if 0 45 | if (flags.no_alertlist==false 46 | && fs.alert_list 47 | && fs.alert_list->check_feature_context(*feature_utf8,context)) { 48 | std::string alert_fn = fs.get_outdir() + "/ALERTS_found.txt"; 49 | const std::lock_guard lock(Mr); // notice we are locking the alert list 50 | std::ofstream rf(alert_fn.c_str(),std::ios_base::app); 51 | if(rf.is_open()){ 52 | rf << pos0.shift(fs.offset_add).str() << '\t' << feature << '\t' << "\n"; 53 | } 54 | } 55 | #endif 56 | 57 | #if 0 58 | /* Support in-memory histograms */ 59 | for (const auto &it:mhistograms ){ 60 | const histogram_def &def = it.first; 61 | mhistogram_t *m = it.second; 62 | std::string new_feature = *feature_utf8; 63 | if (def.require.size()==0 || new_feature.find_first_of(def.require)!=std::string::npos){ 64 | /* If there is a pattern to use, use it to simplify the feature */ 65 | if (def.pattern.size()){ 66 | std::smatch sm; 67 | std::regex_search( new_feature, sm, def.reg); 68 | if (sm.size() == 0){ 69 | // no search match; avoid this feature 70 | new_feature = ""; 71 | } 72 | else { 73 | new_feature = sm.str(); 74 | } 75 | } 76 | if(new_feature.size()) m->add(new_feature,1); 77 | } 78 | } 79 | #endif 80 | 81 | /* Finally write out the feature and the context */ 82 | this->write0(pos0, feature, context); 83 | delete feature_utf8; 84 | } 85 | 86 | /** 87 | * Given a buffer, an offset into that buffer of the feature, and the length 88 | * of the feature, make the context and write it out. This is mostly used 89 | * for writing from within the lexical analyzers. 
90 | */ 91 | 92 | void feature_recorder::write_buf(const sbuf_t& sbuf, size_t pos, size_t len) { 93 | /* If we are in the margin, ignore; it will be processed again */ 94 | if (pos >= sbuf.pagesize && pos < sbuf.bufsize) { return; } 95 | 96 | if (pos >= sbuf.bufsize) { /* Sanity checks */ 97 | std::cerr << "*** write_buf: WRITE OUTSIDE BUFFER. " 98 | << " pos=" << pos << " sbuf=" << sbuf << "\n"; 99 | return; 100 | } 101 | 102 | /* Asked to write beyond bufsize; bring it in */ 103 | if (pos + len > sbuf.bufsize) { len = sbuf.bufsize - pos; } 104 | 105 | std::string feature = sbuf.substr(pos, len); 106 | std::string context; 107 | 108 | if (flags.no_context == false) { 109 | /* Context write; create a clean context */ 110 | size_t p0 = context_window < pos ? pos - context_window : 0; 111 | size_t p1 = pos + len + context_window; 112 | 113 | if (p1 > sbuf.bufsize) p1 = sbuf.bufsize; 114 | assert(p0 <= p1); 115 | context = sbuf.substr(p0, p1 - p0); 116 | } 117 | this->write(sbuf.pos0 + pos, feature, context); 118 | } 119 | -------------------------------------------------------------------------------- /sbuf_stream.cpp: -------------------------------------------------------------------------------- 1 | /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ 2 | #include "config.h" 3 | #include "sbuf_stream.h" 4 | 5 | /* 6 | * Stream interfaces 7 | */ 8 | sbuf_stream::sbuf_stream(const sbuf_t& sbuf_) : sbuf(sbuf_) { } 9 | sbuf_stream::~sbuf_stream() {} 10 | void sbuf_stream::seek(size_t offset_) { offset = offset_; } 11 | 12 | size_t sbuf_stream::tell() { return offset; } 13 | 14 | /* 15 | * unsigned integers, default little endian 16 | */ 17 | uint8_t sbuf_stream::get8u() { 18 | uint8_t value = sbuf.get8u(offset); 19 | offset++; 20 | return value; 21 | } 22 | uint16_t sbuf_stream::get16u() { 23 | uint16_t value = sbuf.get16u(offset); 24 | offset += 2; 25 | return value; 26 | } 27 | uint32_t sbuf_stream::get32u() { 28 | uint32_t value = sbuf.get32u(offset); 
29 | offset += 4; 30 | return value; 31 | } 32 | uint64_t sbuf_stream::get64u() { 33 | uint64_t value = sbuf.get64u(offset); 34 | offset += 8; 35 | return value; 36 | } 37 | 38 | /* 39 | * unsigned integers, big endian 40 | */ 41 | uint8_t sbuf_stream::get8uBE() { 42 | uint8_t value = sbuf.get8uBE(offset); 43 | offset++; 44 | return value; 45 | } 46 | uint16_t sbuf_stream::get16uBE() { 47 | uint16_t value = sbuf.get16uBE(offset); 48 | offset += 2; 49 | return value; 50 | } 51 | uint32_t sbuf_stream::get32uBE() { 52 | uint32_t value = sbuf.get32uBE(offset); 53 | offset += 4; 54 | return value; 55 | } 56 | uint64_t sbuf_stream::get64uBE() { 57 | uint64_t value = sbuf.get64uBE(offset); 58 | offset += 8; 59 | return value; 60 | } 61 | 62 | /* 63 | * unsigned integers, byte order specified 64 | */ 65 | uint8_t sbuf_stream::get8u(sbuf_t::byte_order_t bo) { 66 | uint8_t value = sbuf.get8u(offset, bo); 67 | offset++; 68 | return value; 69 | } 70 | uint16_t sbuf_stream::get16u(sbuf_t::byte_order_t bo) { 71 | uint16_t value = sbuf.get16u(offset, bo); 72 | offset += 2; 73 | return value; 74 | } 75 | uint32_t sbuf_stream::get32u(sbuf_t::byte_order_t bo) { 76 | uint32_t value = sbuf.get32u(offset, bo); 77 | offset += 4; 78 | return value; 79 | } 80 | uint64_t sbuf_stream::get64u(sbuf_t::byte_order_t bo) { 81 | uint64_t value = sbuf.get64u(offset, bo); 82 | offset += 8; 83 | return value; 84 | } 85 | 86 | /* 87 | * signed integers, default little endian 88 | */ 89 | int8_t sbuf_stream::get8i() { 90 | int8_t value = sbuf.get8i(offset); 91 | offset++; 92 | return value; 93 | } 94 | int16_t sbuf_stream::get16i() { 95 | int16_t value = sbuf.get16i(offset); 96 | offset += 2; 97 | return value; 98 | } 99 | int32_t sbuf_stream::get32i() { 100 | int32_t value = sbuf.get32i(offset); 101 | offset += 4; 102 | return value; 103 | } 104 | int64_t sbuf_stream::get64i() { 105 | int64_t value = sbuf.get64i(offset); 106 | offset += 8; 107 | return value; 108 | } 109 | 110 | /* 111 | * signed 
integers, big endian 112 | */ 113 | int8_t sbuf_stream::get8iBE() { 114 | int8_t value = sbuf.get8iBE(offset); 115 | offset++; 116 | return value; 117 | } 118 | int16_t sbuf_stream::get16iBE() { 119 | int16_t value = sbuf.get16iBE(offset); 120 | offset += 2; 121 | return value; 122 | } 123 | int32_t sbuf_stream::get32iBE() { 124 | int32_t value = sbuf.get32iBE(offset); 125 | offset += 4; 126 | return value; 127 | } 128 | int64_t sbuf_stream::get64iBE() { 129 | int64_t value = sbuf.get64iBE(offset); 130 | offset += 8; 131 | return value; 132 | } 133 | 134 | /* 135 | * signed integers, byte order specified 136 | */ 137 | int8_t sbuf_stream::get8i(sbuf_t::byte_order_t bo) { 138 | int8_t value = sbuf.get8i(offset, bo); 139 | offset++; 140 | return value; 141 | } 142 | int16_t sbuf_stream::get16i(sbuf_t::byte_order_t bo) { 143 | int16_t value = sbuf.get16i(offset, bo); 144 | offset += 2; 145 | return value; 146 | } 147 | int32_t sbuf_stream::get32i(sbuf_t::byte_order_t bo) { 148 | int32_t value = sbuf.get32i(offset, bo); 149 | offset += 4; 150 | return value; 151 | } 152 | int64_t sbuf_stream::get64i(sbuf_t::byte_order_t bo) { 153 | int64_t value = sbuf.get64i(offset, bo); 154 | offset += 8; 155 | return value; 156 | } 157 | 158 | /* 159 | * string readers 160 | */ 161 | std::string sbuf_stream::getUTF8(size_t num_octets_requested) 162 | { 163 | std::string utf8_string = sbuf.getUTF8(num_octets_requested); 164 | offset += utf8_string.length(); 165 | return utf8_string; 166 | } 167 | std::string sbuf_stream::getUTF8() { 168 | std::string ret = sbuf.getUTF8(offset); 169 | size_t num_bytes = ret.length(); 170 | // if anything was read then also skip \0 171 | if (num_bytes > 0) { 172 | num_bytes++; 173 | } 174 | offset += num_bytes; 175 | return ret; 176 | } 177 | 178 | std::wstring sbuf_stream::getUTF16(size_t code_units_requested) { 179 | std::wstring ret = sbuf.getUTF16(offset, code_units_requested); 180 | offset += ret.length() * 2; 181 | return ret; 182 | } 183 | 
std::wstring sbuf_stream::getUTF16() { 184 | std::wstring utf16_string = sbuf.getUTF16(offset); 185 | size_t num_bytes = utf16_string.length() * 2; 186 | if (num_bytes > 0) { 187 | // if anything was read then also skip \U0000 188 | num_bytes += 2; 189 | } 190 | offset += num_bytes; 191 | return utf16_string; 192 | } 193 | -------------------------------------------------------------------------------- /aftimer.h: -------------------------------------------------------------------------------- 1 | #ifndef __AFTIMER_H__ 2 | #define __AFTIMER_H__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "utils.h" 14 | 15 | /** 16 | * threadsafe timer. 17 | */ 18 | class aftimer { 19 | aftimer(const aftimer & s) = delete; 20 | aftimer & operator=(const aftimer &s) = delete; 21 | std::chrono::time_point t0 {}; 22 | std::atomic running {}; 23 | std::atomic elapsed_ns {}; // for all times we have started and stopped 24 | std::atomic last_ns {}; // time from when we last did a "start" 25 | public: 26 | static std::string now_str(std::string prefix="",std::string suffix=""); // return a high-resolution string as now. 27 | static std::string hms_str(long t); // turn a number of seconds into h:m:s 28 | static std::string hms_ns_str(uint64_t ns); // turn a number of nanoseconds into h:m:s 29 | static const uint64_t ns_per_s = 1000*1000*1000; // nanoseconds per second 30 | aftimer() {} 31 | 32 | void start(); // start the timer 33 | void stop(); // stop the timer 34 | void lap(); // note the time for elapsed_seconds() below 35 | 36 | uint64_t running_nanoseconds() const; // for how long have we been running? 
37 | double elapsed_seconds() const; // how long timer has been running; timer can be running from the beginning 38 | uint64_t elapsed_nanoseconds() const; 39 | uint64_t lap_seconds() const; // how long the timer is running this time 40 | double eta(double fraction_done) const; // calculate ETA in seconds, given fraction 41 | std::string elapsed_text() const; // how long we have been running 42 | std::string eta_text(double fraction_done) const; // h:m:s 43 | std::string eta_time(double fraction_done) const; // the actual time 44 | std::string eta_date(double fraction_done) const; // the actual date and time 45 | }; 46 | 47 | /* This code is from: 48 | * http://social.msdn.microsoft.com/Forums/en/vcgeneral/thread/430449b3-f6dd-4e18-84de-eebd26a8d668 49 | * and: 50 | * https://gist.github.com/ugovaretto/5875385 51 | */ 52 | 53 | // https://stackoverflow.com/questions/16177295/get-time-since-epoch-in-milliseconds-preferably-using-c11-chrono 54 | inline std::string aftimer::now_str(std::string prefix,std::string suffix) { 55 | //uint64_t nanoseconds_since_epoch = std::chrono::duration_cast(std::chrono::steady_clock::now()); 56 | //std::chrono::duration_cast(std::chrono::high_resolution_clock::now().time_since_epoch()).count(); 57 | uint64_t microseconds_since_epoch = std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count(); 58 | std::stringstream ss; 59 | ss << std::setprecision(4) << std::fixed << prefix << microseconds_since_epoch/1000 << suffix; 60 | return ss.str(); 61 | } 62 | 63 | inline std::string aftimer::hms_str(long t) { 64 | char buf[64]; 65 | int days = t / (60 * 60 * 24); 66 | 67 | t = t % (60 * 60 * 24); /* what's left */ 68 | 69 | int h = t / 3600; 70 | int m = (t / 60) % 60; 71 | int s = t % 60; 72 | buf[0] = 0; 73 | switch (days) { 74 | case 0: snprintf(buf, sizeof(buf), "%2d:%02d:%02d", h, m, s); break; 75 | case 1: snprintf(buf, sizeof(buf), "%d day, %2d:%02d:%02d", days, h, m, s); break; 76 | default: snprintf(buf, 
sizeof(buf), "%d days %2d:%02d:%02d", days, h, m, s); 77 | } 78 | return std::string(buf); 79 | } 80 | 81 | inline std::string aftimer::hms_ns_str(uint64_t ns) { 82 | return hms_str(ns / ns_per_s); 83 | } 84 | 85 | inline void aftimer::start() { 86 | assert (running == false); 87 | t0 = std::chrono::steady_clock::now(); 88 | running = true; 89 | } 90 | 91 | inline uint64_t aftimer::running_nanoseconds() const { 92 | auto v = std::chrono::duration_cast(std::chrono::steady_clock::now() - t0 ); 93 | return v.count(); 94 | } 95 | 96 | inline void aftimer::stop() { 97 | assert (running==true); 98 | last_ns = running_nanoseconds(); 99 | elapsed_ns += last_ns; 100 | running = false; 101 | } 102 | 103 | inline void aftimer::lap() { 104 | stop(); 105 | start(); 106 | } 107 | 108 | inline uint64_t aftimer::elapsed_nanoseconds() const { 109 | if (running) { 110 | return elapsed_ns + running_nanoseconds(); 111 | } else { 112 | return elapsed_ns; 113 | } 114 | } 115 | 116 | inline double aftimer::elapsed_seconds() const { 117 | return elapsed_nanoseconds() / double(ns_per_s); 118 | } 119 | 120 | inline std::string aftimer::elapsed_text() const { 121 | return hms_str((int)elapsed_seconds()); 122 | } 123 | 124 | /** 125 | * returns the number of seconds until the job is complete. 126 | */ 127 | inline double aftimer::eta(double fraction_done) const { 128 | double t = elapsed_seconds(); 129 | if (t <= 0) return -1; // can't figure it out 130 | if (fraction_done <= 0) return -1; // can't figure it out 131 | return (t * 1.0 / fraction_done - t); 132 | } 133 | 134 | /** 135 | * Retuns the number of hours:minutes:seconds until the job is done. 136 | */ 137 | inline std::string aftimer::eta_text(double fraction_done) const { 138 | double e = eta(fraction_done); 139 | if (e < 0) return std::string("n/a"); // can't figure it out 140 | return hms_str((long)e); 141 | } 142 | 143 | /** 144 | * Returns the time when data is due. 
145 | */ 146 | inline std::string aftimer::eta_time(double fraction_done) const { 147 | time_t t = time_t(eta(fraction_done)) + time(0); 148 | struct tm tm; 149 | localtime_r(&t, &tm); 150 | char buf[64]; 151 | snprintf(buf, sizeof(buf), "%02d:%02d:%02d", tm.tm_hour, tm.tm_min, tm.tm_sec); 152 | return std::string(buf); 153 | } 154 | 155 | inline std::string aftimer::eta_date(double fraction_done) const { 156 | time_t t = time_t(eta(fraction_done)) + time(0); 157 | struct tm tm; 158 | localtime_r(&t, &tm); 159 | char buf[64]; 160 | snprintf(buf, sizeof(buf), "%04d-%02d-%02d %02d:%02d:%02d", 161 | tm.tm_year+1900, tm.tm_mon+1, tm.tm_mday, 162 | tm.tm_hour, tm.tm_min, tm.tm_sec); 163 | return std::string(buf); 164 | } 165 | 166 | #endif 167 | -------------------------------------------------------------------------------- /histogram_def.h: -------------------------------------------------------------------------------- 1 | #ifndef HISTOGRAM_DEF_H 2 | #define HISTOGRAM_DEF_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "unicode_escape.h" 10 | 11 | /** 12 | * histogram_def defines the histograms that will be made by a feature recorder. 13 | * If the mhistogram is set, the histogram is generated when features are recorded 14 | * and kept in memory. If mhistogram is not set, the histogram is generated when the feature recorder is closed. 
15 | */ 16 | 17 | struct histogram_def { 18 | struct flags_t { 19 | flags_t(const flags_t& a) { 20 | this->lowercase = a.lowercase; 21 | this->numeric = a.numeric; 22 | this->require_feature = a.require_feature; 23 | this->require_context = a.require_context; 24 | }; 25 | 26 | flags_t& operator=(const flags_t& a) { 27 | this->lowercase = a.lowercase; 28 | this->numeric = a.numeric; 29 | this->require_feature = a.require_feature; 30 | this->require_context = a.require_context; 31 | return *this; 32 | }; 33 | 34 | bool operator<(const flags_t& a) const { 35 | if (this->lowercase < a.lowercase) return true; 36 | if (this->lowercase > a.lowercase) return false; 37 | if (this->numeric < a.numeric) return true; 38 | if (this->numeric > a.numeric) return false; 39 | 40 | if (this->require_feature < a.require_feature) return true; 41 | if (this->require_feature > a.require_feature) return false; 42 | if (this->require_context < a.require_context) return true; 43 | if (this->require_context > a.require_context) return false; 44 | return false; 45 | } 46 | 47 | bool operator==(const flags_t& a) const { 48 | return (this->lowercase == a.lowercase) && (this->numeric == a.numeric) && (this->require_feature==a.require_feature) && (this->require_context==a.require_context); 49 | } 50 | 51 | flags_t(){}; 52 | flags_t(bool lowercase_, bool numeric_) : lowercase(lowercase_), numeric(numeric_) {} 53 | bool lowercase {false}; // make all flags lowercase 54 | bool numeric {false}; // extract digits only 55 | bool require_feature {true}; // require text is applied to feature 56 | bool require_context {false}; // require text is applied to context 57 | }; 58 | 59 | /** 60 | * @param feature - the feature file to histogram (no .txt) 61 | * @param pattern - the regular expression to extract. 
62 | * @param suffix - the suffix to add to the histogram file after feature name before .txt 63 | * @param flags - any flags (see above) 64 | * @param require - require this string on the line (usually in context) 65 | */ 66 | 67 | histogram_def(const std::string& name_, 68 | const std::string& feature_, // which feature file to use 69 | const std::string& pattern_, // which pattern to abstract 70 | const std::string& require_, // text required on the line 71 | const std::string& suffix_, // which suffix to add to the feature file name for the histogram 72 | const struct flags_t& flags_); 73 | std::string name{}; // name of the histogram 74 | std::string feature{}; // feature file to extract 75 | std::string 76 | pattern{}; // regular expression used to extract feature substring from feature. "" means use the entire feature 77 | mutable std::regex reg{}; // the compiled regular expression. 78 | std::string require{}; // text required somewhere on the feature line. Sort of like grep. used for IP histograms 79 | std::string suffix{}; // suffix to append to histogram report name 80 | 81 | /* flags */ 82 | struct flags_t flags {}; 83 | 84 | /* copy constructor (memberwise copy) */ 85 | histogram_def(const histogram_def& a) { 86 | this->name = a.name; 87 | this->feature = a.feature; 88 | this->pattern = a.pattern; 89 | this->reg = a.reg; 90 | this->require = a.require; 91 | this->suffix = a.suffix; 92 | this->flags = a.flags; 93 | }; 94 | 95 | /* assignment operator */ 96 | histogram_def& operator=(const histogram_def& a) { 97 | this->name = a.name; 98 | this->feature = a.feature; 99 | this->pattern = a.pattern; 100 | this->reg = a.reg; 101 | this->require = a.require; 102 | this->suffix = a.suffix; 103 | this->flags = a.flags; 104 | return *this; 105 | } 106 | 107 | bool operator==(const histogram_def& a) const { 108 | return (this->name == a.name) && (this->feature == a.feature) && (this->pattern == a.pattern) && 109 | (this->require == a.require) && 
(this->suffix == a.suffix) && (this->flags == a.flags); 110 | } 111 | 112 | bool operator!=(const histogram_def& a) const { return !(*this == a); } 113 | 114 | /* comparator, so we can have a functioning map and set classes.' 115 | * ignores reg. 116 | */ 117 | bool operator<(const histogram_def& a) const { 118 | if (this->name < a.name) return true; 119 | if (this->name > a.name) return false; 120 | if (this->feature < a.feature) return true; 121 | if (this->feature > a.feature) return false; 122 | if (this->pattern < a.pattern) return true; 123 | if (this->pattern > a.pattern) return false; 124 | if (this->require < a.require) return true; 125 | if (this->require > a.require) return false; 126 | if (this->suffix < a.suffix) return true; 127 | if (this->suffix > a.suffix) return false; 128 | if (this->flags < a.flags) return true; 129 | return false; 130 | } 131 | 132 | /* Match and extract: 133 | * If the string matches this histogram, return true and optionally 134 | * set match to Extract and match: Does this string match 135 | */ 136 | 137 | bool match(std::u32string u32key, std::string* displayString, const std::string &context) const; 138 | bool match(std::string u32key, std::string* displayString, const std::string &context) const; 139 | }; 140 | 141 | std::ostream& operator<<(std::ostream& os, const histogram_def::flags_t& f); 142 | std::ostream& operator<<(std::ostream& os, const histogram_def& hd); 143 | 144 | #endif 145 | -------------------------------------------------------------------------------- /scanner_config.h: -------------------------------------------------------------------------------- 1 | /* 2 | * scanner_config.h: 3 | * 4 | * class to hold the full configuration of the scanner_set and the feature recorders. 5 | * 6 | * Includes a set of name=value pairs from the command line and the list of all scanners that 7 | * are enabled or disabled. 8 | * 9 | * This class is also used to build the help string. 
10 | * 11 | * All of the scanners get the same config, so the names that the scanners want need to be unique. 12 | * We could have adopted a system where each scanner had its own configuration space, but we didn't. 13 | * Scanner histograms are added to 'histograms' by machinery. 14 | */ 15 | 16 | #ifndef _SCANNER_CONFIG_H_ 17 | #define _SCANNER_CONFIG_H_ 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | #include "utils.h" 27 | 28 | /* There is only one scanner-config object. It is used for all of the scanners 29 | */ 30 | struct scanner_config { 31 | /** 32 | * Commands whether to enable or disable a scanner. 33 | * Typically created from parsing command-line arguments 34 | */ 35 | struct scanner_command { 36 | static inline const std::string ALL_SCANNERS = "all"; 37 | enum command_t { DISABLE, ENABLE }; 38 | scanner_command(const scanner_command& sc) : scannerName(sc.scannerName), command(sc.command){}; 39 | scanner_command(const std::string& scannerName_, scanner_command::command_t c) 40 | : scannerName(scannerName_), command(c){}; 41 | std::string scannerName{}; 42 | command_t command{}; 43 | /* copy assignment */ 44 | scanner_command& operator=(const scanner_command& a) { 45 | this->scannerName = a.scannerName; 46 | this->command = a.command; 47 | return *this; 48 | } 49 | }; 50 | 51 | private: 52 | /* The global configuration */ 53 | typedef std::map config_t; // configuration for scanner passed in 54 | config_t namevals{}; // (input) name=val map 55 | std::string global_help_options {""}; 56 | // The commands for those scanners (enable, disable, options, etc. 
57 | typedef std::vector scanner_commands_t; 58 | scanner_commands_t scanner_commands {}; 59 | 60 | public: 61 | const scanner_commands_t get_scanner_commands() { 62 | return static_cast(scanner_commands); 63 | } 64 | void set_config(std::string name, std::string val) { 65 | namevals[name] = val; 66 | } 67 | std::string get_help() const { return global_help_options;} 68 | 69 | template void get_global_config(const std::string& name, T* val, const std::string& help) { 70 | std::stringstream s; 71 | s << " -S " << name << "=" << *val << " " << help << " (" << name << ")\n"; 72 | global_help_options += s.str(); // add the help in 73 | 74 | auto it = namevals.find(name); 75 | if (it != namevals.end() && val) { 76 | set_from_string(val, it->second); 77 | } 78 | } 79 | 80 | /* Find options */ 81 | struct { 82 | std::vector files {}; // accumulates pattern files 83 | std::vector patterns {}; // accumulates cmdline patterns 84 | } FindOpts {}; 85 | 86 | bool find_opts_empty() const { 87 | return FindOpts.files.empty() && FindOpts.patterns.empty(); 88 | } 89 | 90 | // Find interface 91 | const std::vector &find_patterns() const { return FindOpts.patterns; } 92 | const std::vector &find_files() const { return FindOpts.files; } 93 | void add_find_pattern(std::string pattern) { FindOpts.patterns.push_back(pattern);} 94 | void add_find_path(std::filesystem::path path) { FindOpts.files.push_back(path);} 95 | 96 | 97 | size_t context_window_default{16}; // global option 98 | uint64_t offset_add{0}; // add this number to the first offset in every feature file (used for parallelism) 99 | std::filesystem::path banner_file{}; // add the contents of this file to the top of every feature file 100 | static inline const uint32_t DEFAULT_MAX_DEPTH {12}; 101 | static inline const uint32_t DEFAULT_MAX_NGRAM {10}; 102 | virtual ~scanner_config(){}; 103 | scanner_config(){}; 104 | scanner_config(const scanner_config&) = default; 105 | std::filesystem::path input_fname {NO_INPUT}; // where 
input comes from 106 | std::filesystem::path outdir {NO_OUTDIR}; // where output goes 107 | std::string hash_algorithm {"sha1"}; // which hash algorithm are using; default to SHA1 108 | 109 | bool allow_recurse { true }; // can be turned off for testing 110 | 111 | inline static const std::string NO_INPUT = ""; // 'filename' indicator that the FRS has no input file 112 | inline static const std::string NO_OUTDIR = ""; // 'dirname' indicator that the FRS produces no file output 113 | inline static const std::string CARVE_MODE_SUFFIX = "_carve_mode"; 114 | 115 | std::string get_nameval(std::string name) const { 116 | auto it = namevals.find(name); 117 | return it != namevals.end() ? it->second : ""; 118 | } 119 | 120 | int get_carve_mode(const std::string name) const { 121 | std::string option_name = name + CARVE_MODE_SUFFIX; 122 | config_t::const_iterator it = namevals.find(option_name); 123 | if (it == namevals.end()) return -1; 124 | return std::stoi( std::string(it->second)); 125 | } 126 | 127 | /* Set configuration; added to the static config */ 128 | uint32_t max_depth {DEFAULT_MAX_DEPTH}; 129 | uint32_t max_ngram {DEFAULT_MAX_NGRAM}; // maximum ngram size to scan for 130 | 131 | /* Control which scanners are enabled */ 132 | // enable/disable a specific scanner 133 | void push_scanner_command(const std::string& scannerName, scanner_command::command_t c) { 134 | scanner_commands.push_back(scanner_command(scannerName, c)); 135 | } 136 | void enable_all_scanners() { 137 | push_scanner_command(scanner_command::ALL_SCANNERS, scanner_command::ENABLE); 138 | } 139 | void disable_all_scanners() { 140 | push_scanner_command(scanner_command::ALL_SCANNERS, scanner_command::DISABLE); 141 | } 142 | }; 143 | 144 | #endif 145 | -------------------------------------------------------------------------------- /atomic_unicode_histogram.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * atomic_unicode_histogram.cpp: 3 | * Maintain 
a histogram for Unicode strings provided with either UTF-8 or UTF-16 encodings. 4 | * Track number of UTF-16 encodings provided. 5 | * 6 | * Currently, all operations are done on UTF-8 values, because the C++17 regular expression package 7 | * does not handle 32-bit regular expressions. 8 | */ 9 | 10 | #include "unicode_escape.h" 11 | #include "utf8.h" 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #include "atomic_unicode_histogram.h" 20 | 21 | std::ostream& operator<<(std::ostream& os, const AtomicUnicodeHistogram::FrequencyReportVector& rep) { 22 | for (const auto& it : rep) { 23 | os << it; 24 | } 25 | return os; 26 | } 27 | 28 | /* Output is in UTF-8 */ 29 | std::ostream& operator<<(std::ostream& os, const AtomicUnicodeHistogram::auh_t::item& e) { 30 | os << "n=" << e.value->count << "\t" << validateOrEscapeUTF8(e.key, true, false, false); 31 | if (e.value->count16 > 0) os << "\t(utf16=" << e.value->count16 << ")"; 32 | os << "\n"; 33 | return os; 34 | } 35 | 36 | /* Create a histogram report. 37 | * @param topN - if >0, return only this many. 38 | * Return only the topN. 39 | */ 40 | std::vector AtomicUnicodeHistogram::makeReport(size_t topN) 41 | { 42 | const std::lock_guard lock(M); 43 | std::vector ret = h.items(); 44 | 45 | std::sort(ret.begin(), ret.end(), AtomicUnicodeHistogram::histogram_compare); // reverse sort 46 | 47 | /* If we only want some of them, delete the extra */ 48 | if ((topN > 0) && (topN < ret.size())) { 49 | ret.erase( ret.begin()+topN, ret.end()); 50 | } 51 | return ret; 52 | } 53 | 54 | /** 55 | * Takes a string (the key) passed in, figure out what it is, and add it to a unicode histogram. 56 | * Typically it is going to be UTF16 or UTF8. 57 | * Regular expressions are applied, if requested, in the UTF32 world. 58 | * 59 | * @param - key - either a UTF8 or UTF16 string. 60 | * If the string appears to be UTF16, convert it to UTF-8 and note that it was converted. 
61 | * 62 | * def.flags.digits - extract the digits first and throw away the rest. 63 | * def.flags.lower - also convert to lowercase using Unicode rules. 64 | */ 65 | 66 | // https://stackoverflow.com/questions/37989081/how-to-use-unicode-range-in-c-regex 67 | 68 | // debug_histogram_malloc_fail_frequency allows us to simulate low-memory situations for testing the code. 69 | uint32_t AtomicUnicodeHistogram::debug_histogram_malloc_fail_frequency = 0; 70 | void AtomicUnicodeHistogram::clear() 71 | { 72 | const std::lock_guard lock(M); 73 | h.clear(); 74 | } 75 | 76 | // low-level add after key has been converted to UTF8 77 | void AtomicUnicodeHistogram::add0(const std::string& u8key, const std::string &context, bool found_utf16) 78 | { 79 | std::string displayString; 80 | 81 | if (def.match(u8key, &displayString, context)) { 82 | 83 | if (debug) std::cerr << " AtomicUnicodeHistogram::add0 match u8key=" << u8key << std::endl; 84 | 85 | /* Escape as necessary */ 86 | displayString = validateOrEscapeUTF8(displayString, true, true, false); 87 | 88 | /* For debugging low-memory handling logic, 89 | * specify DEBUG_MALLOC_FAIL to make malloc occasionally fail (not yet implemented) 90 | */ 91 | if (debug_histogram_malloc_fail_frequency) { 92 | const std::lock_guard lock(M); 93 | if ((h.size() % debug_histogram_malloc_fail_frequency) == (debug_histogram_malloc_fail_frequency - 1)) { 94 | throw std::bad_alloc(); 95 | } 96 | } 97 | 98 | /* Add the key to the histogram. Note that this is threadsafe */ 99 | const std::lock_guard lock(M); 100 | h[displayString].count++; 101 | if (found_utf16) { 102 | h[displayString].count16++; // track how many UTF16s were converted 103 | } 104 | if (debug) std::cerr << " AtomicUnicodeHistogram::add0 h[" < 0) { ss << path << "-"; } 83 | ss << offset; 84 | return ss.str(); 85 | } 86 | bool isRecursive() const { // is there a path? 
87 | return path.size() > 0; 88 | } 89 | bool contains(const std::string &name) const { // does it contain this name? 90 | return (path.find(name) != std::string::npos); 91 | } 92 | 93 | std::string firstPart() const { // the first part of the path 94 | size_t p = path.find('-'); 95 | if (p == std::string::npos) return std::string(""); 96 | return path.substr(0, p); 97 | } 98 | std::string lastAddedPart() const { // the last part of the path, before the offset 99 | size_t p = path.rfind('-'); 100 | if (p == std::string::npos) return std::string(""); 101 | return path.substr(p + 1); 102 | } 103 | std::string alphaPart() const { // return the non-numeric parts, with /'s between each 104 | std::string desc; 105 | bool inalpha = false; 106 | /* Now get the std::string part of pos0 */ 107 | for (const auto &it : path) { 108 | if ((it) == '-') { 109 | if (desc.size() > 0 && desc.at(desc.size() - 1) != '/') desc += '/'; 110 | inalpha = false; 111 | } 112 | if (isalpha(it) || (inalpha && isdigit(it))) { 113 | desc += it; 114 | inalpha = true; 115 | } 116 | } 117 | return desc; 118 | } 119 | uint64_t imageOffset() const { // return the offset from start of disk 120 | if (path.size() > 0) return stoi64(path); 121 | return offset; 122 | } 123 | 124 | /** 125 | * Return a new position that's been shifted by an offset 126 | */ 127 | pos0_t shift(int64_t s) const { 128 | if (s == 0) return *this; 129 | size_t p = path.find('-'); 130 | if (p == std::string::npos) { // no path 131 | return pos0_t("", offset + s); 132 | } 133 | /* Figure out the value of the shift */ 134 | int64_t baseOffset = stoi64(path.substr(0, p - 1)); 135 | std::stringstream ss; 136 | ss << (baseOffset + s) << path.substr(p); 137 | return pos0_t(ss.str(), offset); 138 | } 139 | }; 140 | 141 | /** iostream support for the pos0_t */ 142 | inline std::ostream& operator<<(std::ostream& os, const class pos0_t& pos0) { 143 | os << "(" << pos0.path << "|" << pos0.offset << ")"; 144 | return os; 145 | } 146 | 147 | 
/** Append a string (subdir). 148 | * The current offset is a prefix to the subdir. 149 | */ 150 | inline class pos0_t operator+(pos0_t pos, const std::string& subdir) { 151 | std::stringstream ss; 152 | ss << pos.path << (pos.path.size() > 0 ? "-" : "") << pos.offset << "-" << subdir; 153 | return pos0_t(ss.str(), 0); 154 | }; 155 | 156 | /** Adding an offset */ 157 | inline class pos0_t operator+(pos0_t pos, size_t delta) { 158 | return pos0_t(pos.path, pos.offset + delta); 159 | }; 160 | 161 | /** Subtracting an offset */ 162 | inline class pos0_t operator-(pos0_t pos, size_t delta) { 163 | if (delta > pos.offset) { 164 | throw std::runtime_error("attempt to subtract a delta from an pos0_t that is larger that pos.offset"); 165 | } 166 | return pos0_t(pos.path, pos.offset - delta); 167 | }; 168 | 169 | /** \name Comparision operations 170 | * @{ 171 | */ 172 | inline bool operator<(const class pos0_t& pos0, const class pos0_t& pos1) { 173 | if (pos0.path.size() == 0 && pos1.path.size() == 0) return pos0.offset < pos1.offset; 174 | if (pos0.path == pos1.path) return pos0.offset < pos1.offset; 175 | return pos0.path < pos1.path; 176 | }; 177 | 178 | inline bool operator>(const class pos0_t& pos0, const class pos0_t& pos1) { 179 | if (pos0.path.size() == 0 && pos1.path.size() == 0) return pos0.offset > pos1.offset; 180 | if (pos0.path == pos1.path) return pos0.offset > pos1.offset; 181 | return pos0.path > pos1.path; 182 | }; 183 | 184 | inline bool operator==(const class pos0_t& pos0, const class pos0_t& pos1) { 185 | return pos0.path == pos1.path && pos0.offset == pos1.offset; 186 | }; 187 | 188 | inline bool operator!=(const class pos0_t& pos0, const class pos0_t& pos1) { return !(pos0 == pos1); }; 189 | /** @} */ 190 | #endif 191 | -------------------------------------------------------------------------------- /feature_recorder_sql.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Feature recorder mods for 
writing features into an SQLite3 database. 3 | */ 4 | 5 | /* http://blog.quibb.org/2010/08/fast-bulk-inserts-into-sqlite/ */ 6 | 7 | #include "config.h" 8 | 9 | #ifdef HAVE_SQLITE3_H 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include "feature_recorder_set.h" 17 | #include "feature_recorder_sql.h" 18 | #include "sbuf.h" 19 | 20 | feature_recorder_sql::feature_recorder_sql(class feature_recorder_set& fs_, const feature_recorder_def def_) 21 | : feature_recorder(fs_, def_) { 22 | /* 23 | * If the feature recorder set is disabled, just return. 24 | */ 25 | if (fs.flags.disabled) return; 26 | /* write to a database? Create tables if necessary and create a prepared statement */ 27 | 28 | #if 0 29 | char buf[1024]; 30 | fs.db_create_table(name); 31 | snprintf( buf, sizeof(buf), db_insert_stmt,name.c_str() ); 32 | bs = new besql_stmt( fs.db3, buf ); 33 | #endif 34 | } 35 | 36 | feature_recorder_sql::~feature_recorder_sql() {} 37 | 38 | #if 0 39 | #define DB_INSERT_STMT \ 40 | "INSERT INTO f_%s (offset,path,feature_eutf8,feature_utf8,context_eutf8) VALUES (?1, ?2, ?3, ?4, ?5)" 41 | const char *feature_recorder::db_insert_stmt = DB_INSERT_STMT; 42 | 43 | void feature_recorder::besql_stmt::insert_feature(const pos0_t &pos, 44 | const std::string &feature, 45 | const std::string &feature8, const std::string &context) 46 | { 47 | assert(stmt!=0); 48 | const std::lock_guard lock(Mstmt); // grab a lock 49 | const std::string &path = pos.str(); 50 | sqlite3_bind_int64(stmt, 1, pos.imageOffset()); // offset 51 | sqlite3_bind_text(stmt, 2, path.data(), path.size(), SQLITE_STATIC); // path 52 | sqlite3_bind_text(stmt, 3, feature.data(), feature.size(), SQLITE_STATIC); 53 | sqlite3_bind_text(stmt, 4, feature8.data(), feature8.size(), SQLITE_STATIC); 54 | sqlite3_bind_text(stmt, 5, context.data(), context.size(), SQLITE_STATIC); 55 | if (sqlite3_step(stmt) != SQLITE_DONE) { 56 | fprintf(stderr,"sqlite3_step failed\n"); 57 | } 58 | sqlite3_reset(stmt); 59 | 
}; 60 | 61 | feature_recorder::besql_stmt::besql_stmt(sqlite3 *db3,const char *sql):Mstmt(),stmt() 62 | { 63 | assert(db3!=0); 64 | assert(sql!=0); 65 | sqlite3_prepare_v2(db3,sql, strlen(sql), &stmt, NULL); 66 | assert(stmt!=0); 67 | } 68 | 69 | feature_recorder::besql_stmt::~besql_stmt() 70 | { 71 | assert(stmt!=0); 72 | sqlite3_finalize(stmt); 73 | stmt = 0; 74 | } 75 | 76 | /* Hook for writing feature to SQLite3 database */ 77 | void feature_recorder::write0_sqlite3(const pos0_t &pos0,const std::string &feature,const std::string &context) 78 | { 79 | /** 80 | * Note: this is not very efficient, passing through a quoted feature and then unquoting it. 81 | * We could make this more efficient. 82 | */ 83 | std::string *feature8 = AtomicUnicodeHistogram::convert_utf16_to_utf8(feature_recorder::unquote_string(feature)); 84 | assert(bs!=0); 85 | bs->insert_feature(pos0,feature, 86 | feature8 ? *feature8 : feature, 87 | flag_set(feature_recorder::FLAG_NO_CONTEXT) ? "" : context); 88 | if (feature8) delete feature8; 89 | } 90 | 91 | /*** SQL Routines Follow *** 92 | * 93 | * Time results with ubnist1 on R4: 94 | * no SQL - 79 seconds 95 | * no pragmas - 651 seconds 96 | * "PRAGMA synchronous = OFF", - 146 second 97 | * "PRAGMA synchronous = OFF", "PRAGMA journal_mode=MEMORY", - 79 seconds 98 | * 99 | * Time with domexusers: 100 | * no SQL - 101 | */ 102 | 103 | #define SQLITE_EXTENSION ".sqlite" 104 | #ifndef SQLITE_DETERMINISTIC 105 | #define SQLITE_DETERMINISTIC 0 106 | #endif 107 | 108 | /* This creates the base histogram. 
Note that the SQL fails if the histogram exists */ 109 | static const char *schema_hist[] = { 110 | "CREATE TABLE h_%s (count INTEGER(12), feature_utf8 TEXT)", 111 | "CREATE INDEX h_%s_idx1 ON h_%s(count)", 112 | "CREATE INDEX h_%s_idx2 ON h_%s(feature_utf8)", 113 | 0}; 114 | 115 | /* This performs the histogram operation */ 116 | static const char *schema_hist1[] = { 117 | "INSERT INTO h_%s select COUNT(*),feature_utf8 from f_%s GROUP BY feature_utf8", 118 | 0}; 119 | 120 | static const char *schema_hist2[] = { 121 | "INSERT INTO h_%s select sum(count),BEHIST(feature_utf8) from h_%s where BEHIST(feature_utf8)!='' GROUP BY BEHIST(feature_utf8)", 122 | 0}; 123 | 124 | 125 | 126 | void feature_recorder::dump_histogram_sqlite3(const histogram_def &def,void *user,feature_recorder::dump_callback_t cb) const 127 | { 128 | /* First check to see if there exists a feature histogram summary. If not, make it */ 129 | std::string query = "SELECT name FROM sqlite_master WHERE type='table' AND name='h_" + def.feature +"'"; 130 | char *errmsg=0; 131 | int rowcount=0; 132 | if ( sqlite3_exec(fs.db3,query.c_str(),callback_counter,&rowcount,&errmsg)){ 133 | std::cerr << "sqlite3: " << errmsg << "\n"; 134 | return; 135 | } 136 | if (rowcount==0){ 137 | const char *feature = def.feature.c_str(); 138 | fs.db_send_sql( fs.db3, schema_hist, feature, feature); // creates the histogram 139 | fs.db_send_sql( fs.db3, schema_hist1, feature, feature); // creates the histogram 140 | } 141 | /* Now create the summarized histogram for the regex, if it is not existing, but only if we have 142 | * sqlite3_create_function_v2 143 | */ 144 | if (def.pattern.size()>0){ 145 | /* Create the database where we will add the histogram */ 146 | std::string hname = def.feature + "_" + def.suffix; 147 | 148 | /* Remove any "-" characters if present */ 149 | for(size_t i=0;iflag_notset(feature_recorder::FLAG_NO_FEATURES_SQL) ) { 177 | write0_sqlite3( pos0, feature, context); 178 | } 179 | } 180 | #endif 181 | 
182 | #endif 183 | -------------------------------------------------------------------------------- /pcap_fake.cpp: -------------------------------------------------------------------------------- 1 | /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ 2 | 3 | /* 4 | * pcap_fake.cpp 5 | * A fake libpcap implementation that can only read files without a filter. 6 | */ 7 | 8 | // config.h is needed solely to find out if we need pcap_fake.h or not. 9 | #include "config.h" 10 | 11 | #ifndef HAVE_LIBPCAP 12 | #include "pcap_fake.h" 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #ifdef _WIN32 20 | #define SET_BINMODE(f) _setmode(_fileno(f), _O_BINARY) 21 | #else 22 | #define SET_BINMODE(f) /* ignore */ 23 | #endif 24 | 25 | /* pcap_fake's struct pcap just keeps track of the file that was opened and 26 | * whether or not it was byteswapped. 27 | */ 28 | struct pcap { 29 | FILE* fp; // input file we are reading from 30 | int swapped; // whether magic number was swapped? 31 | uint32_t linktype; 32 | bool error; // an error occured 33 | bool break_loop; // break_loop was called 34 | bool must_close; 35 | char err_buf[128]; 36 | uint8_t* pktbuf; 37 | }; 38 | 39 | char* pcap_geterr(pcap_t* p) { 40 | snprintf(p->err_buf, sizeof(p->err_buf), "not implemented in pcap_fake"); 41 | return p->err_buf; 42 | } 43 | 44 | /** 45 | * pcap_open_offline() 46 | * -- "The name "-" is a synonym for stdin" (pcap manual) 47 | * -- allocate the pcap_t structure 48 | * -- open a pcap capture file. 49 | */ 50 | pcap_t* pcap_open_offline(const char* fname, char* errbuf) { 51 | FILE* fp = strcmp(fname, "-") == 0 ? 
stdin : fopen(fname, "rb"); 52 | if (!fp) { 53 | snprintf(errbuf, PCAP_ERRBUF_SIZE, "%s:%s", fname, strerror(errno)); 54 | return 0; 55 | } 56 | pcap_t* p = pcap_fopen_offline(fp, errbuf); 57 | if (p && p->fp != stdin) p->must_close = true; 58 | return p; 59 | } 60 | 61 | char* pcap_lookupdev(char*) // not implemented 62 | { 63 | fprintf(stderr, "pcap_fake.cpp:pcap_lookupdev: tcpflow was compiled without LIBPCAP. Will not live capture.\n"); 64 | return 0; 65 | } 66 | 67 | pcap_t* pcap_open_live(const char*, int, int, int, char*) { 68 | fprintf(stderr, "pcap_fake.cpp:pcap_open_live: tcpflow was compiled without LIBPCAP. Will not live capture.\n"); 69 | return 0; 70 | } 71 | 72 | inline uint32_t swap4(uint32_t x) { 73 | return (((x & 0xff000000) >> 24) | ((x & 0x00ff0000) >> 8) | ((x & 0x0000ff00) << 8) | ((x & 0x000000ff) << 24)); 74 | } 75 | 76 | inline uint32_t swap2(uint16_t x) { return (((x & 0xff00) >> 8) | ((x & 0x00ff) << 8)); } 77 | 78 | pcap_t* pcap_fopen_offline(FILE* fp, char* errbuf) { 79 | SET_BINMODE(fp); 80 | bool swapped = false; 81 | struct pcap_file_header header; 82 | if (fread(&header, sizeof(header), 1, fp) != 1) { 83 | snprintf(errbuf, PCAP_ERRBUF_SIZE, "Cannot read pcap header"); 84 | return 0; // cannot read header 85 | } 86 | if (header.magic == 0xd4c3b2a1) { // check for swap 87 | header.magic = swap4(header.magic); 88 | header.version_major = swap2(header.version_major); 89 | header.version_minor = swap2(header.version_minor); 90 | header.thiszone = swap4(header.thiszone); 91 | header.sigfigs = swap4(header.sigfigs); 92 | header.snaplen = swap4(header.snaplen); 93 | header.linktype = swap4(header.linktype); 94 | swapped = true; 95 | } 96 | if (header.magic != 0xa1b2c3d4) { 97 | snprintf(errbuf, PCAP_ERRBUF_SIZE, "Cannot decode pcap header 0x%x; swapped=%d", header.magic, swapped); 98 | return 0; 99 | } 100 | if (header.version_major != PCAP_VERSION_MAJOR || header.version_minor != PCAP_VERSION_MINOR) { 101 | snprintf(errbuf, 
PCAP_ERRBUF_SIZE, "Cannot read pcap version %d.%d", header.version_major, 102 | header.version_minor); 103 | return 0; 104 | } 105 | 106 | pcap_t* ret = (pcap_t*)calloc(1, sizeof(pcap_t)); 107 | if (ret == 0) { 108 | snprintf(errbuf, PCAP_ERRBUF_SIZE, "Cannot calloc %lu bytes", sizeof(pcap_t)); 109 | return 0; 110 | } 111 | ret->pktbuf = (uint8_t*)malloc(header.snaplen); 112 | if (ret->pktbuf == 0) { // did we get the snaplen? 113 | std::cerr << "Couldn't get header snaplen"; 114 | free(ret); 115 | return 0; 116 | } 117 | // DEBUG(100) ("pcap_fake.cpp DEBUG: header.magic = %x", header.magic); 118 | // DEBUG(100) ("pcap_fake.cpp DEBUG: header.version_major = %d", header.version_major); 119 | // DEBUG(100) ("pcap_fake.cpp DEBUG: header.version_minor = %d", header.version_minor); 120 | // DEBUG(100) ("pcap_fake.cpp DEBUG: header.thiszone = %d", header.thiszone); 121 | // DEBUG(100) ("pcap_fake.cpp DEBUG: header.sigfigs = %d", header.sigfigs); 122 | // DEBUG(100) ("pcap_fake.cpp DEBUG: header.snaplen = %d", header.snaplen); 123 | // DEBUG(100) ("pcap_fake.cpp DEBUG: header.linktype = %d",header.linktype); 124 | // DEBUG(100) ("pcap_fake.cpp DEBUG: ret->pktbuf = %s". 
ret->pktbuf); 125 | ret->fp = fp; 126 | ret->swapped = swapped; 127 | ret->linktype = header.linktype; 128 | return ret; 129 | } 130 | 131 | /* 132 | * These are not implemented in pcap_fake 133 | */ 134 | 135 | int pcap_compile(pcap_t* p, struct bpf_program* program, const char* expression, int optimize, uint32_t mask) { 136 | if (strlen(expression) == 0) { 137 | program->valid = true; 138 | return 0; // we can compile the empty expression 139 | } 140 | return -1; // we cannot compile otherwise 141 | } 142 | 143 | int pcap_datalink(pcap_t* p) { return p->linktype; } 144 | 145 | int pcap_setfilter(pcap_t* p, struct bpf_program* prog) { 146 | if (prog->valid) return 0; 147 | return -1; 148 | } 149 | 150 | int pcap_loop(pcap_t* p, int cnt, pcap_handler callback, uint8_t* user) { 151 | while (cnt != 0 && !feof(p->fp) && p->break_loop == false) { 152 | uint32_t tv_sec; 153 | uint32_t tv_usec; 154 | 155 | struct pcap_pkthdr hdr; 156 | 157 | /* Note: struct timeval is 16 bytes on MacOS and not 8 bytes, 158 | * so we manually read and set up the structure 159 | */ 160 | if (fread(&tv_sec, sizeof(uint32_t), 1, p->fp) != 1) break; 161 | if (fread(&tv_usec, sizeof(uint32_t), 1, p->fp) != 1) break; 162 | hdr.ts.tv_sec = tv_sec; 163 | hdr.ts.tv_usec = tv_usec; 164 | 165 | if (fread(&hdr.caplen, sizeof(uint32_t), 1, p->fp) != 1) break; 166 | if (fread(&hdr.len, sizeof(uint32_t), 1, p->fp) != 1) break; 167 | 168 | /* Swap the header if necessary */ 169 | if (p->swapped) { 170 | hdr.ts.tv_sec = swap4(hdr.ts.tv_sec); 171 | hdr.ts.tv_usec = swap4(hdr.ts.tv_usec); 172 | hdr.caplen = swap4(hdr.caplen); 173 | hdr.len = swap4(hdr.len); 174 | } 175 | 176 | /* Read the packet */ 177 | if (fread(p->pktbuf, hdr.caplen, 1, p->fp) != 1) break; // no more to read 178 | 179 | // DEBUG(100) ("pcap_fake: read tv_sec.tv_usec=%d.%06d caplen=%d len=%d", 180 | // (int)hdr.ts.tv_sec,(int)hdr.ts.tv_usec,hdr.caplen,hdr.len); 181 | 182 | /* Process the packet */ 183 | (*callback)(user, &hdr, p->pktbuf); 
/*
 * Sample usage:
 *   struct mycounter_t { int a; int b; int c; };
 *   atomic_map<std::string, mycounter_t> counters;
 *   counters["key"].a++;
 * Behaves like a Python defaultdict, and automatically cleans up memory.
 * Could be reimplemented to use smart pointers.
 */

template <class T1, class T2> class atomic_map {
    // T1 - key. For example, std::string
    // T2 - value type. Values are allocated with new and stored as T2*;
    //      the map deletes them in clear() / the destructor.
    // Mutex M protects mymap.
    // It is mutable to allow locking in const methods.
    mutable std::mutex M{};
    std::map<T1, T2*> mymap{};

public:
    atomic_map() {}
    // The map owns raw pointers, so copying would lead to double deletes.
    atomic_map(const atomic_map&) = delete;
    atomic_map& operator=(const atomic_map&) = delete;
    ~atomic_map() {
        /* delete everything in the map. Could do this with a unique_ptr? */
        clear();
    }
    class KeyError : public std::exception {
        T1 key;
    public:
        KeyError(T1 key_) : key(key_) {}
        const char* what() const noexcept override { return "did not convert key_ to a string"; }
    };
    /*
     * Create the behavior of a Python defaultdict:
     * If the object is not in the map, add a value-initialized T2.
     * Then return a reference to the object that is in the map.
     */
    T2& operator[](const T1& key) {
        const std::lock_guard<std::mutex> lock(M);
        auto it = mymap.find(key);
        if (it == mymap.end()) {
            T2* fresh = new T2();
            try {
                it = mymap.emplace(key, fresh).first;
            } catch (...) {
                delete fresh; // the map never took ownership
                throw;
            }
        }
        return *(it->second);
    }
    /*
     * Get behavior throws a KeyError if not present, and is const.
     */
    T2& get(const T1& key) const {
        const std::lock_guard<std::mutex> lock(M);
        auto it = mymap.find(key);
        if (it == mymap.end()) {
            throw KeyError(key);
        }
        return *(it->second);
    }
    /*
     * insert. We want this in some cases. Throws KeyError if the key already
     * exists; in that case the caller still owns `value`.
     */
    void insert(const T1& key, T2* value) {
        const std::lock_guard<std::mutex> lock(M);
        if (mymap.find(key) != mymap.end()) {
            throw KeyError(key);
        }
        mymap[key] = value;
    }

    /* We can't just pass-through to find, because we need to lock the mutex.
     * The lock is released before the iterator is returned, so the iterator is
     * only meaningful while no other thread modifies the map.
     */
    typename std::map<T1, T2*>::const_iterator find(const T1& key) const {
        const std::lock_guard<std::mutex> lock(M);
        return mymap.find(key);
    }
    /* We can't allow iteration through the map, since that would not be threadsafe, but we can allow the caller to get end(). */
#if 0
    typename std::map<T1, T2*>::const_iterator begin() const {
        const std::lock_guard<std::mutex> lock(M);
        return mymap.begin();
    }
#endif
    typename std::map<T1, T2*>::const_iterator end() const {
        const std::lock_guard<std::mutex> lock(M);
        return mymap.end();
    }

    void clear() {
        /* First delete all of the elements, then clear the map. This
         * might be better done with unique_ptr(). However, then we
         * couldn't return a pointer, so we would need to use
         * shared_ptr(), which would incur a higher cost.
         *
         * M is held like in every other member, so a concurrent
         * operator[] / items() never observes a half-deleted map.
         */
        const std::lock_guard<std::mutex> lock(M);
        for (const auto& it : mymap) {
            delete it.second;
        }
        mymap.clear();
    }
    /* Number of elements */
    size_t size() const {
        const std::lock_guard<std::mutex> lock(M);
        return mymap.size();
    }
    /* bytes: sizeof(*this) plus, per entry, the key and pointer sizes,
     * key.size() and value->bytes(). Requires T1::size() and T2::bytes().
     */
    size_t bytes() const {
        const std::lock_guard<std::mutex> lock(M);
        size_t count = sizeof(*this);
        for (const auto& it : mymap) {
            count += sizeof(it.first) + sizeof(it.second) + it.first.size() + it.second->bytes();
        }
        return count;
    }

    bool contains(const T1& key) const {
        const std::lock_guard<std::mutex> lock(M);
        return mymap.find(key) != mymap.end();
    }
    /* Like python .keys() */
    std::vector<T1> keys() const {
        const std::lock_guard<std::mutex> lock(M);
        std::vector<T1> ret;
        ret.reserve(mymap.size());
        for (const auto& it : mymap) {
            ret.push_back(it.first);
        }
        return ret;
    }

    /* This is only threadsafe if the it.second is an object, and not a pointer */
    /* Like python .values(). It should actually return objects; today it
     * returns the stored pointers, which stay owned by the map.
     */
    std::vector<T2*> values() const {
        const std::lock_guard<std::mutex> lock(M);
        std::vector<T2*> ret;
        ret.reserve(mymap.size());
        for (const auto& it : mymap) {
            ret.push_back(it.second);
        }
        return ret;
    }

    /* like Python .items() */
    /* This is used for dumping the contents in a mostly threadsafe manner.
     * The item that is returned points at what's in the atomic_map, so it better not be deleted,
     * and if you want multiple threads to access it, the elements should be atomic.
     * There is no reference counting on the pointer, so be careful!
     * It would be useful to have a priority queue to get the topN.
     */
    struct item {
        item(const item& s) : key(s.key), value(s.value) {}
        item(item&& that) noexcept : key(that.key), value(that.value) {}
        item& operator=(const item& s) {
            this->key = s.key;
            this->value = s.value;
            return *this;
        }

        item(T1 key_, T2* value_) : key(key_), value(value_) {}
        T1 key{};    // copy of the key in the map
        T2* value{}; // a pointer to the map's object (not owned by the item)
        // these comparisons only look at the keys
        bool operator==(const item& a) const { return (this->key == a.key); }
        bool operator!=(const item& a) const { return !(*this == a); }
        bool operator<(const item& a) const { return this->key < a.key; }
        static bool compare(const item& e1, const item& e2) { return e1 < e2; }
        virtual ~item() {}
        size_t bytes() const {
            return sizeof(*this) + value->bytes();
        } // number of bytes used by object
    };

    std::vector<item> items() const {
        std::vector<item> ret;
        /* Protect access to mymap with mutex */
        const std::lock_guard<std::mutex> lock(M);
        ret.reserve(mymap.size());
        for (const auto& pair : mymap) {
            ret.push_back(item(pair.first, pair.second));
        }
        return ret;
    }
    /* Write one " key: value" line per entry.
     * NOTE(review): this streams the stored pointer, not the pointed-to
     * object — confirm whether `*it.second` was intended.
     */
    void write(std::ostream& os) const {
        const std::lock_guard<std::mutex> lock(M);
        for (const auto& it : mymap) {
            os << " " << it.first << ": " << (it.second) << "\n";
        }
    }
};
/*
 * set_from_string: parse the text `v` and store the result through `ret`.
 * One overload per supported target type.
 *
 * The numeric overloads use std::stoi / std::stoul / std::stoull, so they
 * throw std::invalid_argument when no number can be parsed and
 * std::out_of_range when the parsed value does not fit the parsing type.
 * The unsigned int and uint8_t overloads additionally throw
 * std::out_of_range when the parsed value does not fit the target type,
 * instead of silently truncating it (e.g. "300" into a uint8_t).
 * On a throw, *ret is left unchanged.
 */
inline void set_from_string(int* ret, const std::string& v) { *ret = std::stoi(v); }

inline void set_from_string(unsigned int* ret, const std::string& v) {
    const unsigned long parsed = std::stoul(v);
    const unsigned int narrowed = static_cast<unsigned int>(parsed);
    if (narrowed != parsed) { // lost bits: unsigned long is wider than unsigned int here
        throw std::out_of_range("set_from_string: value does not fit in unsigned int: " + v);
    }
    *ret = narrowed;
}

inline void set_from_string(uint64_t* ret, const std::string& v) { *ret = std::stoull(v); }

inline void set_from_string(uint8_t* ret, const std::string& v) {
    const unsigned long parsed = std::stoul(v);
    if (parsed > UINT8_MAX) {
        throw std::out_of_range("set_from_string: value does not fit in uint8_t: " + v);
    }
    *ret = static_cast<uint8_t>(parsed);
}

inline void set_from_string(std::string* ret, const std::string& v) { *ret = v; }

/* A bool is true when the text starts with Y, y, T, t or 1; anything else
 * (including the empty string) is false.
 */
inline void set_from_string(bool* ret, const std::string& v) {
    if (v.empty()) {
        *ret = false;
        return;
    }
    const char first = v[0];
    *ret = (first == 'Y' || first == 'y' || first == 'T' || first == 't' || first == '1');
}
#ifndef HAVE_ISHEXNUMBER
/* Stand-in used when HAVE_ISHEXNUMBER is not defined:
 * returns 1 for '0'-'9', 'A'-'F' and 'a'-'f', and 0 for everything else.
 */
inline int ishexnumber(int c) {
    const bool is_digit = (c >= '0' && c <= '9');
    const bool is_upper = (c >= 'A' && c <= 'F');
    const bool is_lower = (c >= 'a' && c <= 'f');
    return (is_digit || is_upper || is_lower) ? 1 : 0;
}
#endif
/*
 * Format a time_t as an ISO 8601 UTC timestamp, "YYYY-MM-DDTHH:MM:SSZ".
 *
 * The struct tm is zeroed before gmtime_r() is called. A successful conversion
 * always sets tm_mday to 1..31, so a tm_mday of 0 afterwards means gmtime_r()
 * did not produce a date (e.g. the year does not fit); in that case, and when
 * strftime() reports failure, an empty string is returned instead of
 * formatting indeterminate data.
 */
inline std::string iso8601_utc_from_time_t(time_t t) {
    struct tm time_tm {};
    gmtime_r(&t, &time_tm);
    if (time_tm.tm_mday == 0) return std::string();
    char buf[256];
    if (strftime(buf, sizeof(buf), "%Y-%m-%dT%H:%M:%SZ", &time_tm) == 0) return std::string(); // Zulu time
    return std::string(buf);
}

/*
 * Convert a Microsoft timestamp (100-nanosecond ticks counted from the Win32
 * epoch) to an ISO 8601 UTC string.
 */
inline std::string microsoftDateToISODate(const uint64_t& time) {
    // Same values as ONE_HUNDRED_NANO_SEC_TO_SECONDS and
    // SECONDS_BETWEEN_WIN32_EPOCH_AND_UNIX_EPOCH, typed so the subtraction is
    // done in signed arithmetic (times before 1970 become negative).
    constexpr uint64_t ticks_per_second = 10000000ULL;
    constexpr int64_t win32_to_unix_epoch_seconds = 11644473600LL;
    const int64_t unix_seconds = static_cast<int64_t>(time / ticks_per_second) - win32_to_unix_epoch_seconds;
    return iso8601_utc_from_time_t(static_cast<time_t>(unix_seconds));
}

/* Convert Unix timestamp to ISO format */
inline std::string unixTimeToISODate(const uint64_t t) {
    return iso8601_utc_from_time_t(static_cast<time_t>(t));
}

/* Many internal windows and Linux structures require a valid printable name in ASCII.
 * Returns true when every byte of `name` is in the range 0x20..0x7e; an empty name passes.
 */
inline bool validASCIIName(const std::string& name) {
    for (const unsigned char ch : name) {
        if (ch < 0x20) return false;  // control character
        if (ch >= 0x7f) return false; // DEL, or high bit set
    }
    return true;
}
char* f, struct tm* tm) { 197 | // Isn't the C++ standard lib nice? std::get_time is defined such that its 198 | // format parameters are the exact same as strptime. Of course, we have to 199 | // create a string stream first, and imbue it with the current C locale, and 200 | // we also have to make sure we return the right things if it fails, or 201 | // if it succeeds, but this is still far simpler an implementation than any 202 | // of the versions in any of the C standard libraries. 203 | std::istringstream input(s); 204 | input.imbue(std::locale(setlocale(LC_ALL, nullptr))); 205 | input >> std::get_time(tm, f); 206 | if (input.fail()) { 207 | return nullptr; 208 | } 209 | return (char*)(s + input.tellg()); 210 | } 211 | #endif 212 | 213 | 214 | 215 | #endif 216 | -------------------------------------------------------------------------------- /threadpool.cpp: -------------------------------------------------------------------------------- 1 | #include "config.h" 2 | #include "threadpool.h" 3 | #include "scanner_set.h" 4 | 5 | thread_pool::thread_pool(scanner_set &ss_): ss(ss_) 6 | { 7 | } 8 | 9 | void thread_pool::launch_workers(size_t num_workers) 10 | { 11 | for (size_t i=0; i < num_workers; i++){ 12 | std::unique_lock lock(M); 13 | class worker *w = new worker(*this,i); 14 | workers.insert(w); 15 | threads.insert(new std::thread( &worker::start_worker, static_cast(w) )); 16 | } 17 | } 18 | 19 | 20 | thread_pool::~thread_pool() 21 | { 22 | /* We previously sent the termination message to all of the sub-threads here. 23 | * However, their terminating caused wacky problems with the malloc library. 24 | * So we just leave them floating around now. Doesn't matter much, because 25 | * the main process will die soon enough. 
26 | */ 27 | for (auto &it : threads ){ 28 | it->join(); 29 | delete it; 30 | } 31 | } 32 | 33 | /* 34 | * Wait until there are no tasks and none of the threads are running 35 | */ 36 | void thread_pool::wait_for_tasks() 37 | { 38 | if(debug) std::cerr << "thread_pool::wait_for_tasks work_queue.size()=" << work_queue.size() << std::endl; 39 | std::unique_lock lock(M); 40 | if(debug) std::cerr << "thread_pool::wait_for_tasks got lock work_queue.size()=" << work_queue.size() << " working_workers=" << working_workers << std::endl; 41 | // wait until a thread is free (doesn't matter which) 42 | while (work_queue.size() > 0 || working_workers>0){ 43 | if(debug) std::cerr << "thread_pool::wait_for_tasks work_queue.size()==" << work_queue.size() << " working_workers=" << working_workers << std::endl; 44 | TO_WORKER.notify_one(); // wake up a worker in case one is sleeping 45 | TO_MAIN.wait( lock ); // wait for a message from a worker 46 | } 47 | if(debug) std::cerr << "thread_pool::wait_for_tasks done. work_queue.size()=" << work_queue.size() << " working_workers=" << working_workers << std::endl; 48 | }; 49 | 50 | 51 | void thread_pool::join() 52 | { 53 | wait_for_tasks(); /* Wait until there are no messages in the work queue */ 54 | /* Next, send a kill message to each active thread. */ 55 | size_t num_threads = get_worker_count(); // get the count with lock 56 | for(size_t i=0;i < num_threads;i++){ 57 | if (debug) std::cerr << "thread_pool::join: pushing null task #" << i << std::endl; 58 | push_task(nullptr); // tell a thread to die 59 | } 60 | 61 | // This is a spin lock until there are no more workers. Gross, but it works. 
62 | while (get_worker_count()>0){ 63 | std::this_thread::sleep_for( std::chrono::milliseconds( shutdown_spin_lock_poll_ms )); 64 | if (debug) { 65 | debug_pool(std::cerr); 66 | } 67 | } 68 | } 69 | 70 | void thread_pool::main_thread_wait() 71 | { 72 | std::unique_lock lock(M); 73 | main_wait_timer.start(); 74 | //TO_WORKER.notify_one(); // if a worker is sleeping, wake it up 75 | TO_MAIN.wait( lock ); 76 | main_wait_timer.stop(); 77 | } 78 | 79 | 80 | /* 81 | * This may be called from any thread. 82 | * Right now it only works if called by main thread. 83 | */ 84 | 85 | void thread_pool::push_task(const sbuf_t *sbuf, scanner_t *scanner) 86 | { 87 | if (debug) { 88 | std::cerr << "thread_pool::push_task( "; 89 | if (sbuf) { 90 | std::cerr << *sbuf; 91 | } else { 92 | std::cerr << "nullptr"; 93 | } 94 | std::cerr << " , scanner=" << scanner << ") "; 95 | } 96 | std::unique_lock lock(M); 97 | /* In the main thread, make sure there is a free worker before continuing. 98 | * We don't do this in the worker threads because we want them to clear. 99 | */ 100 | if (main_thread == std::this_thread::get_id() && scanner==nullptr) { 101 | while (freethreads==0){ // if there are no free threads, wait. 102 | main_wait_timer.start(); 103 | //TO_WORKER.notify_one(); // if a worker is sleeping, wake it up 104 | TO_MAIN.wait( lock ); 105 | main_wait_timer.stop(); 106 | } 107 | } 108 | 109 | /* Add to the count */ 110 | work_queue.push( new work_unit(sbuf, scanner) ); 111 | if (debug) std::cerr << "added work unit to queue. 
size=" << work_queue.size() << std::endl; 112 | TO_WORKER.notify_one(); 113 | }; 114 | 115 | 116 | void thread_pool::push_task(const sbuf_t *sbuf) 117 | { 118 | push_task(sbuf, nullptr); 119 | } 120 | 121 | 122 | int thread_pool::get_free_count() const 123 | { 124 | std::lock_guard lock(M); 125 | return freethreads; 126 | }; 127 | 128 | size_t thread_pool::get_worker_count() const 129 | { 130 | std::lock_guard lock(M); 131 | return workers.size(); 132 | } 133 | 134 | size_t thread_pool::get_tasks_queued() const 135 | { 136 | std::lock_guard lock(M); 137 | return work_queue.size(); 138 | } 139 | 140 | 141 | void thread_pool::debug_pool(std::ostream &os) const 142 | { 143 | os << " worker_count: " << get_worker_count() 144 | << " free_count: " << get_free_count() 145 | << " tasks_queued: " << get_tasks_queued() 146 | << std::endl; 147 | } 148 | 149 | /* Launch the worker. It's kept on the per-thread stack. When it is done, delete it. 150 | */ 151 | void * worker::start_worker(void *arg) 152 | { 153 | worker *w = static_cast(arg); 154 | auto ret = w->run(); 155 | delete w; 156 | return ret; 157 | } 158 | 159 | 160 | /* Run the worker. 161 | * Note that we used to throw internal errors, but this caused problems with some versions of GCC. 162 | * Now we simply return when there is an error. 163 | */ 164 | void *worker::run() 165 | { 166 | if (tp.debug) std::cerr << "worker " << std::this_thread::get_id() << " starting " << std::endl; 167 | tp.freethreads++; // this thread is free 168 | while(true){ 169 | /* Get the lock, then wait for the queue to be empty. 170 | * If it is not empty, wait for the lock again. 
171 | */ 172 | thread_pool::work_unit wu; 173 | { 174 | std::unique_lock lock( tp.M ); 175 | if (tp.debug) std::cerr << "worker " << std::this_thread::get_id() << " has lock " << std::endl; 176 | worker_wait_timer.start(); // waiting for work 177 | while ( tp.work_queue.size()==0 ){ // wait until something is in the task queue 178 | if (tp.debug) std::cerr << "worker " << std::this_thread::get_id() << " waiting " << std::endl; 179 | /* I didn't get any work; go to sleep */ 180 | //std::cerr << std::this_thread::get_id() << " #1 tp.tasks.size()=" << tp.tasks.size() << std::endl; 181 | tp.ss.thread_set_status("waiting"); 182 | tp.TO_MAIN.notify_one(); // if main is sleeping, wake it up 183 | tp.TO_WORKER.wait( lock ); 184 | //std::cerr << std::this_thread::get_id() << " #2 tp.tasks.size()=" << tp.tasks.size() << std::endl; 185 | } 186 | worker_wait_timer.stop(); // no longer waiting 187 | tp.ss.thread_set_status("working"); 188 | 189 | /* Worker still has the lock */ 190 | thread_pool::work_unit *wup = tp.work_queue.front(); // get the task 191 | tp.work_queue.pop(); // remove it 192 | wu = *wup; 193 | delete wup; 194 | tp.freethreads--; // no longer free 195 | tp.working_workers++; // a worker is working 196 | } 197 | if (wu.sbuf==nullptr) { // special code to exit thread 198 | //tp.TO_MAIN.notify_one(); // tell the master that one is gone 199 | if (tp.debug) std::cerr << std::this_thread::get_id() << "got wu.sbuf=nullptr" << std::endl; 200 | break; 201 | } 202 | /* dispatch the work unit. 203 | * if wu.scanner is not set, process_sbuf will run all scanners in sequence, or schedule each. 204 | * if wu.scanner is set, process_sbuf will just run that one scanner. 
205 | */ 206 | if (wu.scanner) { 207 | tp.ss.process_sbuf( wu.sbuf, wu.scanner); 208 | } 209 | else { 210 | tp.ss.process_sbuf( wu.sbuf); 211 | } 212 | tp.ss.release_sbuf(wu.sbuf); 213 | tp.working_workers--; 214 | { 215 | std::unique_lock lock( tp.M ); 216 | tp.freethreads++; // and now the thread is free again! 217 | tp.TO_MAIN.notify_one(); // tell the master that we are free! 218 | } 219 | } 220 | tp.ss.thread_set_status("exiting"); 221 | if (tp.debug) std::cerr << std::this_thread::get_id() << " exiting "<< std::endl; 222 | { 223 | std::unique_lock lock(tp.M); 224 | tp.workers.erase(this); 225 | tp.working_workers--; // a worker is working 226 | } 227 | tp.total_worker_wait_ns += worker_wait_timer.running_nanoseconds(); 228 | tp.ss.thread_set_status("exited"); 229 | return nullptr; 230 | } 231 | -------------------------------------------------------------------------------- /feature_recorder_set.h: -------------------------------------------------------------------------------- 1 | /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ 2 | #ifndef FEATURE_RECORDER_SET_H 3 | #define FEATURE_RECORDER_SET_H 4 | 5 | #include 6 | #include 7 | 8 | #if defined(HAVE_SQLITE3_H) 9 | #include 10 | #endif 11 | 12 | #include "atomic_map.h" 13 | #include "atomic_set.h" 14 | #include "feature_recorder.h" 15 | #include "sbuf.h" 16 | #include "scanner_config.h" 17 | 18 | /** \addtogroup internal_interfaces 19 | * @{ 20 | */ 21 | /** \file */ 22 | 23 | /** 24 | * \class feature_recorder_set 25 | * The feature_recorder_set is an object that controls output. It knows where the output goes (outdir), 26 | * the various feature recorders that write to that output, and provides for synchronization. 27 | * It also has the factory method for new feature_recorders. Therefore if you want a different feature_recorder, 28 | * this set should be subclassed as well. 29 | * 30 | * NOTE: plugins can only call virtual functions! 
31 | * 32 | */ 33 | 34 | /* Define a map of feature recorders with atomic access. */ 35 | /* TODO: This should probably be a unique_ptr */ 36 | typedef atomic_map feature_recorder_map_t; 37 | inline std::ostream& operator<<(std::ostream& os, const feature_recorder_map_t& m) { 38 | m.write(os); 39 | return os; 40 | } 41 | 42 | class word_and_context_list; 43 | class feature_recorder_set { 44 | private: 45 | // neither copying nor assignment is implemented 46 | feature_recorder_set(const feature_recorder_set& fs) = delete; 47 | feature_recorder_set& operator=(const feature_recorder_set& fs) = delete; 48 | 49 | friend class feature_recorder; 50 | 51 | //const std::string input_fname{}; // input file; copy for convenience. 52 | //const std::string outdir{}; // where output goes; must know. 53 | 54 | /* map of feature recorders, name->feature recorder It is 55 | * read-write when BE is running single-threaded. After we go into 56 | * multi-threaded mode, it is read-only. 57 | */ 58 | feature_recorder_map_t frm{}; 59 | bool frm_frozen {false}; // once the frm is frozen, it is read-only. 60 | feature_recorder* stop_list_recorder{nullptr}; // where stopped features get written (if there is one) 61 | #if defined(HAVE_SQLITE3_H) and defined(HAVE_LIBSQLITE3) 62 | /* If we are compiled with SQLite3, this is the handle to the open database */ 63 | sqlite3* db3{}; 64 | #endif 65 | 66 | public: 67 | void frm_freeze() { assert(frm_frozen==false); frm_frozen=true;} 68 | size_t feature_recorder_count() const { return frm.size(); } 69 | /* Flags for feature recorders. This used to be a bitmask, but Stroustrup (2013) recommends just having 70 | * a bunch of bools. 71 | */ 72 | struct flags_t { 73 | bool disabled{false}; // do not record anything! 
This is is just used for a path-printer 74 | bool pedantic{false}; // make sure that all features written are valid utf-8 75 | bool no_alert{false}; // no alert recorder 76 | bool only_alert{false}; // always return the alert recorder 77 | bool create_stop_list_recorders{false}; // static const uint32_t CREATE_STOP_LIST_RECORDERS= 0x04; // 78 | bool debug{false}; // enable debug printing 79 | bool record_files{true}; // record to files 80 | bool record_sql{false}; // record to SQL 81 | } flags; 82 | 83 | static flags_t flags_disabled() { // return a frs that is disabled 84 | flags_t f; 85 | f.disabled = true; 86 | return f; 87 | } 88 | 89 | /** Constructor: 90 | * create an emptry feature recorder set. If disabled, create a disabled recorder. 91 | * @param flags_ = config flags 92 | * @param hash_algorithm - which algorithm to use for de-duplication 93 | * @param input_fname_ = where input comes from 94 | * @param outdir_ = output directory (passed to feature recorders). "" if disabled. 95 | * This clearly needs work. 96 | */ 97 | feature_recorder_set(const flags_t& flags_, const scanner_config& sc); 98 | virtual ~feature_recorder_set(); 99 | 100 | /* Configuration. This is a copy; it should be a reference, but that caused an AddressSanitizer error. */ 101 | const scanner_config sc; 102 | 103 | /* Read-only functions for the scanner-config file management variables */ 104 | virtual std::filesystem::path get_input_fname() const { return sc.input_fname; } 105 | virtual std::filesystem::path get_outdir() const { return sc.outdir; } 106 | 107 | /* the feature recorder set automatically hashes all of the sbuf's that it processes. 
*/ 108 | typedef std::string (*hash_func_t)(const uint8_t* buf, size_t bufsize); 109 | struct hash_def { 110 | hash_def(std::string name_, hash_func_t func_) : name(name_), func(func_){}; 111 | std::string name; // name of hash 112 | hash_func_t func; // hash function 113 | static std::string md5_hasher(const uint8_t* buf, size_t bufsize); 114 | static std::string sha1_hasher(const uint8_t* buf, size_t bufsize); 115 | static std::string sha256_hasher(const uint8_t* buf, size_t bufsize); 116 | static hash_func_t hash_func_for_name(const std::string& name); 117 | }; 118 | 119 | const word_and_context_list* alert_list{}; /* shold be flagged */ 120 | const word_and_context_list* stop_list{}; /* should be ignored */ 121 | 122 | /** hashing system */ 123 | const hash_def hasher; // name and function that perform hashing; set by allocator 124 | 125 | static const std::string ALERT_RECORDER_NAME; // the name of the alert recorder 126 | // static const std::string DISABLED_RECORDER_NAME; // the fake disabled feature recorder 127 | 128 | void set_stop_list(const word_and_context_list* alist) { stop_list = alist; } 129 | void set_alert_list(const word_and_context_list* alist) { alert_list = alist; } 130 | 131 | /** Initialize a feature_recorder_set. Previously this was a constructor, but it turns out that 132 | * virtual functions for the create_name_factory aren't honored in constructors. 133 | * 134 | * init() is called after all of the scanners have been loaded. 
It 135 | * tells each feature file about its histograms (among other things) 136 | */ 137 | 138 | /* feature_recorder_set flags */ 139 | /* Flags are now implemented as booleans per stroustrup 2013 */ 140 | 141 | int64_t offset_add{0}; // added to every reported offset, for use with hadoop 142 | std::string banner_filename{}; // banner for top of every file 143 | 144 | /* histogram support */ 145 | void histogram_add(const histogram_def& def); // adds it to a local set or to the specific feature recorder 146 | size_t histogram_count() const; // counts histograms in all feature recorders 147 | 148 | void set_carve_defaults(); 149 | 150 | // called when scanner_set shuts down: 151 | void feature_recorders_shutdown(); 152 | void histograms_generate(); // make the histograms in the output directory (and optionally in the database) 153 | 154 | //typedef void (*xml_notifier_t)(const std::string &xmlstring); 155 | 156 | /* support for creating and finding feature recorders 157 | * Previously called create_name(). 158 | * functions must be virtual so they can be called by plug-in. 
159 | * All return a reference to the named (or created) feature recorder, or else throw exception indicated 160 | */ 161 | class NoSuchFeatureRecorder : public std::exception { 162 | std::string m_error{}; 163 | 164 | public: 165 | NoSuchFeatureRecorder(std::string_view error) : m_error(error) {} 166 | const char* what() const noexcept override { return m_error.c_str(); } 167 | }; 168 | 169 | class FeatureRecorderAlreadyExists : public std::exception { 170 | std::string m_error{}; 171 | 172 | public: 173 | FeatureRecorderAlreadyExists(std::string_view error) : m_error(error) {} 174 | const char* what() const noexcept override { return m_error.c_str(); } 175 | }; 176 | 177 | class FeatureRecorderNullName : public std::exception { 178 | public: 179 | FeatureRecorderNullName() {} 180 | const char* what() const noexcept override { return "FeatureRecorderNullName"; } 181 | }; 182 | 183 | /* create a feature recorder, and return it as well */ 184 | virtual void create_alert_recorder(); 185 | virtual feature_recorder& create_feature_recorder(feature_recorder_def def); // create a feature recorder 186 | virtual feature_recorder& create_feature_recorder(std::string name); // convenience function 187 | 188 | // Just return it 189 | virtual feature_recorder& named_feature_recorder(const std::string name) const; // returns the named feature recorder 190 | virtual feature_recorder& get_alert_recorder() const; // returns the alert recorder 191 | virtual std::vector feature_file_list() const; // returns a list of feature file names 192 | 193 | void dump_name_count_stats(class dfxml_writer& writer) const; // dumps the standard dfxml 194 | 195 | void info_feature_recorders( std::ostream &os) const; 196 | 197 | /**************************************************************** 198 | *** DB interface 199 | ****************************************************************/ 200 | 201 | #if 0 202 | #if defined(HAVE_SQLITE3_H) and defined(HAVE_LIBSQLITE3) 203 | virtual void 
db_send_sql(sqlite3 *db3,const char **stmts, ...) ; 204 | virtual sqlite3 *db_create_empty(const std::string &name) ; 205 | void db_create_table(const std::string &name) ; 206 | void db_create() ; 207 | void db_transaction_begin() ; 208 | void db_transaction_commit() ; // commit current transaction 209 | void db_close() ; // 210 | #endif 211 | #endif 212 | /**************************************************************** 213 | *** External Functions 214 | ****************************************************************/ 215 | }; 216 | 217 | #endif 218 | --------------------------------------------------------------------------------