├── COPYING
├── etc
    ├── .gitignore
    ├── coverage_report.sh
    ├── CONFIGURE_WINDOWS_LATEST.bash
    └── install_autotools.sh
├── tests
    ├── .gitignore
    ├── README.md
    ├── random.dat
    ├── unilang.htm
    ├── run_each.sh
    ├── regex_demo.cpp
    └── unilang8.htm
├── demos
    ├── .gitignore
    ├── README.md
    ├── thread_demo.cpp
    └── regex_demo.cpp
├── AUTHORS
├── NEWS
├── make_debug
├── INSTALL
├── thread-pool
    ├── 2105.00613.pdf
    └── LICENSE.md
├── abstract_image_reader.cpp
├── scan_sha1_test.h
├── pos0.cpp
├── .gitmodules
├── codecov.yml
├── bootstrap.sh
├── doc
    └── unit-tests.txt
├── .make-codecov
├── abstract_image_reader.h
├── test_image_reader.h
├── test_be20_api_malloc_debug
├── scanner_config.cpp
├── feature_recorder_mhist.h.broken
├── test_image_reader.cpp
├── .gitignore
├── formatter.h
├── .clang-format
├── ChangeLog
├── char_class.h
├── README_WIN.md
├── m4
    ├── slg_noopt.m4
    ├── slg_address_sanitizer.m4
    └── slg_gcc_all_warnings.m4
├── feature_recorder_sql.h
├── CODING_STANDARDS.txt
├── Makefile.am
├── .github
    └── workflows
    │   ├── build-windows.yml
    │   └── build-ubuntu-macos.yml
├── test_be20_threadpool.cpp
├── sbuf_stream.h
├── configure.ac
├── scanner_params.cpp
├── be20_configure.m4
├── path_printer.h
├── scan_sha1_test.cpp
├── atomic_set.h
├── histogram_def.cpp
├── Makefile.defs
├── pcap_fake.h
├── regex_vector.h
├── LICENSE.md
├── net_ethernet.h
├── feature_recorder_file.h
├── TODO.md
├── regex_vector.cpp
├── machine_stats.h
├── utils.cpp
├── threadpool.h
├── atomic_unicode_histogram.h
├── unicode_escape.h
├── word_and_context_list.cpp
├── README.md
├── word_and_context_list.h
├── feature_recorder_mhist.cpp.broken
├── sbuf_stream.cpp
├── aftimer.h
├── histogram_def.h
├── scanner_config.h
├── atomic_unicode_histogram.cpp
├── pos0.h
├── feature_recorder_sql.cpp
├── pcap_fake.cpp
├── atomic_map.h
├── utils.h
├── threadpool.cpp
└── feature_recorder_set.h


/COPYING:
--------------------------------------------------------------------------------
1 | Go for it.
2 | 


--------------------------------------------------------------------------------
/etc/.gitignore:
--------------------------------------------------------------------------------
1 | *.secret
2 | 


--------------------------------------------------------------------------------
/tests/.gitignore:
--------------------------------------------------------------------------------
1 | regex_demo
2 | 


--------------------------------------------------------------------------------
/demos/.gitignore:
--------------------------------------------------------------------------------
1 | a.out
2 | a.out.dSYM
3 | 


--------------------------------------------------------------------------------
/AUTHORS:
--------------------------------------------------------------------------------
1 | Simson L. Garfinkel <simsong@acm.org>
2 | 


--------------------------------------------------------------------------------
/NEWS:
--------------------------------------------------------------------------------
1 | Sept 17, 2025 - Removed support for pcre and std::regex_match
2 | 


--------------------------------------------------------------------------------
/tests/README.md:
--------------------------------------------------------------------------------
1 | unilang from http://www.humancomp.org/unichtm/unichtm.htm
2 | 


--------------------------------------------------------------------------------
/tests/random.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simsong/be20_api/HEAD/tests/random.dat


--------------------------------------------------------------------------------
/tests/unilang.htm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simsong/be20_api/HEAD/tests/unilang.htm


--------------------------------------------------------------------------------
/make_debug:
--------------------------------------------------------------------------------
# Rebuild the test program with debug symbols and optimization turned off.
# `make clean` is allowed to fail (e.g. when no Makefile exists yet).
make clean
# Only build when configure succeeded; otherwise make would silently reuse a stale Makefile.
./configure CFLAGS="-g -O0" CXXFLAGS="-g -O0" && make test_be20_api
4 | 


--------------------------------------------------------------------------------
/INSTALL:
--------------------------------------------------------------------------------
1 | Typically you don't install this. It's meant to be a submodule for bulk_extractor and tcptrans
2 | 


--------------------------------------------------------------------------------
/thread-pool/2105.00613.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simsong/be20_api/HEAD/thread-pool/2105.00613.pdf


--------------------------------------------------------------------------------
/demos/README.md:
--------------------------------------------------------------------------------
1 | This directory contains small test and demo programs used by the author to learn the ins and outs of C++17
2 | 


--------------------------------------------------------------------------------
/abstract_image_reader.cpp:
--------------------------------------------------------------------------------
1 | #include "abstract_image_reader.h"
2 | 
3 | abstract_image_reader::~abstract_image_reader()
4 | {
5 | }
6 | 


--------------------------------------------------------------------------------
/scan_sha1_test.h:
--------------------------------------------------------------------------------
1 | #ifndef SCAN_SHA1_H
2 | #define SCAN_SHA1_H
3 | 
4 | #include "scanner_params.h"
5 | 
6 | scanner_t scan_sha1_test;
7 | #endif
8 | 


--------------------------------------------------------------------------------
/pos0.cpp:
--------------------------------------------------------------------------------
1 | #include "pos0.h"
2 | 
3 | /**
4 |  *  Definition of the static member pos0_t::map_file_delimiter, initialized from pos0_t::U10001C.
5 |  */
6 | std::string pos0_t::map_file_delimiter(pos0_t::U10001C);
7 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "dfxml_cpp"]
2 | 	path = dfxml_cpp
3 | 	url = https://github.com/dfxml-working-group/dfxml_cpp.git
4 | [submodule "utfcpp"]
5 | 	path = utfcpp
6 | 	url = https://github.com/nemtrif/utfcpp.git
7 | 


--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
 1 | coverage:
 2 |   range: 40..60
 3 |   round: nearest
 4 |   precision: 2
 5 | 
 6 | ignore:
 7 |   - "catch.hpp"
 8 |   - "utf8.h"
 9 |   - "utf8/*"
10 |   - "tests/regex_demo.cpp"
11 |   - "test_be20_api.cpp"
12 | 


--------------------------------------------------------------------------------
/tests/run_each.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # run each test once
 3 | TEST=test_be20_api
 4 | PATH=$PATH:.:..
 5 | tests=$($TEST -l | egrep -v 'All available|test cases|\[')
 6 | for test in $tests ; do
 7 |     echo ========== $test ===========
 8 |     echo '$' test_be $test
 9 |     $TEST $test
10 |     echo
11 |     echo
12 | done
13 | 


--------------------------------------------------------------------------------
/bootstrap.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | mkdir -p build-aux
 4 | 
 5 | # have automake do an initial population if necessary
 6 | autoheader -f
 7 | touch NEWS README AUTHORS ChangeLog
 8 | touch stamp-h
 9 | aclocal -I m4
10 | autoconf -f
11 | automake --add-missing --copy
12 | # bootstrap is complete
13 | echo
14 | echo The bootstrap.sh is complete.  Be sure to run ./configure.
15 | echo
16 | 


--------------------------------------------------------------------------------
/doc/unit-tests.txt:
--------------------------------------------------------------------------------
1 | the following unit test frameworks were considered in order:
2 | 
3 | 1. https://github.com/exoticlibraries/libcester
4 | 2. https://github.com/catchorg/Catch2/blob/master/docs/assertions.md#top
5 | 3. https://github.com/cpputest/cpputest
6 | 4. https://github.com/unittest-cpp/unittest-cpp
7 | 
8 | Currently we are using libcester due to excellent support from the author!
9 | 


--------------------------------------------------------------------------------
/.make-codecov:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # 2020-10-29 - slg - compile for codecov, run self-test, and upload results.
 4 | #
 5 | bash bootstrap.sh
 6 | ./configure CFLAGS='-g -O0 -fprofile-arcs -ftest-coverage' \
 7 |             CXXFLAGS='-g -O0 -fprofile-arcs -ftest-coverage' \
 8 |             LIBS='-lgcov'
 9 | make clean \
10 |      && make test_be20_api \
11 |      && ./test_be20_api \
12 |      && gcov-9 -n -o . *cpp \
13 |      && bash <(curl -s https://codecov.io/bash)
14 | make distclean
15 | 


--------------------------------------------------------------------------------
/abstract_image_reader.h:
--------------------------------------------------------------------------------
 1 | #ifndef ABSTRACT_IMAGE_READER
 2 | #define ABSTRACT_IMAGE_READER
 3 | 
 4 | #include <string>
 5 | #include <cstdint>
 6 | #include <filesystem>
 7 | 
 8 | class abstract_image_reader {
 9 | public:
10 |     abstract_image_reader() {};
11 |     virtual ssize_t pread(void *buf, size_t bufsize, uint64_t offset) const = 0;
12 |     virtual int64_t image_size() const=0;
13 |     virtual std::filesystem::path image_fname() const = 0;
14 |     virtual ~abstract_image_reader();
15 | };
16 | 
17 | #endif
18 | 


--------------------------------------------------------------------------------
/test_image_reader.h:
--------------------------------------------------------------------------------
 1 | #ifndef TEST_IMAGE_READER
 2 | #define TEST_IMAGE_READER
 3 | 
 4 | 
 5 | #include "abstract_image_reader.h"
 6 | 
 7 | class test_image_reader : public abstract_image_reader {
 8 | public:
 9 |     test_image_reader();
10 |     virtual ~test_image_reader();
11 |     virtual ssize_t pread(void *buf, size_t bufsize, uint64_t offset) const;
12 |     virtual int64_t image_size() const;
13 |     virtual std::filesystem::path image_fname() const { return std::filesystem::path("/"); }
14 | };
15 | 
16 | #endif
17 | 


--------------------------------------------------------------------------------
/test_be20_api_malloc_debug:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | # This is a shell script that rus the test with malloc debug enabled on MacOS.
 3 | # https://developer.apple.com/library/archive/documentation/Performance/Conceptual/ManagingMemory/Articles/MallocDebug.html
 4 | export MallocStackLogging=1
 5 | export MallocStackLoggingNoCompact=1
 6 | export MallocScribble=1
 7 | export MallocPreScribble=1
 8 | export MallocGuardEdges=1
 9 | ./test_be20_api $* || exit 1
10 | 
11 | export MallocCheckHeapStart=1000
12 | export MallocCheckHeapEach=100
13 | ./test_be20_api $* || exit 1
14 | 


--------------------------------------------------------------------------------
/scanner_config.cpp:
--------------------------------------------------------------------------------
 1 | #include "config.h"
 2 | 
 3 | #include <iostream>
 4 | #include <sstream>
 5 | 
 6 | #include "scanner_config.h"
 7 | 
 8 | /************************************
 9 |  *** HELP and  option processing  ***
10 |  ************************************/
11 | 
12 | //void scanner_config::set_config(const std::string& name, const std::string& val) { namevals[name] = val; }
13 | //void scanner_config::push_scanner_command(const std::string& scannerName, scanner_command::command_t c) {
14 | //    scanner_commands.push_back(scanner_command(scannerName, c));
15 | //}
16 | 


--------------------------------------------------------------------------------
/etc/coverage_report.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Create a code-coverage report locally and upload one to codecov
 4 | # Should be run from the root directory
 5 | 
 6 | if [ -r coverage_report.sh ]; then
 7 |     echo "coverage_report.sh run in /etc directory. moving to .."
 8 |     cd ..
 9 | fi
10 | 
11 | #make distclean
12 | #CFLAGS="--coverage" CXXFLAGS="--coverage" LDFLAGS="--coverage" ./configure
13 | make check
14 | lcov --capture --directory . --output-file main_coverage.info
15 | genhtml main_coverage.info --output-directory out
16 | 
17 | # Upload the coverage report
18 | bash <(curl -s https://codecov.io/bash)
19 | 
20 | /bin/rm -f *.gcov *.gcda *.gcno
21 | 
22 | 


--------------------------------------------------------------------------------
/feature_recorder_mhist.h.broken:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * histogram support.
 3 |  * We can ask the feature recorder to generate a histogram.
 4 |  * The file feature recorder re-reads the file.
 5 |  * The in-memory histogram feature recorder just records the features and outputs
 6 |  * them as a histogram when it shuts down (and whenever it runs out of memory!)
 7 |  * The SQL recorder uses an SQL query to make the histogram. So it never runs out of memory,
 8 |  * but it may run slow.
 9 |  */
10 | 
11 | /* in-memory histograms */
12 | typedef atomic_histogram<std::string, uint64_t> mhistogram_t; // memory histogram
13 | typedef std::map<histogram_def, mhistogram_t*> mhistograms_t;
14 | 


--------------------------------------------------------------------------------
/test_image_reader.cpp:
--------------------------------------------------------------------------------
 1 | #include "test_image_reader.h"
 2 | 
 3 | test_image_reader::test_image_reader()
 4 | {
 5 | }
 6 | 
 7 | test_image_reader::~test_image_reader()
 8 | {
 9 | }
10 | 
11 | /*
12 |  * Virtual data is 0..255 in positions 0..255
13 |  */
14 | 
15 | ssize_t test_image_reader::pread(void *buf, size_t bufsize, uint64_t offset) const
16 | {
17 |     if ( offset>=256) return 0;
18 |     if ( offset+bufsize > 256) bufsize = 256-offset;
19 |     for ( size_t i=0;i<bufsize;i++){
20 |         (reinterpret_cast<uint8_t *>(buf))[i] = i+offset;
21 |     }
22 |     return bufsize;
23 | }
24 | 
25 | int64_t test_image_reader::image_size() const
26 | {
27 |     return 256;
28 | }
29 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.d
 2 | *.exe
 3 | *.gcda
 4 | *.gcno
 5 | *.gcov
 6 | *.info
 7 | *.log
 8 | *.o
 9 | *.so
10 | *.sql3
11 | *.swo
12 | *.swp
13 | *.tar.gz
14 | *.tmp
15 | *.trs
16 | *.zip
17 | *~
18 | .DS_Store
19 | .deps
20 | .dirstamp
21 | Makefile
22 | Makefile.in
23 | TAGS
24 | _deps
25 | a.out
26 | aclocal.m4
27 | ar-lib
28 | autom4te.cache
29 | be20_api-*-*/
30 | build-aux/
31 | compile
32 | config.guess
33 | config.h
34 | config.h.in
35 | config.log
36 | config.status
37 | config.sub
38 | configure
39 | depcomp
40 | install-sh
41 | missing
42 | out/
43 | stamp-h1
44 | stand
45 | test-driver
46 | test-program.cpp
47 | test_be20_api
48 | test_be20api
49 | test_be20api_catch2
50 | tests/Makefile
51 | x.cpp
52 | test_be20_api
53 | README
54 | stamp-h
55 | *.old
56 | 


--------------------------------------------------------------------------------
/demos/thread_demo.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Example of C++11 threads and atomic variables
 3 |  */
 4 | 
 5 | #include <atomic>
 6 | #include <iostream>
 7 | #include <stdexcept>
 8 | #include <thread>
 9 | 
// Two global counters incremented side by side: `v` is atomic, `x` is a plain int.
std::atomic<int> v{0};
int x{0};

// Increment both counters one million times.
void adder()
{
    constexpr int iterations = 1000000;
    for (int n = 0; n != iterations; ++n) {
        ++v;
        ++x;
    }
}
20 | 
21 | int main(int argc, char **argv)
22 | {
23 |     std::thread *t[10];
24 |     for(int i=0;i<10;i++){
25 |         std::cout << "i=" << i << std::endl;
26 |         t[i] = new std::thread(adder);
27 |     }
28 |     for(int i=0;i<10;i++){
29 |         t[i]->join();
30 |     }
31 |     std::cout << "v=" << v << std::endl;
32 |     std::cout << "x=" << x << std::endl;
33 |     return(0);
34 | }
35 | 
36 | 


--------------------------------------------------------------------------------
/tests/regex_demo.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Small test program to show how to use C++17 regular expressions.
 3 |  */
 4 | 
 5 | #include <iostream>
 6 | #include <string>
 7 | #include <regex>
 8 | 
 9 | int main(int argc,char **argv)
10 | {
11 |     std::string s("abc123def");
12 |     std::regex  r("([0-9]+)");
13 |     std::smatch m;
14 |     if (std::regex_search(s, m, r)){
15 |         std::cout << "Matches '" << m.str() << "'\n";
16 |     }
17 | 
18 |     /* Try 32-bit vecotrs */
19 |     std::u32string s32(U"Hello");
20 |     std::cout << "len(s32)=" << s32.size() << "\n";
21 |     std::basic_regex<char>  r8("([0-9]+)");
22 |     std::basic_regex<wchar_t> r16(L"([0-9]+)");
23 | 
24 |     // this doesn't work:
25 |     //std::basic_regex<char32_t>  r32(U"([0-9]+)");
26 |     return(0);
27 | }
28 | 


--------------------------------------------------------------------------------
/etc/CONFIGURE_WINDOWS_LATEST.bash:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # etc/CONFIGURE_WINDOWS_MSYS2.bash
 3 | # Configure MSYS2/MinGW environment for be20_api build
 4 | # See: https://www.msys2.org/
 5 | 
 6 | OS_NAME=msys
 7 | MAKE_CONCURRENCY=-j2
 8 | MPKGS=""
 9 | 
10 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
11 | cd "$SCRIPT_DIR"
12 | 
13 | . ./paths.bash 2>/dev/null || true
14 | 
15 | echo "******************************************************************"
16 | echo "Configuring Windows/MSYS2 environment to compile be20_api"
17 | echo "******************************************************************"
18 | 
19 | # Ensure MSYS2 is updated
20 | 
21 | # Install required packages
22 | if [ $? != 0 ]; then
23 |   echo "Could not install some of the packages. Will not proceed."
24 |   exit 1
25 | fi
26 | 


--------------------------------------------------------------------------------
/formatter.h:
--------------------------------------------------------------------------------
 1 | // https://stackoverflow.com/questions/12261915/how-to-throw-stdexceptions-with-variable-messages
 2 | 
 3 | #ifndef FORMATTER_H
 4 | #define FORMATTER_H
 5 | 
 6 | #include <sstream>
 7 | #include <stdexcept>
 8 | 
 9 | class Formatter {
10 | public:
11 |     Formatter() {};
12 |     ~Formatter() {};
13 | 
14 |     template <typename Type> Formatter& operator<<(const Type& value) {
15 |         stream_ << value;
16 |         return *this;
17 |     }
18 | 
19 |     std::string str() const { return stream_.str(); }
20 |     operator std::string() const { return stream_.str(); }
21 | 
22 |     enum ConvertToString { to_str };
23 |     std::string operator>>(ConvertToString) { return stream_.str(); }
24 | 
25 | private:
26 |     std::stringstream stream_{};
27 |     Formatter(const Formatter&);
28 |     Formatter& operator=(Formatter&);
29 | };
30 | 
31 | #endif
32 | 


--------------------------------------------------------------------------------
/.clang-format:
--------------------------------------------------------------------------------
 1 | ---
 2 | # We'll use defaults from the LLVM style, but with 4 columns indentation.
 3 | BasedOnStyle: LLVM
 4 | IndentWidth: 4
 5 | UseTab: Never
 6 | ---
 7 | Language: Cpp
 8 | # Force pointers to the type for C++.
 9 | DerivePointerAlignment: false
10 | PointerAlignment: Left
11 | ColumnLimit: 120
12 | AccessModifierOffset: -4
13 | AllowShortBlocksOnASingleLine: true
14 | AllowShortIfStatementsOnASingleLine: true
15 | AllowShortCaseLabelsOnASingleLine: true
16 | AllowShortFunctionsOnASingleLine: true
17 | AllowShortLambdasOnASingleLine: true
18 | AllowShortLoopsOnASingleLine: true
19 | SpaceBeforeCtorInitializerColon: true
20 | Standard: Cpp17
21 | MaxEmptyLinesToKeep: 2
22 | ---
23 | Language: JavaScript
24 | # Use 100 columns for JS.
25 | ColumnLimit: 100
26 | ---
27 | Language: Proto
28 | # Don't format .proto files.
29 | DisableFormat: true
30 | ---
31 | Language: CSharp
32 | # Use 100 columns for C#.
33 | ColumnLimit: 100
34 | 


--------------------------------------------------------------------------------
/ChangeLog:
--------------------------------------------------------------------------------
 1 | 2020-07-10  Simson Garfinkel  <simsong@nimi.local>
 2 | 
 3 | 	* removed main_thread detection.
 4 | 
 5 | 2020-06-13  Simson Garfinkel <simsong@acm.org>
 6 | 	* updated license to MIT License, copyright Simson L. Garfinkel, consistent with the fact that this is no longer an official US Government work product.
 7 | 
 8 | 2019-11-11  Simson Garfinkel  <simsong@acm.org>
 9 | 
10 | 	* Tagged v1.6.0
11 | 
12 | 2019-11-10  user  <user@localhost.localdomain>
13 | 
14 | 	* bulk_extractor_i.h (safe_utf16to8): fixed catching of polymorphic type by value. Exceptions need to be caught by reference.
15 | 	(safe_utf8to16): fixed catching of polymorphic type by value. Exceptions need to be caught by reference.
16 | 
17 | 2019-11-10  Simson Garfinkel  <simsong@nimi.local>
18 | 
19 | 	* sbuf.h (class sbuf_t): Really want to make *buf private, but it's still used too many places.
20 | 
21 | 2021-06-01 Simson Garfinkel  <simsong@nimi.local>
22 | 	* Complete rewrite for 2.0
23 | 


--------------------------------------------------------------------------------
/char_class.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * \class CharClass
 3 |  * Examine a block of text and count the number of characters
 4 |  * in various ranges. This is useful for determining if a block of
 5 |  * bytes is coded in BASE16, BASE64, etc.
 6 |  */
 7 | 
 8 | #ifndef CHAR_CLASS_H
 9 | #define CHAR_CLASS_H
10 | struct CharClass {
11 |     uint32_t range_0_9{0};  // a range_0_9 character
12 |     uint32_t range_A_Fi{0}; // a-f or A-F
13 |     uint32_t range_g_z{0};  // g-z
14 |     uint32_t range_G_Z{0};  // G-Z
15 |     CharClass() {}
16 |     void add(const uint8_t ch) {
17 |         if (ch >= 'a' && ch <= 'f') range_A_Fi++;
18 |         if (ch >= 'A' && ch <= 'F') range_A_Fi++;
19 |         if (ch >= 'g' && ch <= 'z') range_g_z++;
20 |         if (ch >= 'G' && ch <= 'Z') range_G_Z++;
21 |         if (ch >= '0' && ch <= '9') range_0_9++;
22 |     }
23 |     void add(const uint8_t* buf, size_t len) {
24 |         for (size_t i = 0; i < len; i++) { add(buf[i]); }
25 |     }
26 | };
27 | 
28 | #endif
29 | 


--------------------------------------------------------------------------------
/README_WIN.md:
--------------------------------------------------------------------------------
 1 | ## The joy of building with Make on Windows
 2 | 
 3 | * There are sub-modules here which require specific hydration
 4 | * `git clone --recurse-submodules https://github.com/simsong/be20_api`
 5 |    * If you've already cloned - `git submodule update --init --recursive` 
 6 | * Install MSYS2 (msys64) - https://www.msys2.org/
 7 | * Create toolchain using `pacman`
 8 | 
 9 | ```sh
10 | pacman -S \
11 |   base-devel \
12 |   mingw-w64-ucrt-x86_64-gcc \
13 |   mingw-w64-ucrt-x86_64-make \
14 |   mingw-w64-ucrt-x86_64-re2 \
15 |   mingw-w64-ucrt-x86_64-abseil-cpp \
16 |   mingw-w64-ucrt-x86_64-sqlite3 \
17 |   mingw-w64-ucrt-x86_64-openssl \
18 |   mingw-w64-ucrt-x86_64-expat
19 | ```
20 | * Generate the `config.h`
21 | ```
22 | ./bootstrap.sh
23 | ```
24 | * Configure
25 | ```
26 | ./configure
27 | ```
28 | * Then make the executable  
29 | ```
30 | make
31 | ```
32 | * Time for tests
33 | ```shell
34 | ./test_be20_api.exe    
35 | 
36 | make check  || (for fn in test*.log ; do echo ""; echo $fn ; cat $fn ; done; exit 1)
37 | ```
38 | Done!
39 | 


--------------------------------------------------------------------------------
/thread-pool/LICENSE.md:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 Barak Shoshany
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/m4/slg_noopt.m4:
--------------------------------------------------------------------------------
 1 | ################################################################
 2 | ############## drop optimization flags and add -g if requested ################
 3 | # Should we disable optimization?
 4 | AC_ARG_ENABLE([opt],
 5 |         [AS_HELP_STRING([--disable-opt],[Drop all -O C flags])],
 6 |         [with_opt=no],
 7 |         [with_opt=yes])
 8 | 
 9 | # Or maybe just tone it down a bit?
10 | AC_ARG_ENABLE([o3],
11 |         [AS_HELP_STRING([--disable-o3],[Do not force O3 optimization; use default level])],
12 |         [with_o3=no],
13 |         [with_o3=yes])
14 | 
15 | if test "${with_opt}" = "no" ; then
16 |   CFLAGS=`echo   -g "$CFLAGS"   | sed s/-O[[0-9]]//`             # note the double quoting!
17 |   CXXFLAGS=`echo -g "$CXXFLAGS" | sed s/-O[[0-9]]//`
18 | else
19 |   # If we are not stripping the optimizer,
20 |   # increase optimizer from -O2 to -O3 if not explicitly forbidden
21 |   if test "${with_o3}" != "no" ; then
22 |       AC_MSG_NOTICE([adding -O3 to CFLAGS and CXXFLAGS])
23 |       CFLAGS=`echo   -g "$CFLAGS"   | sed 's/-O[123]//'`             # note the double quoting!
24 |       CFLAGS="$CFLAGS -O3"
25 | 
26 |       CXXFLAGS=`echo -g "$CXXFLAGS" | sed 's/-O[123]//'`
27 |       CXXFLAGS="$CXXFLAGS -O3"
28 |   fi
29 | fi
30 | 


--------------------------------------------------------------------------------
/feature_recorder_sql.h:
--------------------------------------------------------------------------------
 1 | #include <cassert>
 2 | #include <cinttypes>
 3 | 
 4 | #include <atomic>
 5 | #include <map>
 6 | #include <regex>
 7 | #include <string>
 8 | #include <thread>
 9 | 
10 | #include "feature_recorder.h"
11 | #include "pos0.h"
12 | 
13 | #ifdef HAVE_SQLITE3_H
14 | #include <sqlite3.h>
15 | 
16 | class feature_recorder_sql : public feature_recorder {
17 |     struct besql_stmt {
18 |         besql_stmt(const besql_stmt&) = delete;
19 |         besql_stmt& operator=(const besql_stmt&) = delete;
20 |         std::mutex Mstmt{};
21 |         sqlite3_stmt* stmt{}; // the prepared statement
22 |         besql_stmt(sqlite3* db3, const char* sql);
23 |         virtual ~besql_stmt();
24 |         void insert_feature(const pos0_t& pos, // insert it into this table!
25 |                             const std::string& feature, const std::string& feature8, const std::string& context);
26 |     };
27 | #if defined(HAVE_SQLITE3_H) and defined(HAVE_LIBSQLITE3)
28 |     // virtual void dump_histogram_sqlite3(const histogram_def &def,void *user,feature_recorder::dump_callback_t cb)
29 |     // const;
30 | #endif
31 | public:
32 |     feature_recorder_sql(class feature_recorder_set& fs, feature_recorder_def def);
33 |     virtual ~feature_recorder_sql();
34 |     //virtual void histogram_write(AtomicUnicodeHistogram& h) override; // flush a specific histogram
35 | };
36 | #endif
37 | 


--------------------------------------------------------------------------------
/CODING_STANDARDS.txt:
--------------------------------------------------------------------------------
 1 | Coding Standards v1.0
 2 | Simson L. Garfinkel
 3 | December 3, 2013
 4 | 
 5 | All standards are based on compromise. These standards seem to be a
 6 | good compromise between a variety of coding styles and existing
 7 | standards.
 8 | 
 9 | Executive summary:
10 | 
11 | * No tabs in source code.
12 | 
13 |   Legacy code has tabs at 8 characters; they can be freely converted
14 |   to spaces as necessary.
15 | 
16 | * Indent at 4 spaces.
17 | 
18 | * Open braces start on the SAME LINE for:
19 |   - if statements
20 |   - inline functions in .h headers
21 |   - Java function declarations
22 | 
23 | * Open braces start on NEXT LINE for:
24 |   - C function declarations
25 | 
26 | * We use the following lines/configuration variables to try to enforce
27 |   the above:
28 | 
29 |   For EMACS at the top of c programs:
30 |   /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
31 | 
32 |   In .emacs files:
33 |   (setq-default indent-tabs-mode nil)
34 |   (setq c-basic-offset 4)
35 | 
36 | 
37 | * In general, do not use pointers in structures when a null value has no defined meaning. Always use references in these cases.
38 | 
39 | References:
40 | ===========
41 | 
42 | * http://www.emacswiki.org/emacs/NoTabs
43 | 
44 | * http://www.jwz.org/doc/tabs-vs-spaces.html
45 | 
46 | * http://slashdot.org/pollBooth.pl?qid=395&aid=-1
47 | 
48 | * http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml
49 | 
50 | * http://www.python.org/dev/peps/pep-0008/#maximum-line-length
51 | 


--------------------------------------------------------------------------------
/tests/unilang8.htm:
--------------------------------------------------------------------------------
 1 | <html>
 2 | <head>
 3 | <meta http-equiv="content-type" content="text/html; charset=UTF-8">
 4 | <title>"Language Learning and Teaching" (more or less) in 16 or more languages</title>
 5 | </head>
 6 | <body bgcolor="c0c0c0" text="#000040" background="/graphics/backgrnd.gif">
 7 | <B><center><h1>
 8 | "Language Learning and Teaching" (more or less) in 16 or more languages
 9 | </h1></center></B>
10 | <hr>
11 | 外国語の学習と教授
12 | <P>
13 | Language Learning and Teaching
14 | <P>
15 | Изучение и обучение иностранных языков
16 | <P>
17 | Tere Daaheng Aneng Karimah
18 | <P>
19 | 語文教學・语文教学
20 | <P>
21 | Enseñanza y estudio de idiomas
22 | <P>
23 | Изучаване и Преподаване на Чужди Езици
24 | <P>
25 | ქართული ენის შესწავლა და სწავლება
26 | <P>
27 | 'læŋɡwidʒ 'lɘr:niŋ ænd 'ti:tʃiŋ
28 | <P>
29 | Lus kawm thaib qhia
30 | <P>
31 | Ngôn Ngữ, Sự học,
32 | <P>
33 | ‭‫ללמוד וללמד את השֵפה
34 | <P>
35 | L'enseignement et l'étude des langues
36 | <P>
37 | 말배우기와 가르치기
38 | <P>
39 | Nauka języków obcych
40 | <P>
41 | Γλωσσική Εκμὰθηση και Διδασκαλία
42 | <P>
43 | ‭‫ﺗﺪﺭﯾﺲ ﻭ ﯾﺎﺩﮔﯿﺮﯼ ﺯﺑﺎﻥ
44 | <P>
45 | Sprachlernen und -lehren
46 | <P>
47 | ‭‫ﺗﻌﻠﻢ ﻭﺗﺪﺭﻳﺲ ﺍﻟﻌﺮﺑﻴﺔ
48 | <P>
49 | เรียนและสอนภาษา
50 | <p>
51 | <HR><CENTER><A HREF="/index.htm">Home</A> | <A HREF="/sitemap.htm">Site Map</A>
52 | | <A HREF="/hcf/hcf.htm">Services</A> | <A HREF="/whatsnew.htm">New</A>
53 | | <A HREF="/wincalis.htm">WinCALIS</A> | <A HREF="/uniintro.htm">UniEdit</A>
54 | <HR></CENTER>
55 | 
56 | </body>
57 | </html>
58 | 


--------------------------------------------------------------------------------
/etc/install_autotools.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | # Originally from https://gist.github.com/GraemeConradie/49d2f5962fa72952bc6c64ac093db2d5
 4 | # Install gnu autotools for running under github actions
 5 | 
 6 | ##
 7 | # Install autoconf, automake and libtool smoothly on Mac OS X.
 8 | # Newer versions of these libraries are available and may work better on OS X
 9 | ## Every step below aborts the script on failure (|| exit 1).
10 | 
11 | export build="$HOME/devtools"       # or wherever you'd like to build
12 | export PATH=$PATH:/usr/local/bin
13 | mkdir -p "$build" || exit 1
14 | 
15 | ##
16 | # Autoconf (configure.ac has AC_PREREQ([2.71]), so 2.69 is too old)
17 | # https://ftpmirror.gnu.org/autoconf
18 | 
19 | AUTOCONF="autoconf-2.71"
20 | cd "$build" || exit 1
21 | curl -k -OL https://ftpmirror.gnu.org/autoconf/$AUTOCONF.tar.gz || exit 1   # NOTE(review): -k skips TLS certificate checks
22 | ls -l $AUTOCONF.tar.gz
23 | tar xzf $AUTOCONF.tar.gz  || { xxd $AUTOCONF.tar.gz ; exit 1 ; }   # hex-dump the bad archive, then fail
24 | cd $AUTOCONF || exit 1
25 | ./configure --prefix=/usr/local || exit 1
26 | make || exit 1
27 | sudo make install || exit 1
28 | 
29 | ##
30 | # Automake
31 | # https://ftpmirror.gnu.org/automake
32 | 
33 | AUTOMAKE="automake-1.16.3"
34 | cd "$build" || exit 1
35 | curl -k -OL https://ftpmirror.gnu.org/automake/$AUTOMAKE.tar.gz || exit 1
36 | ls -l $AUTOMAKE.tar.gz
37 | tar xzf $AUTOMAKE.tar.gz || exit 1
38 | cd $AUTOMAKE || exit 1
39 | ./configure --prefix=/usr/local || exit 1
40 | make || exit 1
41 | sudo make install || exit 1
42 | 
43 | ##
44 | # Libtool
45 | # https://ftpmirror.gnu.org/libtool
46 | 
47 | LIBTOOL=libtool-2.4.6
48 | cd "$build" || exit 1
49 | curl -k -OL https://ftpmirror.gnu.org/libtool/$LIBTOOL.tar.gz || exit 1
50 | ls -l $LIBTOOL.tar.gz
51 | tar xzf $LIBTOOL.tar.gz || exit 1
52 | cd $LIBTOOL || exit 1
53 | ./configure --prefix=/usr/local || exit 1
54 | make || exit 1
55 | sudo make install || exit 1
56 | 
57 | echo "Installation complete."
58 | 


--------------------------------------------------------------------------------
/m4/slg_address_sanitizer.m4:
--------------------------------------------------------------------------------
 1 | ################################################################
 2 | ## Sanitizer support. Each option below appends its -fsanitize flag to CXXFLAGS unless given as --disable-X / --enable-X=no.
 3 | # https://github.com/libMesh/libmesh/issues/1396
 4 | AC_ARG_ENABLE([address-sanitizer],
 5 |               [AS_HELP_STRING([--enable-address-sanitizer],
 6 |                               [enabled AddressSanitizer support for detecting a wide variety of
 7 |                                memory allocation and deallocation errors])],
 8 |               [AS_IF([test "x$enableval" != "xno"],
 9 |                      [AC_DEFINE(HAVE_ADDRESS_SANITIZER, 1, [enable AddressSanitizer])
10 |                       address_sanitizer="yes"
11 |                       CXXFLAGS="$CXXFLAGS -fsanitize=address -fsanitize-address-use-after-scope"])],
12 |               [])
13 | 
14 | AC_ARG_ENABLE([thread-sanitizer],
15 |               [AS_HELP_STRING([--enable-thread-sanitizer],
16 |                               [enabled ThreadSanitizer support for detecting a wide variety of
17 |                                thread interlocking errors])],
18 |               [AS_IF([test "x$enableval" != "xno"],
19 |                      [AC_DEFINE(HAVE_THREAD_SANITIZER, 1, [enable ThreadSanitizer])
20 |                       thread_sanitizer="yes"
21 |                       CXXFLAGS="$CXXFLAGS -fsanitize=thread "])],
22 |               [])
23 | 
24 | AC_ARG_ENABLE([undefined-sanitizer],
25 |               [AS_HELP_STRING([--enable-undefined-sanitizer],
26 |                               [enabled UndefinedSanitizer support for detecting a wide variety of undefined behavior])],
27 |               [AS_IF([test "x$enableval" != "xno"],
28 |                      [AC_DEFINE(HAVE_UNDEFINED_SANITIZER, 1, [enable UndefinedSanitizer])
29 |                       undefined_sanitizer="yes"
30 |                       CXXFLAGS="$CXXFLAGS -fsanitize=undefined "])],
31 |               [])
32 | 


--------------------------------------------------------------------------------
/Makefile.am:
--------------------------------------------------------------------------------
 1 | # be20_api Makefile.am
 2 | # This file is compiled with automake to create Makefile.in.
 3 | # Makefile.in is transformed by "configure" to create Makefile
 4 | #
 5 | # (C) 2020-2022 Simson L. Garfinkel
 6 | # (C) 2020-2023 BasisTech LLC
 7 | # https://www.gnu.org/licenses/lgpl-3.0.en.html
 8 | 
 9 | # don't include bootstrap. People run it, and they shouldn't
10 | # It's only for people who check out the git repo
11 | 
12 | # Use the current directory and include the list of BE20_API sources
13 | BE20_API_DIR = .
14 | include Makefile.defs
15 | 
16 | DFXML_SRC_DIR=dfxml_cpp/src/
17 | include $(DFXML_SRC_DIR)Makefile.defs
18 | 
19 | # Hardcode dfxml_cpp/src/Makefile.defs ; there is a typo somewhere.
20 | EXTRA_DIST = \
21 | 	$(DFXML_EXTRA_DIST) \
22 | 	$(BE20_API_EXTRA_DIST) \
23 | 	bootstrap.sh \
24 | 	test_be20_api_malloc_debug \
25 | 	tests/random.dat \
26 | 	tests/regex_demo.cpp \
27 | 	tests/unilang.htm \
28 | 	tests/unilang8.htm
29 | 
30 | ETAGS = etags-emacs
31 | ACLOCAL_AMFLAGS = -I m4
32 | 
33 | AM_CPPFLAGS = @RE2_CFLAGS@ -I$(top_srcdir)/utfcpp/source
34 | 
35 | clean-local:
36 | 	rm -f *.gcov *~ *.gcda *.gcno
37 | 
38 | clean-gcov:
39 | 	rm -f *.gcov *.gcda *.gcno
40 | # Reformat the top-level headers and sources in place (needs clang-format on PATH).
41 | clang-format:
42 | 	clang-format -i *.h *.cpp
43 | 
44 | AUTOMAKE_OPTIONS = subdir-objects
45 | 
46 | bin_PROGRAMS = test_be20_api
47 | check_PROGRAMS = test_be20_api
48 | check_SCRIPTS = test_be20_api_malloc_debug
49 | TESTS = $(check_PROGRAMS)
50 | 
51 | # apitest: test_be20_api
52 | 
53 | test_be20_api_LDADD = @RE2_LIBS@ $(LIBS)
54 | test_be20_api_SOURCES = $(BE20_API_SRC) $(DFXML_READER) $(DFXML_WRITER) \
55 | 	catch.hpp \
56 | 	test_be20_api.cpp \
57 | 	test_be20_threadpool.cpp \
58 | 	test_image_reader.h \
59 | 	test_image_reader.cpp
60 | 


--------------------------------------------------------------------------------
/.github/workflows/build-windows.yml:
--------------------------------------------------------------------------------
 1 | name: BE20_API CI Windows
 2 | on:
 3 |   pull_request:
 4 |     branches: [ main ]
 5 |   push:
 6 |     branches: [ main ]
 7 | 
 8 | 
 9 | jobs:
10 |   build:
11 |     runs-on: 'windows-latest'
12 |     steps:
13 |       - name: Checkout
14 |         uses: actions/checkout@v4
15 |         with:
16 |           submodules: recursive
17 | 
18 |       # ----------------------------
19 |       # Windows (MSYS2 UCRT64)
20 |       # ----------------------------
21 |       - name: Setup MSYS2
22 |         uses: msys2/setup-msys2@v2
23 |         with:
24 |           update: true
25 |           msystem: ucrt64
26 |           path-type: inherit
27 | 
28 |       - name: Install Windows dependencies and bootstrap
29 |         shell: msys2 {0}
30 |         env:
31 |           WANT_AUTOCONF: "2.71"
32 |         run: |
33 |           pacman -Syu --noconfirm
34 |           pacman -S --needed --noconfirm base-devel automake autoconf pkgconf mingw-w64-ucrt-x86_64-gcc mingw-w64-ucrt-x86_64-make \
35 |                mingw-w64-ucrt-x86_64-re2 mingw-w64-ucrt-x86_64-abseil-cpp mingw-w64-ucrt-x86_64-sqlite3 mingw-w64-ucrt-x86_64-openssl \
36 |                mingw-w64-ucrt-x86_64-expat
37 |           bash bootstrap.sh
38 | 
39 |       - name: configure for windows (ucrt64)
40 |         shell: msys2 {0}
41 |         run: |
42 |           ./configure --prefix=/ucrt64
43 | 
44 |       # ----------------------------
45 |       # build + test windows
46 |       # ----------------------------
47 |       - name: make check
48 |         shell: msys2 {0}
49 |         run: |
50 |           make 
51 |           ./test_be20_api.exe
52 |           make check  || (for fn in test*.log ; do echo ""; echo $fn ; cat $fn ; done; exit 1)
53 | 
54 |       - uses: ammaraskar/gcc-problem-matcher@master
55 |         name: GCC Problem Matcher
56 | 
57 | 


--------------------------------------------------------------------------------
/m4/slg_gcc_all_warnings.m4:
--------------------------------------------------------------------------------
 1 | ################################################################
 2 | #
 3 | # Enable all the compiler debugging we can find
 4 | # Simson L. Garfinkel
 5 | #
 6 | # This is originally from PhotoRec, but modified substantially by Simson
 7 | # Figure out which flags we can use with the compiler.
 8 | #
 9 | # These I don't like:
10 | # -Wdeclaration-after-statement -Wconversion
11 | # doesn't work: -Wunreachable-code
12 | # causes configure to crash on gcc-4.2.1: -Wsign-compare -Winline
13 | # causes warnings with unistd.h:  -Wnested-externs
14 | # Just causes too much annoyance: -Wmissing-format-attribute
15 | 
16 | # Check G++
17 | # We don't use these warnings:
18 | # -Waggregate-return -- aggregate returns are GOOD; they simplify code design
19 | # We can use these warnings after ZLIB gets upgraded:
20 | # -Wundef  --- causes problems with zlib
21 | # -Wcast-qual
22 | # -Wmissing-format-attribute  --- Just too annoying
23 | 
24 | AC_LANG_PUSH(C++)
25 | AC_CHECK_HEADERS([string])
26 | CXX_WARNINGS_TO_TEST="-Wall -MD -Wpointer-arith -Wshadow -Wwrite-strings -Wcast-align -Wredundant-decls -Wdisabled-optimization -Wfloat-equal -Wmultichar -Wmissing-noreturn -Woverloaded-virtual -Wsign-promo"
27 | 
28 | if test x"${mingw}" != "xyes" ; then
29 |   # add the warnings we don't want to do on mingw
30 |   CXX_WARNINGS_TO_TEST="$CXX_WARNINGS_TO_TEST  -Weffc++"
31 | fi
32 | 
33 | AC_MSG_NOTICE([C++ Warnings to test: $CXX_WARNINGS_TO_TEST])
34 | 
35 | for option in $CXX_WARNINGS_TO_TEST
36 | do
37 |   SAVE_CXXFLAGS="$CXXFLAGS"
38 |   CXXFLAGS="$CXXFLAGS $option"
39 |   AC_MSG_CHECKING([whether g++ understands $option])
40 |   AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[]], [[]])],
41 |       [has_option=yes],
42 |       [has_option=no; CXXFLAGS="$SAVE_CXXFLAGS"])
43 |   AC_MSG_RESULT($has_option)
44 |   unset has_option
45 |   unset SAVE_CXXFLAGS
46 | done
47 | unset option
48 | AC_LANG_POP()
49 | 


--------------------------------------------------------------------------------
/demos/regex_demo.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <regex>
 3 | #include <string>
 4 | 
 5 | int main() {
 6 |     // Range-based for loop: print every character of the string on its own line.
 7 |     const std::string s = "my mother mary.";
 8 |     for (const char ch : s) { std::cout << "it=" << ch << "\n"; }
 9 | 
10 |     // Simple regular expression matching
11 |     // (std::regex_match succeeds only when the whole string matches the pattern)
12 |     const std::string fnames[] = {"foo.txt", "bar.txt", "baz.dat", "zoidberg"};
13 |     const std::regex txt_regex("[a-z]+\\.txt");
14 | 
15 |     for (const auto& fname : fnames) { std::cout << fname << ": " << std::regex_match(fname, txt_regex) << '\n'; }
16 | 
17 |     // Extraction of a sub-match
18 |     const std::regex base_regex("([a-z]+)\\.txt");
19 |     std::smatch base_match;
20 | 
21 |     for (const auto& fname : fnames) {
22 |         if (std::regex_match(fname, base_match, base_regex)) {
23 |             // The first sub_match is the whole string; the next
24 |             // sub_match is the first parenthesized expression.
25 |             if (base_match.size() == 2) {
26 |                 const std::ssub_match base_sub_match = base_match[1];
27 |                 const std::string base = base_sub_match.str();
28 |                 std::cout << fname << " has a base of " << base << '\n';
29 |             }
30 |         }
31 |     }
32 | 
33 |     // Extraction of several sub-matches
34 |     const std::regex pieces_regex("([a-z]+)\\.([a-z]+)");
35 |     std::smatch pieces_match;
36 | 
37 |     for (const auto& fname : fnames) {
38 |         if (std::regex_match(fname, pieces_match, pieces_regex)) {
39 |             std::cout << fname << '\n';
40 |             for (size_t i = 0; i < pieces_match.size(); ++i) {
41 |                 const std::ssub_match sub_match = pieces_match[i];
42 |                 const std::string piece = sub_match.str();
43 |                 std::cout << "  submatch " << i << ": " << piece << '\n';
44 |             }
45 |         }
46 |     }
47 | }
48 | 


--------------------------------------------------------------------------------
/test_be20_threadpool.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * be20_api threadpool test is in this file.
 3 |  * The goal is to have complete test coverage of the v2 API
 4 |  *
 5 |  */
 6 | 
 7 | // https://github.com/catchorg/Catch2/blob/master/docs/tutorial.md#top
 8 | 
 9 | #define CATCH_CONFIG_CONSOLE_WIDTH 120
10 | 
11 | #include "config.h"
12 | #include "catch.hpp"
13 | 
14 | #include <algorithm>
15 | #include <array>
16 | #include <chrono>
17 | #include <thread>
18 | #include <cstdlib>
19 | #include <cstring>
20 | #include <fcntl.h>
21 | #include <filesystem>
22 | #include <functional>
23 | #include <iostream>
24 | #include <random>
25 | #include <string>
26 | #include <csignal>
27 | 
28 | #include "dfxml_cpp/src/hash_t.h"
29 | #include "dfxml_cpp/src/dfxml_writer.h"
30 | 
31 | #include "atomic_unicode_histogram.h"
32 | #include "sbuf.h"
33 | #include "sbuf_stream.h"
34 | #include "scanner_set.h"
35 | #include "threadpool.h"
36 | #include "utils.h"
37 | 
38 | #ifndef O_BINARY
39 | #define O_BINARY 0
40 | #endif
41 | 
42 | [[noreturn]] void alarm_handler(int signal) // signal-handler-shaped callback; the `signal` argument is not used
43 | {
44 |     std::cerr << "alarm\n";
45 |     throw std::runtime_error("scanner_set_mt timeout"); // always throws, hence [[noreturn]]
46 | } // NOTE(review): nothing in this file installs this handler; throwing out of a real signal handler is not portable
47 | 
48 | // This will give an error unless run with MallocNanoZone=0
49 | TEST_CASE("scanner_set_mt", "[thread_pool]") {
50 |     std::cout << std::endl << "This may take up to 60 seconds. Don't give up..." << std::endl;
51 |     INFO("scanner_set_mt test start");
52 |     std::atomic<bool> done{false};   // set by this thread once the workers have been joined
53 | 
54 |     // Watchdog: poll `done` against a 60s deadline instead of one flat 60s sleep, so a passing run returns promptly.
55 |     std::thread watchdog([&] {
56 |         using namespace std::chrono_literals;
57 |         const auto deadline = std::chrono::steady_clock::now() + 60s;
58 |         while (!done && std::chrono::steady_clock::now() < deadline) { std::this_thread::sleep_for(50ms); }
59 |         if (!done) { FAIL("scanner_set_mt test timed out"); }
60 |     });
61 | 
62 |     scanner_config sc;
63 |     feature_recorder_set::flags_t f;
64 |     scanner_set ss(sc, f, nullptr);
65 |     ss.launch_workers(12);
66 |     ss.set_spin_poll_time(1);
67 |     ss.join();
68 | 
69 |     done = true;                     // lets the watchdog loop exit
70 |     watchdog.join();
71 | }
72 | 


--------------------------------------------------------------------------------
/sbuf_stream.h:
--------------------------------------------------------------------------------
 1 | /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
 2 | #ifndef SBUF_STREAM_H
 3 | #define SBUF_STREAM_H
 4 | 
 5 | #include "sbuf.h"
 6 | 
 7 | /** \addtogroup bulk_extractor_APIs
 8 |  * @{
 9 |  */
10 | 
11 | /** \file */
12 | /**
13 |  * sbuf_stream provides the get services of sbuf_t but wrapped in a Stream interface.
14 |  * Note that sbuf_stream is not particularly optimized; it is simply a wrapper.
15 |  * Right now this is only used by scan_winprefetch. It could become a general iterator.
16 |  */
17 | class sbuf_stream {
18 | private:
19 |     const sbuf_t& sbuf;   // stored by reference, so the sbuf must outlive this stream
20 |     size_t offset {};     // value-initialized to 0; presumably the current read position (see seek/tell)
21 | 
22 | public:
23 |     sbuf_stream(const sbuf_t& sbuf_);
24 |     ~sbuf_stream();
25 |     void seek(size_t offset);   // NOTE(review): parameter has the same name as the `offset` member
26 |     size_t tell();
27 | 
28 |     /**
29 |      * \name integer-based stream readers
30 |      * @{ */
31 |     uint8_t get8u();            // unsigned readers without a byte-order argument
32 |     uint16_t get16u();
33 |     uint32_t get32u();
34 |     uint64_t get64u();
35 | 
36 |     uint8_t get8uBE();          // "BE" presumably means big-endian — confirm in sbuf_stream.cpp
37 |     uint16_t get16uBE();
38 |     uint32_t get32uBE();
39 |     uint64_t get64uBE();
40 | 
41 |     uint8_t get8u(sbuf_t::byte_order_t bo);     // byte order supplied by the caller
42 |     uint16_t get16u(sbuf_t::byte_order_t bo);
43 |     uint32_t get32u(sbuf_t::byte_order_t bo);
44 |     uint64_t get64u(sbuf_t::byte_order_t bo);
45 | 
46 |     int8_t get8i();             // signed counterparts of the readers above
47 |     int16_t get16i();
48 |     int32_t get32i();
49 |     int64_t get64i();
50 | 
51 |     int8_t get8iBE();
52 |     int16_t get16iBE();
53 |     int32_t get32iBE();
54 |     int64_t get64iBE();
55 | 
56 |     int8_t get8i(sbuf_t::byte_order_t bo);
57 |     int16_t get16i(sbuf_t::byte_order_t bo);
58 |     int32_t get32i(sbuf_t::byte_order_t bo);
59 |     int64_t get64i(sbuf_t::byte_order_t bo);
60 |     /** @} */
61 | 
62 |     /**
63 |      * \name string and wstring stream readers
64 |      * @{ */
65 |     std::string  getUTF8();
66 |     std::string  getUTF8(size_t num_octets_requested );
67 |     std::wstring getUTF16();
68 |     std::wstring getUTF16(size_t num_code_units_requested);
69 |     /** @} */
70 | };
71 | 
72 | #endif
73 | 


--------------------------------------------------------------------------------
/configure.ac:
--------------------------------------------------------------------------------
 1 | AC_PREREQ([2.71])
 2 | AC_INIT([BE20_API],[2.1.0],[bugs@digitalcorpora.org])
 3 | AC_CONFIG_SRCDIR([Makefile.am])        dnl or src/main.c / src/...
 4 | AC_CONFIG_AUX_DIR([build-aux])
 5 | AC_USE_SYSTEM_EXTENSIONS
 6 | AM_INIT_AUTOMAKE
 7 | 
 8 | m4_include([be20_configure.m4])
 9 | m4_include([dfxml_cpp/src/dfxml_configure.m4])
10 | m4_include([m4/slg_address_sanitizer.m4])
11 | m4_include([m4/slg_noopt.m4])
12 | 
13 | AC_LANG([C++])
14 | AC_PROG_CC
15 | AC_PROG_CXX
16 | 
17 | AC_CONFIG_HEADERS([config.h])
18 | AC_CONFIG_FILES([Makefile ])
19 | 
20 | dnl Enforce C++17 (the version passed to AX_CXX_COMPILE_STDCXX below)
21 | AX_CXX_COMPILE_STDCXX([17], [noext], [mandatory])
22 | 
23 | dnl Optional: pkg-config and deps
24 | PKG_PROG_PKG_CONFIG
25 | dnl PKG_CHECK_MODULES([DEPS], [foo >= 1.2 bar])
26 | 
27 | dnl Optional feature toggles (asan example)
28 | AC_ARG_ENABLE([asan],
29 |   [AS_HELP_STRING([--enable-asan], [Build with AddressSanitizer])],
30 |   [], [enable_asan=no])
31 | AS_IF([test "x$enable_asan" = "xyes"], [
32 |   dnl Append rather than overwrite; keep user flags intact
33 |   CXXFLAGS="$CXXFLAGS -fsanitize=address -fno-omit-frame-pointer"
34 |   LDFLAGS="$LDFLAGS -fsanitize=address"
35 | ])
36 | 
37 | 
38 | ################################################################
39 | # Take out duplicate flags
40 | CFLAGS=$(echo $CFLAGS | tr ' ' '\n' | sort -u | tr '\n' ' ')
41 | CPPFLAGS=$(echo $CPPFLAGS | tr ' ' '\n' | sort -u | tr '\n' ' ')
42 | CXXFLAGS=$(echo $CXXFLAGS | tr ' ' '\n' | sort -u | tr '\n' ' ')
43 | AM_LDFLAGS=$(echo $LDFLAGS | tr ' ' '\n' | sort -u | tr '\n' ' ')
44 | 
45 | AC_MSG_NOTICE([*************************************])
46 | AC_MSG_NOTICE([*************************************])
47 | AC_MSG_NOTICE([  PACKAGE_NAME:     $PACKAGE_NAME])
48 | AC_MSG_NOTICE([  PACKAGE_VERSION:  $PACKAGE_VERSION])
49 | AC_MSG_NOTICE([  CC:               $CC])
50 | AC_MSG_NOTICE([  CXX:              $CXX])
51 | AC_MSG_NOTICE([  CPPFLAGS:         $CPPFLAGS])
52 | AC_MSG_NOTICE([  CFLAGS:           $CFLAGS])
53 | AC_MSG_NOTICE([  CXXFLAGS:         $CXXFLAGS])
54 | AC_MSG_NOTICE([  LIBS:             $LIBS])
55 | AC_MSG_NOTICE([  LDFLAGS:          $LDFLAGS])
56 | 
57 | 
58 | AC_OUTPUT
59 | 


--------------------------------------------------------------------------------
/scanner_params.cpp:
--------------------------------------------------------------------------------
 1 | #include "config.h"
 2 | #include "scanner_params.h"
 3 | #include "feature_recorder.h"
 4 | #include "feature_recorder_set.h"
 5 | #include "scanner_set.h"
 6 | #include "path_printer.h"
 7 | /* Build a scanner_params from its parts; each argument initializes the member of the same name. */
 8 | scanner_params::scanner_params(struct scanner_config &sc_, class scanner_set  *ss_,
 9 |                                const path_printer *pp_, phase_t phase_, const sbuf_t* sbuf_)
10 |     : sc(sc_), ss(ss_), pp(pp_), phase(phase_), sbuf(sbuf_)
11 | {
12 | }
13 | /* Derive from sp_existing: copies its sc, ss, pp, phase and pp_po, but uses the given sbuf_ and pp_path_. */
14 | scanner_params::scanner_params(const scanner_params& sp_existing, const sbuf_t* sbuf_, std::string pp_path_)
15 |     : sc(sp_existing.sc), ss(sp_existing.ss), pp(sp_existing.pp), phase(sp_existing.phase), sbuf(sbuf_),
16 |       pp_path(pp_path_), pp_po(sp_existing.pp_po)
17 | {
18 | }
19 | 
20 | /* Look up a feature recorder by name by delegating to the scanner_set; asserts that ss is non-null.
21 |  * This interface creates if we are in init phase, doesn't if we are in scan phase */
22 | feature_recorder& scanner_params::named_feature_recorder(const std::string feature_recorder_name) const
23 | {
24 |     assert(ss!=nullptr);
25 |     return ss->named_feature_recorder(feature_recorder_name);
26 | }
27 | 
28 | /* Allow call by scanners using the sp. Currently used in scan_zip.
29 |  * Returns true exactly when ss->previously_processed_count(s) is 0; asserts that ss is non-null.
30 |  * NOTE(review): confirm that this polarity is what callers expect from the function name. */
31 | bool scanner_params::check_previously_processed(const sbuf_t &s) const
32 | {
33 |     assert(ss!=nullptr);
34 |     return ss->previously_processed_count(s)==0;
35 | }
36 | /* Hand new_sbuf to the path printer (if there is one) or to the scanner set. Takes ownership of new_sbuf in every case. */
37 | void scanner_params::recurse(const sbuf_t* new_sbuf) const {
38 |     if (pp!=nullptr) {                  // we have a path printer; call that instead
39 |         scanner_params sp_new(*this, new_sbuf, this->pp_path);
40 |         try {
41 |             pp->process_sp( sp_new );           // where do we keep the path being processed? In scanner_params...
42 |         }
43 |         catch (...) {                       // path_printer_finished, or any other exception:
44 |             delete new_sbuf;                    // make sure it gets deleted (no leak on unexpected errors)
45 |             throw;                              // and re-throw
46 |         }
47 |         delete new_sbuf;                    // and now we are done with it.
48 |         return;
49 |     }
50 | 
51 |     assert(ss!=nullptr);                // make sure there is a scanner set if we are descending
52 |     // In normal operations we recurse. However, in unit testing recursion is sometimes intentionally disabled.
53 |     // In such a situation, the sbuf is just deleted.
54 |     if (ss->allow_recurse()) {
55 |         ss->schedule_sbuf(new_sbuf);    /* sbuf will be deleted after it is processed */
56 |     } else {
57 |         delete new_sbuf;                // just delete it
58 |     }
59 | }
60 | 


--------------------------------------------------------------------------------
/be20_configure.m4:
--------------------------------------------------------------------------------
 1 | #
 2 | # mix-ins for be20_api
 3 | #
 4 | 
 5 | AC_MSG_NOTICE([be20_api/be20_configure.m4 start])
 6 | AC_DEFINE(BE20_CONFIGURE_APPLIED, 1, [be20_configure.m4 was included by configure.ac])
 7 | 
 8 | ################################################################
 9 | ## Endian check. Used for sbuf code.
10 | AC_C_BIGENDIAN([AC_DEFINE(BE20_API_BIGENDIAN, 1, [Big Endian architecture - like M68K])],
11 |                [AC_DEFINE(BE20_API_LITTLEENDIAN, 1, [Little Endian architecture - like x86])])
12 | 
13 | ################################################################
14 | ## Headers and functions (each one found defines the matching HAVE_<NAME> macro in config.h)
15 | AC_CHECK_HEADERS([ dlfcn.h fcntl.h limits.h limits/limits.h linux/if_ether.h net/ethernet.h netinet/if_ether.h netinet/in.h pcap.h pcap/pcap.h sqlite3.h sys/cdefs.h sys/mman.h sys/stat.h sys/time.h sys/types.h sys/vmmeter.h unistd.h windows.h windowsx.h winsock2.h wpcap/pcap.h mach/mach.h mach-o/dyld.h])
16 | 
17 | AC_CHECK_FUNCS([gmtime_r ishexnumber isxdigit localtime_r mmap err errx warn warnx pread64 pread strptime _lseeki64 task_info utimes host_statistics64])
18 | 
19 | ################################################################
20 | ## Libraries
21 | ## Note that we now require pkg-config
22 | 
23 | AC_CHECK_LIB([sqlite3],[sqlite3_libversion])
24 | AC_CHECK_FUNCS([sqlite3_create_function_v2 sysctlbyname])
25 | 
26 | AC_MSG_NOTICE([be20_configure: CPPFLAGS are now $CPPFLAGS])
27 | 
28 | # re2: located through pkg-config (which provides RE2_CFLAGS / RE2_LIBS); HAVE_RE2 is defined once when found
29 | AC_LANG_PUSH(C++)
30 | AC_CHECK_HEADERS([re2/re2.h])
31 | PKG_CHECK_MODULES([RE2], [re2],
32 |   [
33 |     AC_MSG_NOTICE([re2 detected])
34 |     AC_DEFINE([HAVE_RE2], [1], [Define if you have the RE2 library])
35 |   ],
36 |   [AC_MSG_NOTICE([Could not find RE2 library. Please install libre2-dev or equivalent.])]
37 | )
38 | AC_LANG_POP()
39 | 
40 | ################################################################
41 | ## Check on two annoying warnings
42 | AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
43 | [[#pragma GCC diagnostic ignored "-Wredundant-decls"
44 |   int a=3;
45 | ]])],
46 |  [AC_DEFINE(HAVE_DIAGNOSTIC_REDUNDANT_DECLS,1,[define 1 if GCC supports -Wredundant-decls])]
47 | )
48 | 
49 | AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
50 | [[#pragma GCC diagnostic ignored "-Wcast-align"
51 |  int a=3;
52 |   ]])],
53 |  [AC_DEFINE(HAVE_DIAGNOSTIC_CAST_ALIGN,1,[define 1 if GCC supports -Wcast-align])]
54 | )
55 | AC_MSG_NOTICE([be20_api/be20_configure.m4 end])
56 | 
57 | # Take out duplicate flags
58 | RE2_CFLAGS=$(echo $RE2_CFLAGS | tr ' ' '\n' | sort -u | tr '\n' ' ')
59 | RE2_LIBS=$(echo $RE2_LIBS | tr ' ' '\n' | sort -u | tr '\n' ' ')
60 | 


--------------------------------------------------------------------------------
/path_printer.h:
--------------------------------------------------------------------------------
 1 | #ifndef PATH_PRINTER_H
 2 | #define PATH_PRINTER_H
 3 | 
 4 | #include <string>
 5 | #include <map>
 6 | 
 7 | #include "scanner_params.h"
 8 | #include "abstract_image_reader.h"
 9 | 
10 | // C++ does not allow forward references on nested classes.
11 | // "You can't do it, it's a hole in the C++ language. You'll have to un-nest at least one of the nested classes."
12 | // https://stackoverflow.com/questions/951234/forward-declaration-of-nested-types-classes-in-c
13 | 
14 | struct PrintOptions : public std::map<std::string, std::string> { // a string->string option map plus the typed settings below
15 |     static inline const std::string HTTP_EOL {"\r\n"};		// stdout is in binary form
16 |     static inline const size_t DEFAULT_BUFSIZE = 16384;     // initial value of process_path_bufsize
17 |     enum print_mode_t { MODE_NONE = 0, MODE_HEX, MODE_RAW, MODE_HTTP };
18 |     print_mode_t print_mode {MODE_NONE};
19 |     size_t process_path_bufsize {DEFAULT_BUFSIZE};
20 |     bool http_mode {false};
21 |     std::string get(std::string key, std::string default_) const; // presumably the value stored for key, or default_ when absent — TODO confirm in the .cpp
22 |     void add_rfc822_header(std::ostream &os, std::string line);
23 |     size_t content_length {0};
24 | };
25 | 
26 | 
27 | class path_printer {   // non-copyable; holds references to a scanner_set and to the output stream
28 |     class scanner_set &ss;
29 |     abstract_image_reader *reader {nullptr};
30 |     mutable std::stringstream os {};    // for temp creation
31 |     std::ostream &out;                  // for output
32 |     path_printer(const path_printer &) = delete;
33 |     path_printer &operator=(const path_printer &) = delete;
34 | 
35 | public:
36 |     // Exception type; scanner_params::recurse() catches it around process_sp() and re-throws it.
37 |     class path_printer_finished: public std::exception {
38 |     public:
39 |         const char *what() const noexcept override {   // noexcept replaces the deprecated throw()
40 |             return "path printer finished.";
41 |         }
42 |     };
43 | 
44 | 
45 |     path_printer(scanner_set &ss_, abstract_image_reader *reader_, std::ostream &out);
46 |     static inline const std::string PRINT {"PRINT"};
47 |     static inline const std::string CONTENT_LENGTH {"Content-Length"};
48 |     static inline const std::string DEFAULT_CONTENT_LENGTH {"4096"};
49 | 
50 |     static std::string lowerstr(const std::string str);
51 |     static std::string get_and_remove_token(std::string &path);
52 | 
53 |     void process_sp( const scanner_params &sp ) const; // called recursively by sp.recurse()
54 |     void display_path( std::string path, const PrintOptions &po) const; // entry point for process() command
55 | 
56 |     void process_path(std::string path) ;     // main entrance point to display a path, output to os
57 |     void process_interactive(std::istream &is) ;          // run an interactive server on is
58 |     void process_http(std::istream &is); // read an HTTP command from is and send result to os
59 | };
60 | 
61 | #endif
62 | 


--------------------------------------------------------------------------------
/scan_sha1_test.cpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  *
 3 |  * scan_sha1:
 4 |  * plug-in demonstration that shows how to write a simple plug-in scanner that calculates
 5 |  * the SHA1 of each sbuf. The hash is written to both the XML file and to the sha1 feature file.
 6 |  *
 7 |  * Don't use this in production systems! It has a histogram that isn't useful for most applications.
 8 |  */
 9 | 
10 | #include "config.h" // needed for hash_t
11 | 
12 | #include <iostream>
13 | #include <sys/types.h>
14 | 
15 | #include "dfxml_cpp/src/hash_t.h"
16 | #include "dfxml_cpp/src/dfxml_writer.h"
17 | #include "scan_sha1_test.h"
18 | #include "scanner_params.h"
19 | #include "scanner_set.h"
20 | /* Scanner entry point, dispatched on sp.phase: INIT registers the scanner, INIT2 caches its recorder, SCAN hashes the sbuf. */
21 | feature_recorder *sha1_recorder  = nullptr;   // set in PHASE_INIT2, used in PHASE_SCAN
22 | void scan_sha1_test(struct scanner_params& sp) {
23 |     if (sp.phase == scanner_params::PHASE_INIT) {
24 |         /* Create a scanner_info block to register this scanner */
25 |         sp.info->set_name("sha1_test");
26 |         sp.info->author = "Simson L. Garfinkel";
27 |         sp.info->description = "Compute the SHA1 of every sbuf.";
28 |         sp.info->url = "https://digitalcorpora.org/bulk_extractor";
29 |         sp.info->scanner_version = "1.0.0";
30 |         sp.info->pathPrefix = "SHA1";      // just use SHA1
31 |         sp.info->min_sbuf_size = 1;        // we can hash a single byte
32 | 
33 |         // specify the feature_records that the scanner wants.
34 |         // Note that the feature recorder does not need to be the same name as the scanner
35 |         // scanners may specify any number of feature recorders.
36 |         sp.info->feature_defs.push_back( feature_recorder_def("sha1_bufs") );
37 | 
38 |         // Note that histogram_defs is a set, so it's okay if this initialization routine is called twice,
39 |         // the histogram only gets inserted once.
40 |         histogram_def hd("test_histogram", "sha1_bufs", "^(.....)", "", "first5", histogram_def::flags_t(true, false));
41 | 
42 |         // sha1_bufs is already registered above, so only the histogram is added here.
43 |         sp.info->histogram_defs.push_back(hd);
44 |         return;
45 |     }
46 |     if (sp.phase == scanner_params::PHASE_INIT2) {
47 |         sha1_recorder = &sp.named_feature_recorder("sha1_bufs");
48 |     }
49 | 
50 |     if (sp.phase == scanner_params::PHASE_SCAN) {
51 |         auto hexdigest = sp.sbuf->hash();
52 | 
53 |         /* The recorder was looked up once in PHASE_INIT2 and cached in sha1_recorder. */
54 |         sha1_recorder->write(sp.sbuf->pos0, hexdigest, ""); // write the hash with no context
55 |         if (sp.ss->writer) {
56 |             sp.ss->writer->xmlout("hashdigest",hexdigest,"type='SHA1'",false);
57 |         }
58 |         return;
59 |     }
60 | }
61 | 


--------------------------------------------------------------------------------
/atomic_set.h:
--------------------------------------------------------------------------------
 1 | /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
 2 | 
 3 | /**
 4 |  * defines atomic_set, a std::set protected by a mutex.
 5 |  * This is a nice lightweight atomic set when not much else is needed.
 6 |  *
 7 |  * 2020-07-06 - slg - Upgraded to C++17.
 8 |  */
 9 | 
10 | #ifndef ATOMIC_SET_H
11 | #define ATOMIC_SET_H
12 | 
13 | #include <algorithm>
14 | #include <map>
15 | #include <mutex>
16 | #include <set>
17 | #include <unordered_set>
18 | #include <vector>
19 | 
20 | /*
21 |  * note: do not use const TYPE &s for signatures; it caused deadlocks.
22 |  */
23 | 
24 | template <class TYPE> class atomic_set {
25 |     // A std::set guarded by a mutex: every public method holds M for its whole duration,
26 |     // so individual calls are atomic with respect to each other.
27 |     // Mutex M protects myset.
28 |     // It is mutable to allow modification in const methods
29 |     mutable std::mutex M{};
30 |     std::set<TYPE> myset{};
31 | 
32 | public:
33 |     atomic_set() {}
34 |     /* The destructor takes the lock before clearing, exactly as clear() does. */
35 |     ~atomic_set() {
36 |         const std::lock_guard<std::mutex> lock(M);
37 |         myset.clear();
38 |     }
39 |     /* Remove every element. */
40 |     void clear() {
41 |         const std::lock_guard<std::mutex> lock(M);
42 |         myset.clear();
43 |     }
44 | 
45 |     /* Returns true if s is currently in the set. */
46 |     bool contains(const TYPE& s) const {
47 |         const std::lock_guard<std::mutex> lock(M);
48 |         return myset.find(s) != myset.end();
49 |     }
50 | 
51 |     /* Add s to the set; nothing changes if it is already present. */
52 |     void insert(const TYPE s) {
53 |         const std::lock_guard<std::mutex> lock(M);
54 |         myset.insert(s);
55 |     }
56 | 
57 |     /* Remove s from the set; nothing changes if it is not present. */
58 |     void erase(const TYPE s) {
59 |         const std::lock_guard<std::mutex> lock(M);
60 |         myset.erase(s);
61 |     }
62 | 
63 |     /* Returns true if s is in the set, false if it is not.
64 |      * After return, s is in the set.
65 |      * std::set::insert reports whether it inserted, so a single lookup does both jobs.
66 |      */
67 |     bool check_for_presence_and_insert(const TYPE s) {
68 |         const std::lock_guard<std::mutex> lock(M);
69 |         return !myset.insert(s).second;                // .second is false when s was already there
70 |     }
71 | 
72 |     /* Returns true if s is in the set, false if it is not.
73 |      * After return, s is not in the set.
74 |      * std::set::erase(key) returns the number of elements removed (0 or 1).
75 |      */
76 |     bool check_for_presence_and_erase(const TYPE s) {
77 |         const std::lock_guard<std::mutex> lock(M);
78 |         return myset.erase(s) > 0;
79 |     }
80 | 
81 |     /* returns the count, not the bytes */
82 |     size_t size() const {
83 |         const std::lock_guard<std::mutex> lock(M);
84 |         return myset.size();
85 |     }
86 |     /* like python .keys(): a snapshot copy of the elements, in the set's iteration order */
87 |     std::vector<TYPE> keys() const {
88 |         const std::lock_guard<std::mutex> lock(M);
89 |         return std::vector<TYPE>(myset.begin(), myset.end());
90 |     }
91 | };
92 | 
93 | #endif
94 | 


--------------------------------------------------------------------------------
/histogram_def.cpp:
--------------------------------------------------------------------------------
 1 | #include "histogram_def.h"
 2 | 
 3 | histogram_def::histogram_def(const std::string& name_,
 4 |                              const std::string& feature_, // which feature file to use
 5 |                              const std::string& pattern_, // which pattern to abstract
 6 |                              const std::string& require_, // text required on the line
 7 |                              const std::string& suffix_,  // which suffix to add to the feature file name for the histogram
 8 |                              const struct flags_t& flags_):
 9 |     name(name_), feature(feature_), pattern(pattern_), reg(pattern_), require(require_), suffix(suffix_), flags(flags_) { // every argument is copied into the member of the same name; reg is initialized from the same pattern_ string as pattern
10 | }
11 | 
12 | 
13 | 
14 | bool histogram_def::match(std::u32string u32key, std::string* displayString, const std::string &context) const { // u32key is taken by value because it is normalized in place below
15 |     if (flags.lowercase) {
16 |         u32key = utf32_lowercase(u32key);
17 |     }
18 | 
19 |     if (flags.numeric) {
20 |         u32key = utf32_extract_numeric(u32key);
21 |     }
22 | 
23 |     /* TODO: When we have the ability to do regular expressions in utf32, do that here.
24 |      * We don't have that, so do the rest in utf8
25 |      */
26 | 
27 |     /* Convert match string to u8key */
28 |     std::string u8key = convert_utf32_to_utf8(u32key);
29 | 
30 |     if (require.size() > 0 ){ // both require checks are skipped when no required text is configured
31 | 
32 |         /* If a string is required and it is not present, return */
33 |         if (flags.require_feature && u8key.find(require)  == std::string::npos) { // required text must appear in the normalized feature
34 |             return false;
35 |         }
36 | 
37 |         if (flags.require_context && context.find(require) == std::string::npos) { // required text must appear in the context
38 |             return false;
39 |         }
40 |     }
41 | 
42 |     /* Check for pattern (skipped when the pattern string is empty) */
43 |     if (pattern.size() > 0) {
44 |         std::smatch m{};
45 |         std::regex_search(u8key, m, this->reg);
46 |         if (m.empty() == true) { // match does not exist
47 |             return false;        // regex not found
48 |         }
49 |         u8key = m.str(); // keep only the text matched by the regex
50 |     }
51 | 
52 |     if (displayString) { *displayString = u8key; } // displayString may be nullptr, in which case only the bool result is produced
53 |     return true;
54 | }
55 | 
56 | bool histogram_def::match(std::string u32key, std::string* displayString, const std::string &context) const { // despite its name, u32key holds UTF-8 here: it is passed to convert_utf8_to_utf32
57 |     return match(convert_utf8_to_utf32(u32key), displayString, context); // delegate to the std::u32string overload
58 | }
59 | 
60 | std::ostream& operator<<(std::ostream& os, const histogram_def::flags_t& f) { // writes the names of the flags that are set to os and returns os
61 |     os << "<histogram_def::flags(";
62 |     if (f.lowercase) os << " lowercase";
63 |     if (f.numeric) os << " numeric";
64 |     if (f.require_feature) os << " require_feature";
65 |     if (f.require_context) os << " require_context";
66 |     os << ")> "; // close the '(' opened on the first output line so the rendering is balanced
67 |     return os;
68 | }
69 | 
70 | 
71 | std::ostream& operator<<(std::ostream& os, const histogram_def& hd) { // writes name, feature, pattern, require and suffix to os; flags are not included
72 |     os << "<histogram_def( name:" << hd.name << " feature:" << hd.feature << " pattern:" << hd.pattern
73 |        << " require:" << hd.require << " suffix:" << hd.suffix << ")>";
74 |     return os;
75 | }
76 | 


--------------------------------------------------------------------------------
/Makefile.defs:
--------------------------------------------------------------------------------
 1 | # including be20_api/Makefile.defs
 2 | BE20_API_SRC= \
 3 | 	$(BE20_API_DIR)/aftimer.h \
 4 | 	$(BE20_API_DIR)/abstract_image_reader.h \
 5 | 	$(BE20_API_DIR)/abstract_image_reader.cpp \
 6 | 	$(BE20_API_DIR)/atomic_map.h \
 7 | 	$(BE20_API_DIR)/atomic_set.h \
 8 | 	$(BE20_API_DIR)/atomic_unicode_histogram.cpp \
 9 | 	$(BE20_API_DIR)/atomic_unicode_histogram.h \
10 | 	$(BE20_API_DIR)/char_class.h \
11 | 	$(BE20_API_DIR)/feature_recorder.cpp \
12 | 	$(BE20_API_DIR)/feature_recorder.h \
13 | 	$(BE20_API_DIR)/feature_recorder_file.cpp \
14 | 	$(BE20_API_DIR)/feature_recorder_file.h \
15 | 	$(BE20_API_DIR)/feature_recorder_set.cpp \
16 | 	$(BE20_API_DIR)/feature_recorder_set.h \
17 | 	$(BE20_API_DIR)/feature_recorder_sql.cpp \
18 | 	$(BE20_API_DIR)/feature_recorder_sql.h \
19 | 	$(BE20_API_DIR)/formatter.h \
20 | 	$(BE20_API_DIR)/histogram_def.cpp \
21 | 	$(BE20_API_DIR)/histogram_def.h  \
22 | 	$(BE20_API_DIR)/machine_stats.h  \
23 | 	$(BE20_API_DIR)/net_ethernet.h \
24 | 	$(BE20_API_DIR)/packet_info.h \
25 | 	$(BE20_API_DIR)/path_printer.h \
26 | 	$(BE20_API_DIR)/path_printer.cpp \
27 | 	$(BE20_API_DIR)/pcap_fake.cpp \
28 | 	$(BE20_API_DIR)/pcap_fake.h \
29 | 	$(BE20_API_DIR)/pos0.cpp \
30 | 	$(BE20_API_DIR)/pos0.h \
31 | 	$(BE20_API_DIR)/regex_vector.cpp \
32 | 	$(BE20_API_DIR)/regex_vector.h \
33 | 	$(BE20_API_DIR)/sbuf.cpp \
34 | 	$(BE20_API_DIR)/sbuf.h \
35 | 	$(BE20_API_DIR)/sbuf_stream.h \
36 | 	$(BE20_API_DIR)/sbuf_stream.cpp \
37 | 	$(BE20_API_DIR)/scan_sha1_test.cpp \
38 | 	$(BE20_API_DIR)/scan_sha1_test.h \
39 | 	$(BE20_API_DIR)/scanner_config.cpp \
40 | 	$(BE20_API_DIR)/scanner_config.h \
41 | 	$(BE20_API_DIR)/scanner_params.cpp \
42 | 	$(BE20_API_DIR)/scanner_params.h \
43 | 	$(BE20_API_DIR)/scanner_set.cpp \
44 | 	$(BE20_API_DIR)/scanner_set.h \
45 |         $(BE20_API_DIR)/thread-pool/thread_pool.hpp \
46 |         $(BE20_API_DIR)/threadpool.h \
47 |         $(BE20_API_DIR)/threadpool.cpp \
48 | 	$(BE20_API_DIR)/unicode_escape.cpp \
49 | 	$(BE20_API_DIR)/unicode_escape.h \
50 | 	$(BE20_API_DIR)/utfcpp/source/utf8.h \
51 | 	$(BE20_API_DIR)/utfcpp/source/utf8/checked.h \
52 | 	$(BE20_API_DIR)/utfcpp/source/utf8/core.h \
53 | 	$(BE20_API_DIR)/utfcpp/source/utf8/cpp11.h \
54 | 	$(BE20_API_DIR)/utfcpp/source/utf8/cpp17.h \
55 | 	$(BE20_API_DIR)/utfcpp/source/utf8/unchecked.h \
56 | 	$(BE20_API_DIR)/utils.cpp \
57 | 	$(BE20_API_DIR)/utils.h \
58 | 	$(BE20_API_DIR)/word_and_context_list.cpp \
59 | 	$(BE20_API_DIR)/word_and_context_list.h \
60 |         $(BE20_API_DIR)/dfxml_cpp/src/dfxml_writer.h \
61 |         $(BE20_API_DIR)/dfxml_cpp/src/hash_t.h \
62 |         $(BE20_API_DIR)/dfxml_cpp/src/cpuid.h
63 | 
64 | BE20_API_EXTRA_DIST=\
65 |         $(BE20_API_DIR)/m4/slg_gcc_all_warnings.m4 \
66 |         $(BE20_API_DIR)/Makefile.defs \
67 |         $(BE20_API_DIR)/Makefile.am \
68 |         $(BE20_API_DIR)/dfxml_cpp/src/Makefile.defs \
69 |         $(BE20_API_DIR)/dfxml_cpp/src/Makefile.am \
70 |         $(BE20_API_DIR)/README.md \
71 | 	$(BE20_API_DIR)/utfcpp/LICENSE \
72 | 	$(BE20_API_DIR)/utfcpp/README.md
73 | 


--------------------------------------------------------------------------------
/.github/workflows/build-ubuntu-macos.yml:
--------------------------------------------------------------------------------
 1 | name: BE20_API CI Ubuntu and Mac
 2 | on:
 3 |   pull_request:
 4 |     branches: [ main ]
 5 |   push:
 6 |     branches: [ main ]
 7 | 
 8 | 
 9 | jobs:
10 |   build:
11 |     runs-on: ${{ matrix.os }}
12 |     strategy:
13 |       matrix:
14 |         os: ['macos-latest','ubuntu-latest']
15 | 
16 |     steps:
17 |       - name: Checkout
18 |         uses: actions/checkout@v4
19 |         with:
20 |           submodules: recursive
21 | 
22 |       # ----------------------------
23 |       # MacOS
24 |       # ----------------------------
25 |       - name: Install MacOS dependencies and run bootstrap
26 |         if: startsWith( matrix.os, 'macos')
27 |         run: |
28 |           brew install autoconf automake libtool abseil pkg-config pcre re2
29 |           autoreconf -fi
30 | 
31 |       # ----------------------------
32 |       # Ubuntu
33 |       # ----------------------------
34 |       - name: Install Ubuntu dependencies and run bootstrap
35 |         if: startsWith( matrix.os, 'ubuntu')
36 |         run: |
37 |           sudo apt update -y
38 |           sudo apt install -y autoconf automake g++ lcov libtool libssl-dev libabsl-dev libre2-dev make pkg-config zlib1g-dev
39 |           autoreconf -fi
40 | 
41 |       # ----------------------------
42 |       # Configure for each OS
43 |       # ----------------------------
44 |       - name: configure for ubuntu with codecov
45 |         if: startsWith( matrix.os, 'ubuntu')
46 |         run: |
47 |           ./configure --disable-opt --enable-address-sanitizer \
48 |             CFLAGS='-g -O0 -fprofile-arcs -ftest-coverage' \
49 |             CXXFLAGS='-g -O0 -fprofile-arcs -ftest-coverage' \
50 |             LIBS='-lgcov'
51 | 
52 |       - name: configure for macOS
53 |         if: startsWith( matrix.os, 'macos')
54 |         run: |
55 |           ./configure --enable-maintainer-mode --enable-address-sanitizer --disable-opt --enable-silent-rules
56 | 
57 |       # ----------------------------
58 |       # Common build + test - not windows
59 |       # ----------------------------
60 |       - uses: ammaraskar/gcc-problem-matcher@master
61 |         name: GCC Problem Matcher # registered before the build so compiler diagnostics from the next step are annotated
62 | 
63 |       - name: make check
64 |         run: |
65 |           make test_be20_api
66 |           ./test_be20_api
67 |           make check  || (for fn in test*.log ; do echo ""; echo $fn ; cat $fn ; done; exit 1)
68 | 
69 |       - name: list files
70 |         run: |
71 |           find . -ls
72 | 
73 |       - name: Generate coverage report for ubuntu
74 |         if: startsWith( matrix.os, 'ubuntu')
75 |         run: |
76 |           lcov --capture --directory . --output-file coverage.info
77 |           #lcov --remove linux-coverage.info '/usr/*' --output-file linux-coverage.info
78 |           lcov --list coverage.info
79 | 
80 |       - name: Upload coverage to Codecov
81 |         if: startsWith( matrix.os, 'ubuntu')
82 |         uses: codecov/codecov-action@v5
83 |         with:
84 |           token: ${{ secrets.CODECOV_TOKEN }}
85 |           fail_ci_if_error: false
86 |           files: coverage.info
87 |           flags: unittests
88 |           name: be20_api-codecov # custom name for this upload; names this project rather than sleuthkit
89 | 


--------------------------------------------------------------------------------
/pcap_fake.h:
--------------------------------------------------------------------------------
 1 | /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
 2 | /*
 3 |  * pcap_fake.h
 4 |  * A fake libpcap interface that can only read files without a filter.
 5 |  */
 6 | #pragma once // this header defines structs, so including it twice in one translation unit must be harmless
 7 | #include <cstdint>
 8 | #include <cstdio>
 9 | #include <sys/cdefs.h>
10 | #include <sys/time.h>
11 | 
12 | __BEGIN_DECLS
13 | 
14 | /*
15 |  * Version number of the current version of the pcap file format.
16 |  *
17 |  * NOTE: this is *NOT* the version number of the libpcap library.
18 |  * To fetch the version information for the version of libpcap
19 |  * you're using, use pcap_lib_version().
20 |  */
21 | #define PCAP_VERSION_MAJOR 2
22 | #define PCAP_VERSION_MINOR 4
23 | #define PCAP_ERRBUF_SIZE 256
24 | 
25 | struct pcap_file_header { // header at the start of a pcap file; byte values in the comments are as they appear in the file
26 |     uint32_t magic;         // d4 c3 b2 a1
27 |     uint16_t version_major; // 02 00
28 |     uint16_t version_minor; // 04 00
29 |     int32_t thiszone;       /* gmt to local correction - 00 00 00 00*/
30 |     uint32_t sigfigs;       /* accuracy of timestamps */
31 |     uint32_t snaplen;       /* max length saved portion of each pkt */
32 |     uint32_t linktype;      /* data link type (LINKTYPE_*) */
33 | } __attribute__((packed)); // packed: no padding is inserted between the fields
34 | struct pcap_pkthdr { // per-packet header passed to the pcap_handler callback
35 |     struct timeval ts; /* time stamp; native struct timeval. NOTE(review): its size is platform-dependent; confirm this struct is never read directly from a file */
36 |     uint32_t caplen;   /* length of portion present */
37 |     uint32_t len;      /* length this packet (off wire) */
38 | } __attribute__((packed)); // packed: no padding is inserted between the fields
39 | 
40 | /* What we need after opening the file to process each next packet */
41 | typedef struct pcap pcap_t;
42 | 
43 | /*
44 |  * Taken from pcap-int.h
45 |  */
46 | // typedef int (*setfilter_op_t)(pcap_t *, struct bpf_program *);
47 | typedef void (*pcap_handler)(uint8_t*, const struct pcap_pkthdr*, const uint8_t*);
48 | 
49 | struct bpf_program {
50 |     int valid; // set true if filter is valid
51 | };
52 | 
53 | char* pcap_lookupdev(char*);                               // not implemented
54 | pcap_t* pcap_open_live(const char*, int, int, int, char*); // not implemented
55 | pcap_t* pcap_open_offline(const char*, char*);             // open the file; set f
56 | pcap_t* pcap_fopen_offline(FILE* fp, char* errbuf);        // like pcap_open_offline, but takes an already-open FILE*
57 | void pcap_close(pcap_t*);                            // close the file
58 | int pcap_loop(pcap_t*, int, pcap_handler, uint8_t*); // read the file and invoke the pcap_handler callback on each packet
59 | int pcap_datalink(pcap_t*);                          // noop
60 | int pcap_setfilter(pcap_t*, struct bpf_program*);    // noop
61 | int pcap_compile(pcap_t*, struct bpf_program*, const char*, int, uint32_t); // generate error if filter provided
62 | char* pcap_geterr(pcap_t*);
63 | /*
64 |  * These are the types that are the same on all platforms, and that
65 |  * have been defined by <net/bpf.h> for ages.
66 |  */
67 | #define DLT_NULL 0    /* BSD loopback encapsulation */
68 | #define DLT_EN10MB 1  /* Ethernet (10Mb) */
69 | #define DLT_EN3MB 2   /* Experimental Ethernet (3Mb) */
70 | #define DLT_AX25 3    /* Amateur Radio AX.25 */
71 | #define DLT_PRONET 4  /* Proteon ProNET Token Ring */
72 | #define DLT_CHAOS 5   /* Chaos */
73 | #define DLT_IEEE802 6 /* 802.5 Token Ring */
74 | #define DLT_ARCNET 7  /* ARCNET, with BSD-style header */
75 | #define DLT_SLIP 8    /* Serial Line IP */
76 | #define DLT_PPP 9     /* Point-to-point Protocol */
77 | #define DLT_FDDI 10   /* FDDI */
78 | #define DLT_RAW 101   /* just packets */
79 | 
80 | __END_DECLS
81 | 


--------------------------------------------------------------------------------
/regex_vector.h:
--------------------------------------------------------------------------------
 1 | /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
 2 | /*
 3 |  * regex_vector.h:
 4 |  *
 5 |  * Now this covers Google's RE2 library.
 6 |  * Note:
 7 |  *  1 - RE2 and the objects are not move insertable, so we need to manually manage creating and deleting them.
 8 |  *  2 - RE2's PartialMatch function won't return the position of a match unless it is wrapped in a group " () ",
 9 |         so we do that.
10 |  */
11 | 
12 | #ifndef REGEX_VECTOR_H
13 | #define REGEX_VECTOR_H
14 | 
15 | #include <cassert>
16 | #include <cstring>
17 | #include <fstream>
18 | #include <iostream>
19 | #include <set>
20 | #include <string>
21 | #include <vector>
22 | #include <cstdlib>
23 | 
24 | #include "config.h"
25 | 
26 | #ifdef HAVE_RE2
27 | #include <re2/re2.h>            // it's always here.
28 | #endif
29 | 
30 | /**
31 |  * The regex_vector is a vector of character regexes with a few additional convenience functions.
32 |  * We might want to change this to handle ASCII, UTF-16 and UTF-8 characters simultaneously.
33 |  * Only RE2 is supported because it is the only regular expression library that doesn't die on large segments.
34 |  * See: https://swtch.com/~rsc/regexp/regexp3.html#caveats
35 |  */
36 | 
37 | class regex_vector {
38 |     std::vector<std::string> regex_strings; // the original regex strings
39 | #ifdef HAVE_RE2
40 |     std::vector<RE2 *> re2_regex_comps;     // the compiled regular expressions (raw pointers; see clear())
41 | #endif
42 |     regex_vector(const regex_vector&) = delete;            // not copyable
43 |     regex_vector& operator=(const regex_vector&) = delete; // not copy-assignable
44 |     static const std::string RE_ENGINE; // name of the environment variable consulted by engine_enabled()
45 | 
46 | public:
47 |     static bool engine_enabled(const std::string engine) {
48 |         /** An engine is enabled when the environment variable named by RE_ENGINE is unset, or when its value equals engine. */
49 |         const char* selected = std::getenv(RE_ENGINE.c_str()); // read once so the null check and the comparison see the same value
50 |         return selected == nullptr || selected == engine;
51 |     }
52 |     regex_vector() : regex_strings()
53 | #ifdef HAVE_RE2
54 |                    , re2_regex_comps()
55 | #endif
56 |     {};
57 |     ~regex_vector();
58 | 
59 |     // is this a regular expression with meta characters?
60 |     static bool has_metachars(const std::string& str);
61 |     const std::string regex_engine(); // which engine is in use
62 | 
63 |     /* Add a string. Declared [[noreturn]] when built without RE2. */
64 | #ifndef HAVE_RE2
65 | [[noreturn]]
66 | #endif
67 |     void push_back(const std::string& val);
68 |     // Empty the vectors. For the compiled, be sure to delete them
69 |     void clear();
70 |     size_t size() const;        // the number of regular expressions in the vector
71 | 
72 |     /**
73 |      * Read regular expressions from a file: returns 0 if successful, -1 if failure.
74 |      * @param fname - the file to read.
75 |      */
76 |     int readfile(const std::string& fname); // read a file of regexes, one per line
77 | 
78 |     /** Return true if any of the regexes match.
79 |      * search_all() is threadsafe.
80 |      * @param probe  - the string we are searching.
81 |      * *found - set to the found string if something is found.
82 |      */
83 | 
84 |     bool search_all(const std::string& probe,
85 |                     std::string* found,
86 |                     size_t* offset = nullptr,
87 |                     size_t* len = nullptr) const;
88 |     void dump(std::ostream& os) const;
89 | };
90 | 
91 | std::ostream& operator<<(std::ostream& os, const class regex_vector& rv);
92 | 
93 | #endif
94 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | ## Copyright/Non-Copyright Statements
 2 | 
 3 | **bulk_extractor** was originally developed by Simson Garfinkel while at
 4 | the Naval Postgraduate School. As a work of the US Government this
 5 | work is not subject to copyright law.
 6 | 
 7 | Simson Garfinkel left the Naval Postgraduate School in January 2015
 8 | and continued to work on **bulk_extractor** in his personal
 9 | capacity. Those modifications are covered under the MIT license. Other
10 | components are licensed as noted.
11 | 
12 | ## MIT License.
13 | 
14 | Copyright (c) 2020, Simson L. Garfinkel {{ organization }}
15 | 
16 | Permission is hereby granted, free of charge, to any person obtaining a copy
17 | of this software and associated documentation files (the "Software"), to deal
18 | in the Software without restriction, including without limitation the rights
19 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
20 | copies of the Software, and to permit persons to whom the Software is
21 | furnished to do so, subject to the following conditions:
22 | 
23 | The above copyright notice and this permission notice shall be included in all
24 | copies or substantial portions of the Software.
25 | 
26 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
27 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
28 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
29 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
30 | DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
31 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
32 | OR OTHER DEALINGS IN THE SOFTWARE.
33 | 
34 | ## CC0 Original Summary
35 | 
36 | Except as otherwise noted, bulk_extractor source code files are public domain
37 | software.
38 | 
39 | The software provided here is released by the Naval Postgraduate
40 | School, an agency of the U.S. Department of Navy.  The software bears
41 | no warranty, either expressed or implied. NPS does not assume legal
42 | liability nor responsibility for a User's use of the software or the
43 | results of such use.
44 | 
45 | Please note that within the United States, copyright protection, under
46 | Section 105 of the United States Code, Title 17, is not available for
47 | any work of the United States Government and/or for any works created
48 | by United States Government employees.
49 | 
50 | However, because some bulk_extractor source modules (e.g. pyxpress.c)
51 | are covered under the GNU Public License, the compiled bulk_extractor
52 | executable is covered under the GPL copyright. This means that binary
53 | distributions of bulk_extractor must include the full source code (or
54 | have the source code be made easily available.)
55 | 
56 | ## Other materials
57 | 
58 | bulk_extractor includes the following materials:
59 | 
60 | * uses some SleuthKit 3 include files. These are present
61 | in the directory src/tsk3.
62 | 
63 | * src/tsk3/ includes SleuthKit 3 include files that are part of
64 | SleuthKit 3. These files are  Copyright (C) 2010 Brian Carrier and covered under
65 | the Common Public License 1.0
66 | 
67 | * src/be20_api/utf8.h is Copyright 2006 Nemanja Trifunovic
68 | 
69 | * src/base64_forensic.cpp is Copyright (C) 1996-1999 by Internet Software Consortium, with
70 |  portions Copyright (c) 1995 by International Business Machines, Inc.
71 | 
72 | * src/scan_ascii85.cpp is  Copyright (C) 2011 Remy Oukaour
73 | 
74 | * src/scan_json.cpp is Copyright (c) 2005 JSON.org
75 | 
76 | * src/pyxpress.c is Copyright 2008 (c) Matthieu Suiche. <msuiche[at]gmail.com>
77 | 


--------------------------------------------------------------------------------
/net_ethernet.h:
--------------------------------------------------------------------------------
 1 | /* Copyright (C) 1997, 1999, 2001, 2008 Free Software Foundation, Inc.
 2 |    This file is part of the GNU C Library.
 3 | 
 4 |    The GNU C Library is free software; you can redistribute it and/or
 5 |    modify it under the terms of the GNU Lesser General Public
 6 |    License as published by the Free Software Foundation; either
 7 |    version 2.1 of the License, or (at your option) any later version.
 8 | 
 9 |    The GNU C Library is distributed in the hope that it will be useful,
10 |    but WITHOUT ANY WARRANTY; without even the implied warranty of
11 |    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12 |    Lesser General Public License for more details.
13 | 
14 |    You should have received a copy of the GNU Lesser General Public
15 |    License along with the GNU C Library; if not, write to the Free
16 |    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
17 |    02111-1307 USA.  */
18 | 
19 | /* Based on the FreeBSD version of this file. Curiously, that file
20 |    lacks a copyright in the header. */
21 | 
22 | #ifndef __NET_ETHERNET_H
23 | #define __NET_ETHERNET_H 1
24 | 
25 | #include <sys/cdefs.h>
26 | #include <sys/types.h>
27 | //#include <linux/if_ether.h>     /* IEEE 802.3 Ethernet constants */
28 | 
29 | __BEGIN_DECLS
30 | 
31 | /* This is a name for the 48 bit ethernet address available on many
32 |    systems.  NOTE(review): ETH_ALEN is not defined in this header (the <linux/if_ether.h> include above is commented out); confirm it is defined before this header is included.  */
33 | struct ether_addr {
34 |     u_int8_t ether_addr_octet[ETH_ALEN];
35 | } __attribute__((__packed__));
36 | 
37 | /* 10Mb/s ethernet header: destination address, source address, then the 16-bit type field; packed so no padding is inserted */
38 | struct ether_header {
39 |     u_int8_t ether_dhost[ETH_ALEN]; /* destination eth addr	*/
40 |     u_int8_t ether_shost[ETH_ALEN]; /* source ether addr	*/
41 |     u_int16_t ether_type;           /* packet type ID field (see the ETHERTYPE_* values below) */
42 | } __attribute__((__packed__));
43 | 
44 | /* Ethernet protocol ID's */
45 | #define ETHERTYPE_PUP 0x0200      /* Xerox PUP */
46 | #define ETHERTYPE_SPRITE 0x0500   /* Sprite */
47 | #define ETHERTYPE_IP 0x0800       /* IP */
48 | #define ETHERTYPE_ARP 0x0806      /* Address resolution */
49 | #define ETHERTYPE_REVARP 0x8035   /* Reverse ARP */
50 | #define ETHERTYPE_AT 0x809B       /* AppleTalk protocol */
51 | #define ETHERTYPE_AARP 0x80F3     /* AppleTalk ARP */
52 | #define ETHERTYPE_VLAN 0x8100     /* IEEE 802.1Q VLAN tagging */
53 | #define ETHERTYPE_IPX 0x8137      /* IPX */
54 | #define ETHERTYPE_IPV6 0x86dd     /* IP protocol version 6 */
55 | #define ETHERTYPE_LOOPBACK 0x9000 /* used to test interfaces */
56 | 
57 | #define ETHER_ADDR_LEN ETH_ALEN                       /* size of ethernet addr */
58 | #define ETHER_TYPE_LEN 2                              /* bytes in type field */
59 | #define ETHER_CRC_LEN 4                               /* bytes in CRC field */
60 | #define ETHER_HDR_LEN ETH_HLEN                        /* total octets in header */
61 | #define ETHER_MIN_LEN (ETH_ZLEN + ETHER_CRC_LEN)      /* min packet length */
62 | #define ETHER_MAX_LEN (ETH_FRAME_LEN + ETHER_CRC_LEN) /* max packet length */
63 | 
64 | /* make sure ethernet length is valid */
65 | #define ETHER_IS_VALID_LEN(foo) ((foo) >= ETHER_MIN_LEN && (foo) <= ETHER_MAX_LEN)
66 | 
67 | /*
68 |  * The ETHERTYPE_NTRAILER packet types starting at ETHERTYPE_TRAIL have
69 |  * (type-ETHERTYPE_TRAIL)*512 bytes of data followed
70 |  * by an ETHER type (as given above) and then the (variable-length) header.
71 |  */
72 | #define ETHERTYPE_TRAIL 0x1000 /* Trailer packet */
73 | #define ETHERTYPE_NTRAILER 16
74 | 
75 | #define ETHERMTU ETH_DATA_LEN
76 | #define ETHERMIN (ETHER_MIN_LEN - ETHER_HDR_LEN - ETHER_CRC_LEN)
77 | 
78 | __END_DECLS
79 | 
80 | #endif /* net/ethernet.h */
81 | 


--------------------------------------------------------------------------------
/feature_recorder_file.h:
--------------------------------------------------------------------------------
 1 | #ifndef FEATURE_RECORDER_FILE_H
 2 | #define FEATURE_RECORDER_FILE_H
 3 | 
 4 | #include "config.h"
 5 | 
 6 | #include <cassert>
 7 | #include <cinttypes>
 8 | 
 9 | #include <atomic>
10 | #include <fstream>
11 | #include <iostream>
12 | #include <map>
13 | #include <regex>
14 | #include <set>
15 | #include <string>
16 | #include <thread>
17 | #include <mutex>
18 | 
19 | #include "feature_recorder.h"
20 | #include "pos0.h"
21 | #include "sbuf.h"
22 | 
23 | class feature_recorder_file : public feature_recorder {
24 | public:
25 |     inline static const std::string feature_file_header   {"# Feature-File-Version: 1.1\n"};
26 |     inline static const std::string histogram_file_header {"# Histogram-File-Version: 1.1\n"};
27 |     inline static const std::string bulk_extractor_version_header {
28 |         "# " PACKAGE_NAME "-Version: " PACKAGE_VERSION "\n"}; // PACKAGE_NAME and PACKAGE_VERSION are macros, presumably from config.h
29 | 
30 |     static std::string unquote_string(const std::string& s);
31 | 
32 |     feature_recorder_file(class feature_recorder_set& fs, const feature_recorder_def def);
33 |     virtual ~feature_recorder_file();
34 |     virtual void flush() override;
35 |     static bool extract_feature_context(const std::string& line, std::string &feature, std::string &context); // extract feature and context, return true if successful
36 |     static bool isodigit(uint8_t ch){ // true if ch is an octal digit, '0' through '7'
37 |         return ch>='0' && ch<='7';
38 |     }
39 | 
40 | private:
41 |     std::mutex Mios{};  // mutex for IOS
42 |     std::fstream ios{}; // where features are written
43 | 
44 |     void banner_stamp(std::ostream& os, const std::string& header) const; // stamp banner, and header
45 | 
46 |     //static const std::string histogram_file_header;
47 |     //static const std::string feature_file_header;
48 |     //static const std::string bulk_extractor_version_header;
49 | 
50 |     virtual void shutdown() override;
51 | 
52 | public:
53 |     /* these are not threadsafe and should only be called in startup */
54 |     // void set_carve_ignore_encoding( const std::string &encoding ){ MAINTHREAD();ignore_encoding = encoding;}
55 |     /* End non-threadsafe */
56 | 
57 |     // add i to file_number and return the result
58 |     // fetch_add() returns the original number
59 | 
60 |     /* where stopped items (on stop_list or context_stop_list) get recorded:
61 |      * Cannot be made inline because it accesses fs.
62 |      */
63 |     virtual void write0(const std::string& str) override;
64 |     virtual void write0(const pos0_t& pos0, const std::string& feature, const std::string& context) override;
65 | 
66 |     /* histogram support.
67 |     * The file based feature recorder can store the histogram incrementally in memory or it can make it at the end in a second pass.
68 |     */
69 |     static const inline int MAX_HISTOGRAM_FILES = 10; // don't make more than 10 files in low-memory conditions
70 | 
71 |     // the histograms are made in memory with the AtomicUnicodeHistogram object.
72 |     // Each one contains the histogram_def.
73 |     std::vector<std::unique_ptr<AtomicUnicodeHistogram>> histograms{};
74 | 
75 |     virtual size_t histogram_count() override;                 // how many histograms it has
76 |     virtual void histogram_add(const struct histogram_def& def) override;   // add a new histogram
77 | 
78 |     // Writing histograms and adding features to them
79 | 
80 |     virtual void histogram_write_from_memory(AtomicUnicodeHistogram& h); // actually write this histogram (presumably from the in-memory counts; confirm in the .cpp)
81 |     virtual void histogram_write_from_file(AtomicUnicodeHistogram& h); // actually write this histogram (presumably by re-reading the feature file; confirm in the .cpp)
82 |     virtual void histogram_write(AtomicUnicodeHistogram& h); // write this histogram
83 |     virtual void histograms_incremental_add_feature_context(const std::string& feature, const std::string& context) override;
84 |     virtual bool histograms_write_largest() override;
85 |     virtual void histograms_write_all() override;
86 | };
87 | 
88 | /** @} */
89 | 
90 | #endif
91 | 


--------------------------------------------------------------------------------
/TODO.md:
--------------------------------------------------------------------------------
 1 | # Log of work on
 2 | ## 2021-04-23
 3 | - Got `TEST_CASE("run", "[scanner_set]")` mostly working.
 4 | - After it runs, the output directory looks like this:
 5 | ```
 6 | (base) simsong@nimi be13_api % ls -l /var/folders/09/8v4pdnys627fqqh3vjbvsnq40000gn/T/ISmG9qlC/
 7 | total 4
 8 | -rw-r--r--  1 simsong  staff    0 Apr 23 21:20 alerts.txt
 9 | -rw-r--r--  1 simsong  staff  172 Apr 23 21:20 sha1_bufs.txt
10 | -rw-------  1 simsong  staff    0 Apr 23 21:20 sha1_bufs_Az??
11 | (base) simsong@nimi be13_api % cat  /var/folders/09/8v4pdnys627fqqh3vjbvsnq40000gn/T/ISmG9qlC/sha1_bufs.txt
12 | # BANNER FILE NOT PROVIDED (-b option)
13 | # BE13_API-Version: 1.0.0
14 | # Feature-Recorder: sha1_bufs
15 | # Feature-File-Version: 1.1
16 | hello-0	d3486ae9136e7856bc42212385ea797094475802
17 | ```
18 | 
19 | - [ ] Histogram is created with the wrong filename
20 | - [ ] Histogram file is empty
21 | 
22 | ## 2021-04-24
23 | Current problems are the UTF-8 histograms that are extracted with
24 | regular expressions. Ideally we should do the regular expressions in
25 | Unicode, not in UTF-8
26 | 
27 | Another option is to do everything as UTF-32 regex and convert the
28 | UTF-32 to UTF-8 when rendering into the files.
29 | - https://stackoverflow.com/questions/37989081/how-to-use-unicode-range-in-c-regex
30 | 
31 | Another option is to add an ICU dependency:
32 | - https://unicode-org.github.io/icu/userguide/strings/regexp.html
33 | 
34 | See also:
35 | - http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2015/p0169r0.html
36 | 
37 | Oh, Boost has a unicode regular expressions too:
38 | - https://www.boost.org/doc/libs/1_46_1/libs/regex/doc/html/boost_regex/ref/non_std_strings/icu/unicode_algo.html
39 | 
40 | But make a decision. What we currently have is a mess.
41 | 
42 | ## 2021-04-25
43 | Found an error in which a value from the stack was being passed by
44 | reference, the reference was being retained, and then it was going
45 | bad.
46 | - [ ] Review every pass by reference and change to pass by value when
47 |   possible. Note that pass by value may be more efficient than pass by
48 |   reference with modern compilers.
49 | - [x] Looks like the Atomic Unicode Histogram is using an ASCII/UTF-8
50 |   regular expression on a UTF32 value, which isn't working. Perhaps
51 |   I'm wrong above, and all regular expressions should be done in UTF-8
52 |   and not UTF-32?  EDIT: Decided not to do this.
53 | - [x] Perhaps move to SRELL as the regex package?
54 |   http://www.akenotsuki.com/misc/srell/en/.  EDIT: Decided not to do this.
55 | 
56 | 
57 | ## 2021-04-27
58 | All errors in histogram production seem to be fixed!
59 | - [ ] Need to decide if the first BE2.0 program will be bulk_extractor
60 |   or tcpflow.  Since tcpflow works, let's go with bulk_extractor.
61 | 
62 | 
63 | # Outstanding things to do
64 | 
65 | - [ ] move histograms out of feature_recorder and feature_recorder_set.
66 | - [ ] Instead, histograms are made by the scanner set after the scanners have run, in the shutdown mode.
67 |     - The feature recorders just need a way of reading the contents.
68 |     - The feature_recorder can have any number of readers. It's just an open iostream.
69 | - [ ] Make histograms in-memory and throw them out if you run out of memory, going into low-memory mode for the second pass.
70 | - [ ] Merge of all outstanding histograms can be done single-threaded
71 |       or multi-threaded.
72 | 
73 | ## 2021-05-08
74 | - [ ] Get scanner commands moved from scanner_set to scanner_config.
75 | - [ ] Implement processing of scanner commands to scanner set.
76 | - [ ] Implement tests
77 | 
78 | ## 2021-06-12
79 | - [ ] sbuf_stream and sbuf_private should both be factored into sbuf.
80 | 
81 | - [ ] FrequencyReportHistogram should use unique_ptr<> rather than
82 |   actually storing the report elements on the vector.
83 | 
84 | ## 2021-11-16
85 | - [ ] Don't need get_scanner_by_name(). I just need a list of the
86 |   enabled scanners and a map of scanner names to scanner info
87 | 


--------------------------------------------------------------------------------
/regex_vector.cpp:
--------------------------------------------------------------------------------
  1 | /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
  2 | 
  3 | #include "regex_vector.h"
  4 | 
  5 | /* rewritten to use C++11's regex */
  6 | const std::string regex_vector::RE_ENGINE {"RE_ENGINE"};
  7 | const std::string regex_vector::regex_engine()
  8 | {
  9 | #ifdef HAVE_RE2
 10 |     if (engine_enabled("RE2")) {
 11 |         return std::string("RE2");
 12 |     }
 13 | #endif
 14 |     return std::string("STD::REGEX");
 15 | }
 16 | 
/* Destructor: delegates to clear(), which empties the pattern list and
 * (when built with HAVE_RE2) deletes the compiled RE2 objects.
 */
regex_vector::~regex_vector()
{
    clear();
}
 21 | 
 22 | 
 23 | /* Only certain characters are assumed to be a regular expression. These characters are
 24 |  * coincidently never in email addresses.
 25 |  */
 26 | bool regex_vector::has_metachars(const std::string& str) {
 27 |     for (auto& it : str) {
 28 |         switch (it) {
 29 |         case '*':
 30 |         case '[':
 31 |         case '(':
 32 |         case '?': return true;
 33 |         }
 34 |     }
 35 |     return false;
 36 | }
 37 | 
 38 | #ifndef HAVE_RE2
 39 | [[noreturn]]
 40 | #endif
 41 | void regex_vector::push_back(const std::string& val) {
 42 | #ifdef HAVE_RE2
 43 |     RE2::Options options;
 44 |     options.set_case_sensitive(false);
 45 |     if (engine_enabled("RE2")){
 46 |         regex_strings.push_back(val);
 47 |         RE2 *re = new RE2(std::string("(") + val + std::string(")"), options);
 48 |         if (!re->ok()){
 49 |             std::cerr << "RE2 compilation failed error: " << re->error() << " compiling: " << val << std::endl;
 50 |             throw std::runtime_error(std::string("RE2 compilation failed"));
 51 |         }
 52 |         re2_regex_comps.push_back( re );
 53 |         return;
 54 |     }
 55 | #else
 56 |     throw std::runtime_error(std::string("RE2 not compiled in"));
 57 | #endif
 58 | }
 59 | 
 60 | void regex_vector::clear() {
 61 |     regex_strings.clear();
 62 | #ifdef HAVE_RE2
 63 |     for (RE2 *re: re2_regex_comps) {
 64 |         delete re;
 65 |     }
 66 |     re2_regex_comps.clear();
 67 | #endif
 68 | }
 69 | 
/* Number of compiled patterns. Without HAVE_RE2 nothing can be compiled,
 * so the result is always 0 in that build.
 */
size_t regex_vector::size() const {
#ifdef HAVE_RE2
    return re2_regex_comps.size();
#else
    return 0;
#endif
}
 77 | 
 78 | /**
 79 |  * perform a search for a single hit. If there is a group and something is found,
 80 |  * set *found to be what was found, *offset to be the starting offset, and *len to be
 81 |  * the length. Note that this only handles a single group.
 82 |  */
 83 | bool regex_vector::search_all(const std::string& probe, std::string* found, size_t* offset, size_t* len) const {
 84 | #ifdef HAVE_RE2
 85 |     for (RE2 *re: re2_regex_comps) {
 86 |         re2::StringPiece sp;
 87 |         if (RE2::PartialMatch( probe, *re, &sp) ){
 88 |             if (found)  *found  = std::string(sp.data(), sp.size());
 89 |             if (offset) *offset = sp.data() - probe.data(); // this is so gross
 90 |             if (len)    *len    = sp.length();
 91 |             return true;
 92 |         }
 93 |     }
 94 | #endif
 95 |     return false;
 96 | }
 97 | 
 98 | int regex_vector::readfile(const std::string& fname) {
 99 |     std::ifstream f(fname.c_str());
100 |     if (f.is_open()) {
101 |         while (!f.eof()) {
102 |             std::string line;
103 |             getline(f, line);
104 | 
105 |             /* remove the last character while it is a \n or \r */
106 |             if (line.size() > 0 && (((*line.end()) == '\r') || (*line.end()) == '\n')) { line.erase(line.end()); }
107 | 
108 |             /* Create a regular expression and add it */
109 |             push_back(line);
110 |         }
111 |         f.close();
112 |         return 0;
113 |     }
114 |     return -1;
115 | }
116 | 
117 | void regex_vector::dump(std::ostream& os) const {
118 |     for (auto const& it : regex_strings) {
119 |         os << it << "\n";
120 |     }
121 | }
122 | 
/* Stream insertion for regex_vector: forwards to rv.dump(os) and returns os for chaining. */
std::ostream& operator<<(std::ostream& os, const class regex_vector& rv) {
    rv.dump(os);
    return os;
}
127 | 


--------------------------------------------------------------------------------
/machine_stats.h:
--------------------------------------------------------------------------------
  1 | #ifndef MACHINE_STATS_H
  2 | #define MACHINE_STATS_H
  3 | 
  4 | #ifndef BE20_CONFIGURE_APPLIED
  5 | #error config.h with be20_api additions must be included before machine_stats.h
  6 | #endif
  7 | 
  8 | #ifdef HAVE_MACH_MACH_H
  9 | #include <mach/mach.h>
 10 | #include <mach/mach_host.h>
 11 | #include <mach/host_info.h>
 12 | #include <mach/message.h>  // for mach_msg_type_number_t
 13 | #include <mach/kern_return.h>  // for kern_return_t
 14 | #include <mach/task_info.h>
 15 | #endif
 16 | 
 17 | #ifdef HAVE_SYS_VMMETER_H
 18 | #include <sys/vmmeter.h>
 19 | #endif
 20 | 
 21 | #include <cmath>
 22 | #include <unistd.h>
 23 | 
 24 | /**
 25 |  * return the CPU percentage (0-100) used by the current process. Use 'ps -O %cpu <pid> if system call not available.
 26 |  * The popen implementation is not meant to be efficient.
 27 |  */
 28 | struct machine_stats {
 29 |     static float get_cpu_percentage() {
 30 |         char buf[100];
 31 |         snprintf(buf,sizeof(buf),"ps -O %ccpu %d",'%',getpid());
 32 |         FILE *f = popen(buf,"r");
 33 |         if(f==nullptr){
 34 |             perror("popen failed\n");
 35 |             return(0);
 36 |         }
 37 |         if (fgets(buf,sizeof(buf),f)==NULL) return nan("error1");           /* read the first line */
 38 |         if (fgets(buf,sizeof(buf),f)==NULL) return nan("error2");           /* read the second line */
 39 |         pclose(f);
 40 |         buf[sizeof(buf)-1] = 0;             // in case it needs termination
 41 |         int pid=0;
 42 |         float ff = 0;
 43 |         int count = sscanf(buf,"%d %f",&pid,&ff);
 44 |         return (count==2) ? ff : nan("get_cpu_percentage");
 45 |     };
 46 | 
 47 |     static uint64_t get_available_memory() {
 48 |         // If there is a /proc/meminfo, use it
 49 |         std::ifstream meminfo("/proc/meminfo");
 50 |         if (meminfo.is_open()) {
 51 |             std::string line;
 52 |             while (std::getline(meminfo, line)) {
 53 |                 if (line.substr(0,13)=="MemAvailable:") {
 54 |                     return std::stoll(line.substr(14))*1024;
 55 |                 }
 56 |             }
 57 |         }
 58 | 
 59 | #ifdef HAVE_HOST_STATISTICS64
 60 |         // on macs, use this
 61 |         // https://opensource.apple.com/source/system_cmds/system_cmds-496/vm_stat.tproj/vm_stat.c.auto.html
 62 | 
 63 |         vm_statistics64_data_t	vm_stat;
 64 |         vm_size_t pageSize = 4096; 	/* Default */
 65 |         mach_port_t myHost = mach_host_self();
 66 |         if (host_page_size(myHost, &pageSize) != KERN_SUCCESS) {
 67 |             pageSize = 4096;                // put the default back
 68 |         }
 69 |         vm_statistics64_t stat = &vm_stat;
 70 | 
 71 |         unsigned int count = HOST_VM_INFO64_COUNT;
 72 |         if (host_statistics64(myHost, HOST_VM_INFO64, (host_info64_t)stat, &count) != KERN_SUCCESS) {
 73 |             return 0;
 74 |         }
 75 |         return stat->free_count * pageSize;
 76 | #else
 77 | 	return 0;
 78 | #endif
 79 |     };
 80 | 
 81 |     static void get_memory(uint64_t *virtual_size, uint64_t *resident_size) {
 82 |         *virtual_size = 0;
 83 |         *resident_size = 0;
 84 | 
 85 | #ifdef HAVE_TASK_INFO
 86 |         kern_return_t error;
 87 |         mach_msg_type_number_t outCount;
 88 |         mach_task_basic_info_data_t taskinfo;
 89 | 
 90 |         taskinfo.virtual_size = 0;
 91 |         outCount = MACH_TASK_BASIC_INFO_COUNT;
 92 |         error = task_info(mach_task_self(), MACH_TASK_BASIC_INFO, (task_info_t)&taskinfo, &outCount);
 93 |         if (error == KERN_SUCCESS) {
 94 |             *virtual_size = (uint64_t)taskinfo.virtual_size;
 95 |             *resident_size = (uint64_t)taskinfo.resident_size;
 96 |             return;
 97 |         }
 98 | #endif
 99 | 	const char* statm_path = "/proc/self/statm";
100 | 
101 | 	FILE *f = fopen(statm_path,"r");
102 | 	if(f){
103 | 	    unsigned long size, resident, share, text, lib, data, dt;
104 | 	    if(fscanf(f,"%ld %ld %ld %ld %ld %ld %ld", &size,&resident,&share,&text,&lib,&data,&dt) == 7){
105 | 		*virtual_size  = size * 4096;
106 | 		*resident_size = resident * 4096;
107 |                 fclose(f);
108 | 		return ;
109 | 	    }
110 | 	}
111 | 	fclose(f);
112 | 	return ;
113 |     };
114 | };
115 | 
116 | 
117 | #endif
118 | 


--------------------------------------------------------------------------------
/utils.cpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * A collection of utility functions that are typically provided,
  3 |  * but which are missing in some implementations.
  4 |  */
  5 | 
  6 | // Just for this module
  7 | //#define _FILE_OFFSET_BITS 64
  8 | 
  9 | #include "config.h"
 10 | #include "utils.h"
 11 | 
 12 | #include <cerrno>
 13 | #include <cstdarg>
 14 | #include <cstdio>
 15 | #include <filesystem>
 16 | 
 17 | #include <mutex>
 18 | #include <sstream>
 19 | 
 20 | /** Extract a buffer...
 21 |  * @param buf - the buffer to extract;
 22 |  * @param buflen - the size of the page to extract
 23 |  * @param pos0 - the byte position of buf[0]
 24 |  */
 25 | 
 26 | #ifndef HAVE_LOCALTIME_R
 27 | /* locking localtime_r implementation */
 28 | std::mutex localtime_mutex;
 29 | void localtime_r(time_t* t, struct tm* tm) {
 30 |     const std::lock_guard<std::mutex> lock(localtime_mutex);
 31 |     *tm = *localtime(t);
 32 | }
 33 | #endif
 34 | 
 35 | #ifndef HAVE_GMTIME_R
 36 | /* locking gmtime_r implementation */
 37 | std::mutex gmtime_mutex;
 38 | void gmtime_r(time_t* t, struct tm* tm) {
 39 |     if (t && tm) {
 40 |         const std::lock_guard<std::mutex> lock(gmtime_mutex);
 41 |         struct tm* tmret = gmtime(t);
 42 |         if (tmret) {
 43 |             *tm = *tmret;
 44 |         } else {
 45 |             memset(tm, 0, sizeof(*tm));
 46 |         }
 47 |     }
 48 | }
 49 | #endif
 50 | 
 51 | bool getenv_debug(const char *name)
 52 | {
 53 |     const char *e = std::getenv(name);
 54 |     if (e==nullptr) return false;
 55 |     if (e[0]=='1' || e[0]=='t' || e[0]=='T' || e[0]=='y' || e[0]=='Y') return true;
 56 |     return false;
 57 | }
 58 | 
 59 | bool starts_with(const std::string& buf, const std::string& with) {
 60 |     size_t buflen = buf.size();
 61 |     size_t withlen = with.size();
 62 |     return buflen > withlen && buf.substr(0,withlen) == with;
 63 | }
 64 | 
 65 | bool ends_with(const std::string& buf, const std::string& with) {
 66 |     size_t buflen = buf.size();
 67 |     size_t withlen = with.size();
 68 |     return buflen > withlen && buf.substr(buflen - withlen, withlen) == with;
 69 | }
 70 | 
 71 | bool ends_with(const std::wstring& buf, const std::wstring& with) {
 72 |     size_t buflen = buf.size();
 73 |     size_t withlen = with.size();
 74 |     return buflen > withlen && buf.substr(buflen - withlen, withlen) == with;
 75 | }
 76 | 
 77 | /****************************************************************/
 78 | /* C++ string splitting code from http://stackoverflow.com/questions/236129/how-to-split-a-string-in-c */
 79 | std::vector<std::string>& split(const std::string& s, char delim, std::vector<std::string>& elems) {
 80 |     std::stringstream ss(s);
 81 |     std::string item;
 82 |     while (std::getline(ss, item, delim)) { elems.push_back(item); }
 83 |     return elems;
 84 | }
 85 | 
 86 | std::vector<std::string> split(const std::string& s, char delim) {
 87 |     std::vector<std::string> elems;
 88 |     return split(s, delim, elems);
 89 | }
 90 | 
 91 | /* Read all of the lines of a file and return them as a vector */
 92 | std::vector<std::string> getLines(const std::filesystem::path path)
 93 | {
 94 |     std::vector<std::string> lines;
 95 |     std::string line;
 96 |     std::ifstream inFile;
 97 |     inFile.open( path );
 98 |     if (!inFile.is_open()) {
 99 |         std::cerr << "getLines: Cannot open file: " << path << "\n";
100 |         std::string cmd("ls -l " + path.parent_path().string());
101 |         std::cerr << cmd << "\n";
102 |         if (system( cmd.c_str())) {
103 |             std::cerr << "error\n";
104 |         }
105 |         throw std::runtime_error("test_be:getLines");
106 |     }
107 |     while (std::getline(inFile, line)){
108 |         if (line.size()>0){
109 |             lines.push_back(line);
110 |         }
111 |     }
112 |     return lines;
113 | }
114 | 
// returns the last element of v, or an empty string when v has no elements
std::string getLast(const std::vector<std::string> &v)
{
    return v.empty() ? std::string() : v.back();
}
121 | 
122 | 
/* Parse an unsigned integer with an optional binary scale suffix.
 * The leading number is read with stream extraction (0 if none can be parsed);
 * the value is then multiplied by 1024 for 'k'/'K', 1024^2 for 'm'/'M',
 * 1024^3 for 'g'/'G' and 1024^4 for 't'/'T'. Each letter is looked for anywhere
 * in the string, so a string containing several of them is scaled by each.
 */
uint64_t scaled_stoi64(const std::string &str)
{
    std::stringstream ss(str);
    uint64_t val = 0;
    ss >> val;
    if (str.find_first_of("kK") != std::string::npos) val *= 1024ULL;
    if (str.find_first_of("mM") != std::string::npos) val *= 1024ULL * 1024ULL;
    if (str.find_first_of("gG") != std::string::npos) val *= 1024ULL * 1024ULL * 1024ULL;
    if (str.find_first_of("tT") != std::string::npos) val *= 1024ULL * 1024ULL * 1024ULL * 1024ULL;
    return val;
}
134 | 


--------------------------------------------------------------------------------
/threadpool.h:
--------------------------------------------------------------------------------
  1 | #ifndef _THREADPOOL_H_
  2 | #define _THREADPOOL_H_
  3 | 
  4 | /****************************************************************
  5 |  *** THREADING SUPPORT
  6 |  ****************************************************************/
  7 | 
  8 | /**
  9 |  * \addtogroup internal_interfaces
 10 |  * @{
 11 |  */
 12 | 
 13 | 
 14 | /**
 15 |  * \file
 16 |  * http://stackoverflow.com/questions/4264460/wait-for-one-of-several-threads-to-finish
 17 |  * Here is the algorithm to run the thread pool with a work queue:
 18 |  *
 19 |  * \verbatim
 20 |  * main:
 21 |  *     set freethreads to numthreads
 22 |  *     init mutex M, condvars TO_MAIN and TO_WORKER
 23 |  *     start N worker threads
 24 |  *     while true:
 25 |  *         wait for work item
 26 |  *         claim M
 27 |  *         while freethreads == 0:
 28 |  *             cond-wait TO_MAIN, M
 29 |  *         put work item in queue
 30 |  *         decrement freethreads
 31 |  *         cond-signal TO_WORKER
 32 |  *         release M
 33 |  *
 34 |  * worker:
 35 |  *     init
 36 |  *     while true:
 37 |  *         claim M
 38 |  *         while no work in queue:
 39 |  *             cond-wait TO_WORKER, M
 40 |  *         get work to local storage
 41 |  *         release M
 42 |  *         do work
 43 |  *         claim M
 44 |  *         increment freethreads
 45 |  *         cond-signal TO_MAIN
 46 |  *         release M
 47 |  * \endverbatim
 48 |  */
 49 | 
 50 | #include <set>
 51 | #include <queue>
 52 | #include <condition_variable>
 53 | #include <mutex>
 54 | #include <atomic>
 55 | #include <future>      // std::future, std::promise
 56 | 
 57 | #include "aftimer.h"
 58 | #include "scanner_params.h"
 59 | 
// There is a single thread_pool object
class worker;
/**
 * Declares the pool that owns the worker objects, their threads, and the queue
 * of work units they consume. Copying and assignment are deleted.
 * The member functions are only declared here.
 * NOTE(review): their definitions are presumably in threadpool.cpp -- confirm.
 * Data members are public, so their declaration order (which fixes the
 * initialization order) should be changed with care.
 */
class thread_pool {
    /*** neither copying nor assignment is implemented ***/
    thread_pool(const thread_pool &)=delete;
    thread_pool &operator=(const thread_pool &)=delete;
    // id of the thread that constructed the pool (captured by the default member initializer)
    std::thread::id                     main_thread {std::this_thread::get_id()};

public:
    // One queued item: an sbuf to process and, optionally, the single scanner to run on it.
    struct work_unit {
        work_unit(){}
        work_unit(const sbuf_t *sbuf_):sbuf(sbuf_) {}
        work_unit(const sbuf_t *sbuf_, scanner_t *scanner_):sbuf(sbuf_),scanner(scanner_) {}
        const sbuf_t *sbuf {nullptr};       // sbuf to process
        scanner_t *scanner {nullptr};        // if set, use only this scanner, otherwise use all.
    };

    typedef std::set<class worker *> worker_set_t;
    worker_set_t                        workers {};   // the worker objects
    std::set<std::thread *>             threads {};   // the std::thread objects
    mutable std::mutex                  M {};         // mutex "M" of the algorithm described at the top of this file
    std::condition_variable	        TO_MAIN {};   // condition variable "TO_MAIN" of that algorithm
    std::condition_variable	        TO_WORKER {}; // condition variable "TO_WORKER" of that algorithm
    std::atomic<int>                    working_workers {0};
    std::atomic<int>                    freethreads {0};
    std::atomic<int>                    shutdown_spin_lock_poll_ms {100}; // NOTE(review): presumably a polling interval in ms used during shutdown -- confirm

    // bulk_extractor specializations
    class scanner_set &ss;		// one for all the threads; fs and fr are threadsafe
    std::queue<struct work_unit *> work_queue  {};	// work to be done - here it is just a list of sbufs.
    aftimer		       main_wait_timer {};	// time spent waiting
    std::atomic<uint64_t>      total_worker_wait_ns {0};
    int                        mode {0}; // 0=running; 1 = waiting for workers to finish; 2=workers should die
    std::atomic<bool>          debug {false}; // display debug messages?

    thread_pool(scanner_set &ss_);
    ~thread_pool();
    void launch_workers(size_t num_workers);
    void wait_for_tasks();              // wait until there are no tasks in work queue
    void join();                        // wait_for_tasks() and kill the workers
    void main_thread_wait();
    void push_task(const sbuf_t *sbuf, scanner_t *scanner); // queue sbuf for one specific scanner
    void push_task(const sbuf_t *sbuf);                     // queue sbuf without naming a scanner

    // Status for callers
    size_t get_worker_count() const;
    int get_free_count() const;
    size_t get_tasks_queued() const;
    void debug_pool(std::ostream &os) const;
};
110 | 
// there is a worker object for each thread
// Each worker keeps a reference to its owning thread_pool and a numeric id.
class worker {
    thread_pool         &tp;		       // my thread pool
    void                *run();                // run the worker
    aftimer		worker_wait_timer {};  // time the worker spent
public:
    const uint32_t id;                         // identifier passed to the constructor
    // NOTE(review): presumably the thread entry point, with arg pointing at a worker -- confirm in threadpool.cpp
    static void * start_worker( void *arg );
    worker(class thread_pool &tp_, uint32_t id_): tp(tp_),id(id_){} // the worker
};
121 | 
122 | 
123 | #endif
124 | 


--------------------------------------------------------------------------------
/atomic_unicode_histogram.h:
--------------------------------------------------------------------------------
  1 | #ifndef ATOMIC_UNICODE_HISTOGRAM_H
  2 | #define ATOMIC_UNICODE_HISTOGRAM_H
  3 | 
  4 | /** A simple class for making histograms of strings.
  5 |  * Histograms are kept in printable UTF-8 representation, not in UTF32 internally.
  6 |  * In part this is due to the legacy code base.
  7 |  * In part this allows the scanners to determine what the printout should look like, rather than having
  8 |  * to pass presentation flags.
  9 |  *
 10 |  * The histogram maker implements:
 11 |  * - Counting
 12 |  * - Determining how much memory is in use by histogram.
 13 |  * - Writing histogram to a stream (for example, when memory is filled.)
 14 |  * - Merging multiple histogram files to a single file.
 15 |  *
 16 |  * Note - case transitions and text extraction is performed in UTF-32.
 17 |  *      - regular expression are then run on the UTF-8. (Not the best, but it works for now.)
 18 |  */
 19 | 
 20 | #include "atomic_map.h"
 21 | #include "histogram_def.h"
 22 | #include "unicode_escape.h"
 23 | #include <atomic>
 24 | 
/**
 * A histogram keyed by std::string, stored in an atomic_map and built according
 * to a histogram_def. Only empty(), histogram_compare() and HistogramTally are
 * defined here; the remaining members are declared only.
 */
struct AtomicUnicodeHistogram {
    static uint32_t debug_histogram_malloc_fail_frequency; // for debugging, make malloc fail sometimes
    /* Per-key counters stored as the value of each histogram entry. */
    struct HistogramTally {
        uint32_t count{0};   // total strings seen
        uint32_t count16{0}; // total utf16 strings seen
        // Copy construction and assignment copy both counters (memberwise).
        HistogramTally(const HistogramTally& a) {
            this->count = a.count;
            this->count16 = a.count16;
        }
        HistogramTally& operator=(const HistogramTally& a) {
            this->count = a.count;
            this->count16 = a.count16;
            return *this;
        }

        HistogramTally(){};
        virtual ~HistogramTally(){};

        // Two tallies are equal when both counters match.
        bool operator==(const HistogramTally& a) const { return this->count == a.count && this->count16 == a.count16; };
        bool operator!=(const HistogramTally& a) const { return !(*this == a); }
        // Ordered by count first, then by count16.
        bool operator<(const HistogramTally& a) const {
            return (this->count < a.count) || ((this->count == a.count && (this->count16 < a.count16)));
        }
        // Size of this object in bytes (sizeof, so it includes anything the virtual destructor adds).
        size_t bytes() const {
            return sizeof(*this);
        }
    };

    /* A FrequencyReportVector is a vector of report elements when the report is generated.*/
    typedef atomic_map<std::string, struct AtomicUnicodeHistogram::HistogramTally> auh_t;
    typedef std::vector<auh_t::item> FrequencyReportVector;

    /* Returns true if a<b for sort order.
     * Sort high counts before low counts, but if the count is the same sort in alphabetical order.
     */
    static bool histogram_compare(const auh_t::item &a, const auh_t::item &b) {
        if (a.value->count > b.value->count) return true;
        if (a.value->count < b.value->count) return false;
        if (a.key < b.key) return true;
        return false;
    }

    // Stores a copy of the histogram definition in `def`.
    AtomicUnicodeHistogram(const struct histogram_def& def_) : def(def_) {}
    virtual ~AtomicUnicodeHistogram(){};

    // is it empty? (takes the mutex M while checking the map size)
    bool empty() {
        const std::lock_guard<std::mutex> lock(M);
        return h.size()==0;
    }
    void clear();                       // empties the histogram
    // low-level add, directly to what we display, if the match function checks out.
    void add0(const std::string& u8key, const std::string &context, bool found_utf16);

     // adds Unicode string to the histogram count. context is used for histogram_def
    void add_feature_context(const std::string& feature, const std::string&context);
    size_t size()  const;              // returns the number of entries in the histogram
    size_t bytes() const;              // returns the number of bytes used by the histogram

    /** makeReport() makes a report and returns a
     * FrequencyReportVector.
     * NOTE(review): topN presumably limits the report to the N largest entries, with 0 meaning all -- confirm in the .cpp.
     */
    std::vector<auh_t::item> makeReport(size_t topN=0);          // returns items of <count,key>
    const struct histogram_def def;            // the definition we are making
    bool  debug {false};                        // set to enable debugging

private:
    mutable std::mutex M {};                    // mutex for the histogram, used to lock individual elements.
    auh_t h {};                         // the histogram
};
 95 | 
 96 | std::ostream& operator<<(std::ostream& os, const AtomicUnicodeHistogram::FrequencyReportVector& rep);
 97 | std::ostream& operator<<(std::ostream& os, const AtomicUnicodeHistogram::auh_t::item& e);
 98 | 
 99 | #endif
100 | 


--------------------------------------------------------------------------------
/unicode_escape.h:
--------------------------------------------------------------------------------
  1 | /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
  2 | 
  3 | /*
  4 |  * Tools for working with Unicode
  5 |  */
  6 | 
  7 | #ifndef UNICODE_ESCAPE_H
  8 | #define UNICODE_ESCAPE_H
  9 | 
 10 | #include <cstdint>
 11 | #include <cstring>
 12 | #include <cwctype>
 13 | #include <iostream>
 14 | #include <locale>
 15 | #include <string>
 16 | 
 17 | #include "utf8.h"
 18 | 
 19 | /** \addtogroup bulk_extractor_APIs
 20 |  * @{
 21 |  */
 22 | /** \file */
 23 | 
 24 | /* Our standard escaping is \\ for backslash and \000 for null, \001 for control-a, etc. */
 25 | 
 26 | std::string octal_escape(unsigned char ch);       // escape this character
 27 | bool utf8cont(unsigned char ch);            // true if a UTF8 continuation character
 28 | bool valid_utf8codepoint(uint32_t unichar); // not all unichars are valid codepoints
 29 | 
 30 | /* Our internal, testable, somewhat broken Unicode handling */
 31 | const std::u32string utf32_lowercase(const std::u32string& str);
 32 | const std::u32string utf32_extract_numeric(const std::u32string& str);
 33 | 
/* Named constants for a few special 16-bit Unicode code points. */
struct unicode {
    static const uint16_t INTERLINEAR_ANNOTATION_ANCHOR = 0xFFF9;     // U+FFF9
    static const uint16_t INTERLINEAR_ANNOTATION_SEPARATOR = 0xFFFA;  // U+FFFA
    static const uint16_t INTERLINEAR_ANNOTATION_TERMINATOR = 0xFFFB; // U+FFFB
    static const uint16_t OBJECT_REPLACEMENT_CHARACTER = 0xFFFC;      // U+FFFC
    static const uint16_t REPLACEMENT_CHARACTER = 0xFFFD;             // U+FFFD
    static const uint16_t BOM = 0xFEFF;                               // U+FEFF, the byte order mark
};
 42 | 
 43 | /* Create safe UTF8 from unsafe UTF8.
 44 |  * if validate is true and the others are false, throws an exception with bad UTF8.
 45 |  */
 46 | class BadUnicode : public std::exception {
 47 |     std::string bad_string{};
 48 | public:
 49 |     BadUnicode(std::string_view bad) : bad_string(bad) {};
 50 |     const char *what() const noexcept override { return bad_string.c_str(); };
 51 | };
 52 | 
 53 | std::string validateOrEscapeUTF8(const std::string& input, bool escape_bad_UTF8, bool escape_backslash, bool validate);
 54 | 
 55 | /* Guess if this is valid utf16 and return likely endian */
 56 | bool looks_like_utf16(const std::string& str, bool& little_endian);
 57 | 
 58 | /* These return the string. If no conversion is possible,
 59 |  * they throw const utf8::invalid_utf16.
 60 |  * catch with 'catch (const utf8::invalid_utf16 &)'
 61 |  */
 62 | 
 63 | std::string convert_utf16_to_utf8(const std::string& str, bool little_endian); // request specific conversion
 64 | std::string convert_utf16_to_utf8(const std::string& str);                     // guess for best
 65 | 
 66 | // std::u32string convert_utf16_to_utf32(const std::string &str,bool little_endian); // request specific conversion
 67 | 
 68 | // std::u32string convert_utf8_to_utf32(const std::string &str);
 69 | // std::string convert_utf32_to_utf8(const std::u32string &str);
 70 | // std::string convert_utf32_to_utf8(const std::u32string &str);
 71 | std::u32string convert_utf16_to_utf32(const std::string& str);
 72 | std::u16string convert_utf32_to_utf16(const std::u32string& str);
 73 | std::string make_utf8(const std::string& str); // returns valid, escaped UTF8 for utf8 or utf16
 74 | 
 75 | inline const std::u32string utf32_lowercase(const std::u32string& str) {
 76 |     std::u32string output;
 77 |     for (auto& ch : str) { output.push_back(ch < 0xffff ? tolower(ch) : ch); }
 78 |     return output;
 79 | }
 80 | 
 81 | inline const std::u32string utf32_extract_numeric(const std::u32string& str) {
 82 |     std::u32string output;
 83 |     for (auto& ch : str) {
 84 |         if (iswdigit(ch)) { output.push_back(ch); }
 85 |     }
 86 |     return output;
 87 | }
 88 | 
/* Now we just pass through to utf8 */
/* Convert a UTF-8 string to UTF-16 by calling utf8::utf8to16().
 * NOTE(review): any exception raised by the utf8 library for invalid input is not caught here.
 */
inline const std::u16string convert_utf8_to_utf16(const std::string& utf8) {
    return utf8::utf8to16(utf8);
}
 93 | 
/* Convert a UTF-8 string to UTF-32 by calling utf8::utf8to32().
 * Note that the argument is taken by value, unlike the sibling converters.
 */
inline const std::u32string convert_utf8_to_utf32(const std::string utf8) {
    return utf8::utf8to32(utf8);
}
 97 | 
/* Convert a UTF-32 string to UTF-8 by calling utf8::utf32to8(). */
inline const std::string convert_utf32_to_utf8(const std::u32string& u32s) {
    return utf8::utf32to8(u32s);
}
101 | 
102 | inline std::string safe_utf16to8(std::wstring s) { // needs to be cleaned up
103 |     std::string utf8_line;
104 |     try {
105 |         utf8::utf16to8(s.begin(), s.end(), back_inserter(utf8_line));
106 |     } catch (const utf8::invalid_utf16&) {
107 |         /* Exception thrown: bad UTF16 encoding */
108 |         utf8_line = "";
109 |     }
110 |     return utf8_line;
111 | }
112 | 
113 | // This needs to be cleaned up:
114 | inline std::wstring safe_utf8to16(std::string s) {
115 |     std::wstring utf16_line;
116 |     try {
117 |         utf8::utf8to16(s.begin(), s.end(), back_inserter(utf16_line));
118 |     } catch (const utf8::invalid_utf8&) {
119 |         /* Exception thrown: bad UTF8 encoding */
120 |         utf16_line = L"";
121 |     }
122 |     return utf16_line;
123 | }
124 | 
125 | 
126 | 
127 | #endif
128 | 


--------------------------------------------------------------------------------
/word_and_context_list.cpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * class word_and_context_list reads from disk and maintains in memory
  3 |  * a data structure that is used for the stop list and alert list.
  4 |  */
  5 | 
  6 | #include "config.h"
  7 | #include <cinttypes>
  8 | #include <iostream>
  9 | 
 10 | #include "word_and_context_list.h"
 11 | 
 12 | void word_and_context_list::add_regex(const std::string& pat) { patterns.push_back(pat); }
 13 | 
 14 | /**
 15 |  * Insert a feature and context, but only if not already present.
 16 |  * Returns true if added.
 17 |  */
 18 | bool word_and_context_list::add_fc(const std::string& f, const std::string& c) {
 19 |     context ctx(f, c); // ctx includes feature, before and after
 20 | 
 21 |     if (c.size() > 0 && context_set.find(c) != context_set.end()) return false; // already present
 22 |     context_set.insert(c);                                                      // now we've seen it.
 23 |     fcmap.insert(std::pair<std::string, context>(f, ctx));
 24 |     return true;
 25 | }
 26 | 
 27 | /**
 28 | returns 0 if success, -1 if fail. */
 29 | int word_and_context_list::readfile(const std::filesystem::path path, std::ostream &os) {
 30 |     std::ifstream i( path );
 31 |     if (!i.is_open()) return -1;
 32 |     os << "Reading context stop list " << path << "\n";
 33 |     std::string line;
 34 |     uint64_t total_context = 0;
 35 |     uint64_t line_counter = 0;
 36 |     uint64_t features_read = 0;
 37 |     while (getline(i, line)) {
 38 |         line_counter++;
 39 |         if (line.size() == 0) continue;
 40 |         if (line[0] == '#') continue; // it's a comment
 41 |         if ((*line.end()) == '\r') { line.erase(line.end()); /* remove the last character if it is a \r */ }
 42 |         if (line.size() == 0) continue; // no line content
 43 |         ++features_read;
 44 | 
 45 |         // If there are two tabs, this is a line from a feature file
 46 |         size_t tab1 = line.find('\t');
 47 |         if (tab1 != std::string::npos) {
 48 |             size_t tab2 = line.find('\t', tab1 + 1);
 49 |             if (tab2 != std::string::npos) {
 50 |                 size_t tab3 = line.find('\t', tab2 + 1);
 51 |                 if (tab3 == std::string::npos) tab3 = line.size();
 52 |                 std::string f = line.substr(tab1 + 1, (tab2 - 1) - tab1);
 53 |                 std::string c = line.substr(tab2 + 1, (tab3 - 1) - tab2);
 54 |                 if (add_fc(f, c)) { ++total_context; }
 55 |             } else {
 56 |                 std::string f = line.substr(tab1 + 1);
 57 |                 add_fc(f, ""); // Insert a feature with no context
 58 |             }
 59 |             continue;
 60 |         }
 61 | 
 62 |         // If there is no tab, then this must be a simple item to ignore.
 63 |         // If it is a regular expression, add it to the list of REs
 64 |         if (regex_vector::has_metachars(line)) {
 65 |             patterns.push_back(line);
 66 |         } else {
 67 |             // Otherwise, add it as a feature with no context
 68 |             fcmap.insert(std::pair<std::string, context>(line, context(line)));
 69 |         }
 70 |     }
 71 |     os << "Stop list read.\n";
 72 |     os << "  Total features read: " << features_read << " in " << line_counter << " lines.\n";
 73 |     os << "  List Size: " << fcmap.size() << "\n";
 74 |     os << "  Context Strings: " << total_context << "\n";
 75 |     os << "  Regular Expressions: " << patterns.size() << "\n";
 76 |     return 0;
 77 | }
 78 | 
 79 | /** check() is threadsafe. */
 80 | bool word_and_context_list::check(const std::string& probe, const std::string& before, const std::string& after) const {
 81 |     /* First check literals, because they are faster */
 82 |     for (stopmap_t::const_iterator it = fcmap.find(probe); it != fcmap.end(); it++) {
 83 |         if ((rstrcmp((*it).second.before, before) == 0) && (rstrcmp((*it).second.after, after) == 0) &&
 84 |             ((*it).second.feature == probe)) {
 85 |             return true;
 86 |         }
 87 |     }
 88 | 
 89 |     /* Now check the patterns; do this second because it is more expensive */
 90 |     return patterns.search_all(probe, nullptr);
 91 | };
 92 | 
 93 | bool word_and_context_list::check_feature_context(const std::string& probe, const std::string& context) const {
 94 |     std::string before;
 95 |     std::string after;
 96 |     context::extract_before_after(probe, context, before, after);
 97 |     return check(probe, before, after);
 98 | }
 99 | 
100 | void word_and_context_list::dump(std::ostream &os) {
101 |     os << "dump context list:\n";
102 |     for (auto const& it : fcmap) { os << it.first << " = " << it.second << "\n"; }
103 |     os << "dump RE list:\n";
104 |     patterns.dump(os);
105 | }
106 | 
107 | #ifdef STAND
108 | int main(int argc, char** argv) {
109 |     cout << "testing contxt_list\n";
110 |     word_and_context_list cl;
111 |     while (--argc) {
112 |         argv++;
113 |         if (cl.readfile(*argv)) { err(1, "Cannot read %s", *argv); }
114 |     }
115 |     cl.dump();
116 |     exit(1);
117 | }
118 | #endif
119 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # be20_api
  2 | 
  3 | |Branch|Coverage|
  4 | |------|--------|
  5 | |[main](https://github.com/simsong/be20_api/blob/main/README.md)|[![codecov](https://codecov.io/gh/simsong/be20_api/branch/slg-dev/graph/badge.svg?token=Nj8q8eo3Ji)](https://codecov.io/gh/simsong/be20_api)|
  6 | |[slg-dev](https://github.com/simsong/be20_api/blob/slg-dev/README.md)| [![codecov](https://codecov.io/gh/simsong/be20_api/branch/slg-dev/graph/badge.svg?token=Nj8q8eo3Ji)](https://codecov.io/gh/simsong/be20_api)|
  7 | 
  8 | 
  9 | This is the framework for the [bulk_extractor](https://github.com/simsong/bulk_extractor)  plug-in API.
 10 | It is called *be20_api* because the API was developed for Bulk_Extractor version 1.3. The API has been
 11 | used without change in Bulk_Extractor versions 1.4 and 1.5, and will be used without change in Bulk_Extractor version 2.0
 12 | 
 13 | The Bulk_Extractor API is a plug-in API for bulk_extractor "scanners." Scanners are implemented
 14 | as `extern "C"` functions which are called from the bulk_extractor C++ framework. All bulk_extractor
 15 | scanners are implemented using the API. Scanners can either be compiled into the bulk_extractor executable, or they can be loaded at run-time from the plug-ins directory. The directory contains zero or more shared libraries (on Unix/Linux/MacOS) or DLLs (on Windows).
 16 | 
 17 | There is no difference in functionality between scanners that are
 18 | compiled into the program (e.g. bulk_extractor or tcpflow) and those that are loaded at runtime.
 19 | 
 20 | ## Normal Usage
 21 | 
 22 | The API defines functions for:
 23 | 
 24 | 1. Creating a `scanner_set`.  This creates the scanner_set's `feature_recorder_set`.
 25 | 
 26 | 2. Loading scanners into a scanner set.  When each scanner is loaded:
 27 | 
 28 |   2.1 Any feature recorders that it specifies will be created and
 29 |   added to the `feature_recorder_set` if they do not already exist.
 30 | 
 31 | 3. Entering the scanning phase.
 32 | 
 33 | 4. Scanning one or more `sbuf`s, which may cause scanners to create child sbufs
 34 |    and recursively scan them.
 35 | 
 36 | 5. Exiting the scanning phase and running the histogram phase, which
 37 |    causes the scanner_set to collect from the scanner all of the
 38 |    specified histograms (by `feature_recorder` name and regular
 39 |    expression). Each feature recorder is then asked to make its
 40 |    histograms (this process can be parallelized too, and will be
 41 |    parallelized in the future!)
 42 | 
 43 | 6. Finally, the `scanner_set` shuts down and everything is de-allocated.
 44 | 
 45 | ## Path Printing
 46 | 
 47 | The API also defines functions for "path printing," which uses the scanners to decode and print a forensic path.
 48 | 
 49 | |Path|Action|
 50 | |----|------|
 51 | |0-PRINT|Prints the contents of location 0|
 52 | |0-PRINT/r|Raw dumps the length of the buffer in decimal, a \r\n, and then the contents of location 0|
 53 | |0-PRINT/h|Hex dump the contents of location 0|
 54 | 
 55 | 
 56 | ## Working with this repo.
 57 | This repo can be used in three ways:
 58 | 
 59 | 1. As a stand-alone repo for testing the API modules.
 60 | 2. As a stand-alone repo for developing and testing scanners.
 61 | 3. As a submodule repo to bulk_extractor or tcpflow
 62 | 
 63 | The autotools implementation in this repo is designed to either be included in the parent's `configure.ac` file or to use its own `configure.ac` file. It makes a library called `be20_api.a` which can then be linked into the bulk_extractor program or the testing program.
 64 | 
 65 | Use the  `bootstrap.sh` program in *this* repo to compile the test programs.
 66 | 
 67 | ### Help on git submodules
 68 | 
 69 | Git submodules are complicated. Basically, the parent module is linked to a particular commit point, and not to a particular branch. This isolates parent modules from changes in the submodule until the parent module wants to accept the change.
 70 | 
 71 | Update this repository to master:
 72 | 
 73 |     (cd be20_api; git pull origin master)
 74 | 
 75 | # Major changes with BE20 v. 2.0:
 76 | * `scanner_set` now controls the recursive scanning process. Scanner
 77 |   set holds the configuration information for the scan and the scanners.
 78 | 
 79 | * sbuf now keeps track of the depth.
 80 | * max_depth is now defined for the `scanner_set`, not per scanner. An
 81 |   individual scanner can just look at the depth in the sbuf and abort
 82 |   if the scanner thinks things have gone on too long.
 83 | 
 84 | Scanner Activation
 85 | ------------------
 86 | * scanner_commands is created from reading the command-line
 87 |   arguments. It contains enable and disable commands for each scanner.
 88 | 
 89 | * For each scanner, we can then scan the scanner_commands to determine
 90 |   if the scanner should be initialized, and if we should, we
 91 |   initialize it.
 92 | 
 93 | * The scanners are then sent
 94 | 
 95 | BE20_API STATUS REPORT
 96 | ======================
 97 | BE13_API has been renamed BE20_API and is largely complete.
 98 | 
 99 | Next on the agenda is rewriting tcpflow to use be20_api from be13_api.
100 | 


--------------------------------------------------------------------------------
/word_and_context_list.h:
--------------------------------------------------------------------------------
  1 | #ifndef WORD_AND_CONTEXT_LIST_H
  2 | #define WORD_AND_CONTEXT_LIST_H
  3 | 
  4 | /**
  5 |  * \addtogroup internal_interfaces
  6 |  * @{
  7 |  * \file
  8 |  * word_and_context_list:
  9 |  *
 10 |  * A re-implementation of the basic stop list, regular expression
 11 |  * stop_list, and context-sensitive stop list.
 12 |  *
 13 |  * Method:
 14 |  * Each entry in the stop list can be represented as:
 15 |  * - a feature that is stopped, with optional context.
 16 |  * - a regular expression
 17 |  *
 18 |  * Context is represented as a std::string before the feature and a std::string after.
 19 |  *
 20 |  * The stop list is a map of features that are stopped.
 21 |  * For each feature, there may be no context or a list of context.
 22 |  * If there is no context and the feature is in the list,
 23 |  */
 24 | 
 25 | /*
 26 |  * context is a class that records the feature, the text before, and the text after.
 27 |  * Typically this is used for stop lists and alert lists.
 28 |  */
 29 | 
 30 | #include <algorithm>
 31 | #include <iostream>
 32 | #include <map> // brings in map and multimap
 33 | #include <set>
 34 | #include <string>
 35 | #include <unordered_map>
 36 | #include <unordered_set>
 37 | #include <filesystem>
 38 | 
 39 | #include "regex_vector.h"
 40 | 
 41 | class context {
 42 | public:
 43 |     static void extract_before_after(const std::string& feature, const std::string& ctx, std::string& before,
 44 |                                      std::string& after) {
 45 |         if (feature.size() <= ctx.size()) {
 46 |             /* The most simple algorithm is a sliding window */
 47 |             for (size_t i = 0; i < ctx.size() - feature.size(); i++) {
 48 |                 if (ctx.substr(i, feature.size()) == feature) {
 49 |                     before = ctx.substr(0, i);
 50 |                     after = ctx.substr(i + feature.size());
 51 |                     return;
 52 |                 }
 53 |             }
 54 |         }
 55 |         before.clear(); // can't be done
 56 |         after.clear();
 57 |     }
 58 | 
 59 |     // constructors to make a context with nothing before or after, with just a context, or with all three
 60 |     context(const std::string& f) : feature(f), before(), after() {}
 61 |     context(const std::string& f, const std::string& c) : feature(f), before(), after() {
 62 |         extract_before_after(f, c, before, after);
 63 |     }
 64 |     context(const std::string& f, const std::string& b, const std::string& a) : feature(f), before(b), after(a) {}
 65 |     std::string feature;
 66 |     std::string before;
 67 |     std::string after;
 68 | };
 69 | 
 70 | inline std::ostream& operator<<(std::ostream& os, const class context& c) {
 71 |     os << "context[" << c.before << "|" << c.feature << "|" << c.after << "]";
 72 |     return os;
 73 | }
 74 | inline bool operator==(const class context& a, const class context& b) {
 75 |     return (a.feature == b.feature) && (a.before == b.before) && (a.after == b.after);
 76 | }
 77 | 
 78 | /**
 79 |  * the object that holds the word and context list
 80 |  * They aren't atomic, but they are read-only.
 81 |  */
 82 | class word_and_context_list {
 83 | private:
 84 |     typedef std::unordered_multimap<std::string, context> stopmap_t;
 85 |     stopmap_t fcmap; // maps features to contexts; for finding them
 86 | 
 87 |     typedef std::unordered_set<std::string> stopset_t;
 88 |     stopset_t context_set; // presence of a pair in fcmap
 89 | 
 90 |     regex_vector patterns;
 91 | 
 92 | public:
 93 |     /**
 94 |      * rstrcmp is like strcmp, except it compares std::strings right-aligned
 95 |      * and only compares the minimum sized std::string of the two.
 96 |      */
 97 |     static int rstrcmp(const std::string& a, const std::string& b);
 98 | 
 99 |     word_and_context_list() : fcmap(), context_set(), patterns() {}
100 |     size_t size() { return fcmap.size() + patterns.size(); }
101 |     void add_regex(const std::string& pat);                  // not threadsafe
102 |     bool add_fc(const std::string& f, const std::string& c); // not threadsafe
103 |     int readfile(const std::filesystem::path path, std::ostream& os = std::cout); // readfile with stats to os
104 | 
105 |     // return true if the probe with context is in the list or in the stopmap
106 |     bool check(const std::string& probe, const std::string& before, const std::string& after) const; // threadsafe
107 |     bool check_feature_context(const std::string& probe, const std::string& context) const;          // threadsafe
108 |     void dump(std::ostream &os = std::cout);
109 | };
110 | 
111 | /* like strcmp, but runs in reverse */
112 | inline int word_and_context_list::rstrcmp(const std::string& a, const std::string& b) {
113 |     size_t alen = a.size();
114 |     size_t blen = b.size();
115 |     size_t len = alen < blen ? alen : blen;
116 |     for (size_t i = 0; i < len; i++) {
117 |         size_t apos = alen - len + i;
118 |         size_t bpos = blen - len + i;
119 |         if (a[apos] < b[bpos]) return -1;
120 |         if (a[apos] > b[bpos]) return 1;
121 |     }
122 |     return 0;
123 | }
124 | 
125 | #endif
126 | 


--------------------------------------------------------------------------------
/feature_recorder_mhist.cpp.broken:
--------------------------------------------------------------------------------
  1 | /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
  2 | 
  3 | #include "formatter.h"
  4 | 
  5 | /**
  6 |  * write() is the main entry point for writing a feature at a given position with context.
  7 |  * write() checks the stoplist and escapes non-UTF8 characters, then calls write0().
  8 |  */
  9 | void feature_recorder::write(const pos0_t& pos0, const std::string& feature_, const std::string& context_) {
 10 |     if (fs.flags.disabled) return; // disabled
 11 |     if (fs.flags.pedantic) {
 12 |         if (feature_.size() > def.max_feature_size) {
 13 |             throw std::runtime_error(Formatter() << "feature_recorder::write : feature_.size()=" << feature_.size());
 14 |         }
 15 |         if (context_.size() > def.max_context_size) {
 16 |             throw std::runtime_error(Formatter() << "feature_recorder::write : context_.size()=" << context_.size());
 17 |         }
 18 |     }
 19 | 
 20 |     std::string feature = feature_;
 21 |     std::string context = flags.no_context ? "" : context_;
 22 |     std::string* feature_utf8 = AtomicUnicodeHistogram::make_utf8(feature); // a utf8 feature
 23 | 
 24 |     quote_if_necessary(feature, context);
 25 | 
 26 |     if (feature.size() == 0 && fs.flags.pedantic) {
 27 |         throw std::runtime_error(Formatter()  name << ": zero length feature at " << pos0);
 28 |     }
 29 | 
 30 |     /* First check to see if the feature is on the stop list.
 31 |      * Only do this if we have a stop_list_recorder (the stop list recorder itself
 32 |      * does not have a stop list recorder. If it did we would infinitely recurse.
 33 |      */
 34 |     if (flags.no_stoplist == false && fs.stop_list && fs.stop_list_recorder &&
 35 |         fs.stop_list->check_feature_context(*feature_utf8, context)) {
 36 |         fs.stop_list_recorder->write(pos0, feature, context);
 37 |         delete feature_utf8;
 38 |         return;
 39 |     }
 40 | 
 41 |     /* The alert list is a special features that are called out.
 42 |      * If we have one of those, write it to the redlist.
 43 |      */
 44 | #if 0
 45 |     if (flags.no_alertlist==false
 46 |         && fs.alert_list
 47 |         && fs.alert_list->check_feature_context(*feature_utf8,context)) {
 48 |         std::string alert_fn = fs.get_outdir() + "/ALERTS_found.txt";
 49 |         const std::lock_guard<std::mutex> lock(Mr);                // notice we are locking the alert list
 50 |         std::ofstream rf(alert_fn.c_str(),std::ios_base::app);
 51 |         if(rf.is_open()){
 52 |             rf << pos0.shift(fs.offset_add).str() << '\t' << feature << '\t' << "\n";
 53 |         }
 54 |     }
 55 | #endif
 56 | 
 57 | #if 0
 58 |     /* Support in-memory histograms */
 59 |     for (const auto &it:mhistograms ){
 60 |         const histogram_def &def = it.first;
 61 |         mhistogram_t *m = it.second;
 62 |         std::string new_feature = *feature_utf8;
 63 |         if (def.require.size()==0 || new_feature.find_first_of(def.require)!=std::string::npos){
 64 |             /* If there is a pattern to use, use it to simplify the feature */
 65 |             if (def.pattern.size()){
 66 |                 std::smatch sm;
 67 |                 std::regex_search( new_feature, sm, def.reg);
 68 |                 if (sm.size() == 0){
 69 |                     // no search match; avoid this feature
 70 |                     new_feature = "";
 71 |                 }
 72 |                 else {
 73 |                     new_feature = sm.str();
 74 |                 }
 75 |             }
 76 |             if(new_feature.size()) m->add(new_feature,1);
 77 |         }
 78 |     }
 79 | #endif
 80 | 
 81 |     /* Finally write out the feature and the context */
 82 |     this->write0(pos0, feature, context);
 83 |     delete feature_utf8;
 84 | }
 85 | 
 86 | /**
 87 |  * Given a buffer, an offset into that buffer of the feature, and the length
 88 |  * of the feature, make the context and write it out. This is mostly used
 89 |  * for writing from within the lexical analyzers.
 90 |  */
 91 | 
 92 | void feature_recorder::write_buf(const sbuf_t& sbuf, size_t pos, size_t len) {
 93 |     /* If we are in the margin, ignore; it will be processed again */
 94 |     if (pos >= sbuf.pagesize && pos < sbuf.bufsize) { return; }
 95 | 
 96 |     if (pos >= sbuf.bufsize) { /* Sanity checks */
 97 |         std::cerr << "*** write_buf: WRITE OUTSIDE BUFFER. "
 98 |                   << " pos=" << pos << " sbuf=" << sbuf << "\n";
 99 |         return;
100 |     }
101 | 
102 |     /* Asked to write beyond bufsize; bring it in */
103 |     if (pos + len > sbuf.bufsize) { len = sbuf.bufsize - pos; }
104 | 
105 |     std::string feature = sbuf.substr(pos, len);
106 |     std::string context;
107 | 
108 |     if (flags.no_context == false) {
109 |         /* Context write; create a clean context */
110 |         size_t p0 = context_window < pos ? pos - context_window : 0;
111 |         size_t p1 = pos + len + context_window;
112 | 
113 |         if (p1 > sbuf.bufsize) p1 = sbuf.bufsize;
114 |         assert(p0 <= p1);
115 |         context = sbuf.substr(p0, p1 - p0);
116 |     }
117 |     this->write(sbuf.pos0 + pos, feature, context);
118 | }
119 | 


--------------------------------------------------------------------------------
/sbuf_stream.cpp:
--------------------------------------------------------------------------------
  1 | /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
  2 | #include "config.h"
  3 | #include "sbuf_stream.h"
  4 | 
  5 | /*
  6 |  * Stream interfaces
  7 |  */
  8 | sbuf_stream::sbuf_stream(const sbuf_t& sbuf_) : sbuf(sbuf_) { }
  9 | sbuf_stream::~sbuf_stream() {}
 10 | void sbuf_stream::seek(size_t offset_) { offset = offset_; }
 11 | 
 12 | size_t sbuf_stream::tell() { return offset; }
 13 | 
 14 | /*
 15 |  * unsigned integers, default little endian
 16 |  */
 17 | uint8_t sbuf_stream::get8u() {
 18 |     uint8_t value = sbuf.get8u(offset);
 19 |     offset++;
 20 |     return value;
 21 | }
 22 | uint16_t sbuf_stream::get16u() {
 23 |     uint16_t value = sbuf.get16u(offset);
 24 |     offset += 2;
 25 |     return value;
 26 | }
 27 | uint32_t sbuf_stream::get32u() {
 28 |     uint32_t value = sbuf.get32u(offset);
 29 |     offset += 4;
 30 |     return value;
 31 | }
 32 | uint64_t sbuf_stream::get64u() {
 33 |     uint64_t value = sbuf.get64u(offset);
 34 |     offset += 8;
 35 |     return value;
 36 | }
 37 | 
 38 | /*
 39 |  * unsigned integers, big endian
 40 |  */
 41 | uint8_t sbuf_stream::get8uBE() {
 42 |     uint8_t value = sbuf.get8uBE(offset);
 43 |     offset++;
 44 |     return value;
 45 | }
 46 | uint16_t sbuf_stream::get16uBE() {
 47 |     uint16_t value = sbuf.get16uBE(offset);
 48 |     offset += 2;
 49 |     return value;
 50 | }
 51 | uint32_t sbuf_stream::get32uBE() {
 52 |     uint32_t value = sbuf.get32uBE(offset);
 53 |     offset += 4;
 54 |     return value;
 55 | }
 56 | uint64_t sbuf_stream::get64uBE() {
 57 |     uint64_t value = sbuf.get64uBE(offset);
 58 |     offset += 8;
 59 |     return value;
 60 | }
 61 | 
 62 | /*
 63 |  * unsigned integers, byte order specified
 64 |  */
 65 | uint8_t sbuf_stream::get8u(sbuf_t::byte_order_t bo) {
 66 |     uint8_t value = sbuf.get8u(offset, bo);
 67 |     offset++;
 68 |     return value;
 69 | }
 70 | uint16_t sbuf_stream::get16u(sbuf_t::byte_order_t bo) {
 71 |     uint16_t value = sbuf.get16u(offset, bo);
 72 |     offset += 2;
 73 |     return value;
 74 | }
 75 | uint32_t sbuf_stream::get32u(sbuf_t::byte_order_t bo) {
 76 |     uint32_t value = sbuf.get32u(offset, bo);
 77 |     offset += 4;
 78 |     return value;
 79 | }
 80 | uint64_t sbuf_stream::get64u(sbuf_t::byte_order_t bo) {
 81 |     uint64_t value = sbuf.get64u(offset, bo);
 82 |     offset += 8;
 83 |     return value;
 84 | }
 85 | 
 86 | /*
 87 |  * signed integers, default little endian
 88 |  */
 89 | int8_t sbuf_stream::get8i() {
 90 |     int8_t value = sbuf.get8i(offset);
 91 |     offset++;
 92 |     return value;
 93 | }
 94 | int16_t sbuf_stream::get16i() {
 95 |     int16_t value = sbuf.get16i(offset);
 96 |     offset += 2;
 97 |     return value;
 98 | }
 99 | int32_t sbuf_stream::get32i() {
100 |     int32_t value = sbuf.get32i(offset);
101 |     offset += 4;
102 |     return value;
103 | }
104 | int64_t sbuf_stream::get64i() {
105 |     int64_t value = sbuf.get64i(offset);
106 |     offset += 8;
107 |     return value;
108 | }
109 | 
110 | /*
111 |  * signed integers, big endian
112 |  */
113 | int8_t sbuf_stream::get8iBE() {
114 |     int8_t value = sbuf.get8iBE(offset);
115 |     offset++;
116 |     return value;
117 | }
118 | int16_t sbuf_stream::get16iBE() {
119 |     int16_t value = sbuf.get16iBE(offset);
120 |     offset += 2;
121 |     return value;
122 | }
123 | int32_t sbuf_stream::get32iBE() {
124 |     int32_t value = sbuf.get32iBE(offset);
125 |     offset += 4;
126 |     return value;
127 | }
128 | int64_t sbuf_stream::get64iBE() {
129 |     int64_t value = sbuf.get64iBE(offset);
130 |     offset += 8;
131 |     return value;
132 | }
133 | 
134 | /*
135 |  * signed integers, byte order specified
136 |  */
137 | int8_t sbuf_stream::get8i(sbuf_t::byte_order_t bo) {
138 |     int8_t value = sbuf.get8i(offset, bo);
139 |     offset++;
140 |     return value;
141 | }
142 | int16_t sbuf_stream::get16i(sbuf_t::byte_order_t bo) {
143 |     int16_t value = sbuf.get16i(offset, bo);
144 |     offset += 2;
145 |     return value;
146 | }
147 | int32_t sbuf_stream::get32i(sbuf_t::byte_order_t bo) {
148 |     int32_t value = sbuf.get32i(offset, bo);
149 |     offset += 4;
150 |     return value;
151 | }
152 | int64_t sbuf_stream::get64i(sbuf_t::byte_order_t bo) {
153 |     int64_t value = sbuf.get64i(offset, bo);
154 |     offset += 8;
155 |     return value;
156 | }
157 | 
158 | /*
159 |  * string readers
160 |  */
161 | std::string sbuf_stream::getUTF8(size_t num_octets_requested)
162 | {
163 |     std::string utf8_string = sbuf.getUTF8(num_octets_requested);
164 |     offset += utf8_string.length();
165 |     return utf8_string;
166 | }
167 | std::string sbuf_stream::getUTF8() {
168 |     std::string ret = sbuf.getUTF8(offset);
169 |     size_t num_bytes = ret.length();
170 |     // if anything was read then also skip \0
171 |     if (num_bytes > 0) {
172 |         num_bytes++;
173 |     }
174 |     offset += num_bytes;
175 |     return ret;
176 | }
177 | 
178 | std::wstring sbuf_stream::getUTF16(size_t code_units_requested) {
179 |     std::wstring ret = sbuf.getUTF16(offset, code_units_requested);
180 |     offset += ret.length() * 2;
181 |     return ret;
182 | }
183 | std::wstring sbuf_stream::getUTF16() {
184 |     std::wstring utf16_string = sbuf.getUTF16(offset);
185 |     size_t num_bytes = utf16_string.length() * 2;
186 |     if (num_bytes > 0) {
187 |         // if anything was read then also skip \U0000
188 |         num_bytes += 2;
189 |     }
190 |     offset += num_bytes;
191 |     return utf16_string;
192 | }
193 | 


--------------------------------------------------------------------------------
/aftimer.h:
--------------------------------------------------------------------------------
  1 | #ifndef __AFTIMER_H__
  2 | #define __AFTIMER_H__
  3 | 
  4 | #include <atomic>
  5 | #include <ctime>
  6 | #include <cstdio>
  7 | #include <cassert>
  8 | #include <string>
  9 | #include <chrono>
 10 | #include <sstream>
 11 | #include <iomanip>
 12 | 
 13 | #include "utils.h"
 14 | 
 15 | /**
 16 |  * threadsafe timer.
 17 |  */
 18 | class aftimer {
 19 |     aftimer(const aftimer & s) = delete;
 20 |     aftimer & operator=(const aftimer &s) = delete;
 21 |     std::chrono::time_point<std::chrono::steady_clock> t0 {};
 22 |     std::atomic<bool>     running    {};
 23 |     std::atomic<uint64_t> elapsed_ns {}; //  for all times we have started and stopped
 24 |     std::atomic<uint64_t> last_ns    {}; // time from when we last did a "start"
 25 | public:
 26 |     static std::string now_str(std::string prefix="",std::string suffix="");              // return a high-resolution string as now.
 27 |     static std::string hms_str(long t);             // turn a number of seconds into h:m:s
 28 |     static std::string hms_ns_str(uint64_t ns);     // turn a number of nanoseconds into h:m:s
 29 |     static const uint64_t ns_per_s = 1000*1000*1000; // seconds per nanoseconds
 30 |     aftimer()  {}
 31 | 
 32 |     void start(); // start the timer
 33 |     void stop();  // stop the timer
 34 |     void lap();   // note the time for elapsed_seconds() below
 35 | 
 36 |     uint64_t running_nanoseconds() const;             // for how long have we been running?
 37 |     double elapsed_seconds() const;                   // how long timer has been running; timer can be running from the beginning
 38 |     uint64_t elapsed_nanoseconds() const;
 39 |     uint64_t lap_seconds() const;                     // how long the timer is running this time
 40 |     double eta(double fraction_done) const;           // calculate ETA in seconds, given fraction
 41 |     std::string elapsed_text() const;                 // how long we have been running
 42 |     std::string eta_text(double fraction_done) const; // h:m:s
 43 |     std::string eta_time(double fraction_done) const; // the actual time
 44 |     std::string eta_date(double fraction_done) const; // the actual date and time
 45 | };
 46 | 
 47 | /* This code is from:
 48 |  * http://social.msdn.microsoft.com/Forums/en/vcgeneral/thread/430449b3-f6dd-4e18-84de-eebd26a8d668
 49 |  * and:
 50 |  * https://gist.github.com/ugovaretto/5875385
 51 |  */
 52 | 
 53 | // https://stackoverflow.com/questions/16177295/get-time-since-epoch-in-milliseconds-preferably-using-c11-chrono
 54 | inline std::string aftimer::now_str(std::string prefix,std::string suffix) {
 55 |     //uint64_t nanoseconds_since_epoch  = std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now());
 56 |     //std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now().time_since_epoch()).count();
 57 |     uint64_t microseconds_since_epoch = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
 58 |     std::stringstream ss;
 59 |     ss << std::setprecision(4) << std::fixed << prefix << microseconds_since_epoch/1000 << suffix;
 60 |     return ss.str();
 61 | }
 62 | 
 63 | inline std::string aftimer::hms_str(long t)  {
 64 |     char buf[64];
 65 |     int days = t / (60 * 60 * 24);
 66 | 
 67 |     t = t % (60 * 60 * 24); /* what's left */
 68 | 
 69 |     int h = t / 3600;
 70 |     int m = (t / 60) % 60;
 71 |     int s = t % 60;
 72 |     buf[0] = 0;
 73 |     switch (days) {
 74 |     case 0: snprintf(buf, sizeof(buf), "%2d:%02d:%02d", h, m, s); break;
 75 |     case 1: snprintf(buf, sizeof(buf), "%d day, %2d:%02d:%02d", days, h, m, s); break;
 76 |     default: snprintf(buf, sizeof(buf), "%d days %2d:%02d:%02d", days, h, m, s);
 77 |     }
 78 |     return std::string(buf);
 79 | }
 80 | 
 81 | inline std::string aftimer::hms_ns_str(uint64_t ns)  {
 82 |     return hms_str(ns / ns_per_s);
 83 | }
 84 | 
 85 | inline void aftimer::start() {
 86 |     assert (running == false);
 87 |     t0 = std::chrono::steady_clock::now();
 88 |     running = true;
 89 | }
 90 | 
 91 | inline uint64_t aftimer::running_nanoseconds() const {
 92 |     auto v = std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - t0 );
 93 |     return v.count();
 94 | }
 95 | 
 96 | inline void aftimer::stop() {
 97 |     assert (running==true);
 98 |     last_ns = running_nanoseconds();
 99 |     elapsed_ns += last_ns;
100 |     running = false;
101 | }
102 | 
103 | inline void aftimer::lap() {
104 |     stop();
105 |     start();
106 | }
107 | 
108 | inline uint64_t aftimer::elapsed_nanoseconds() const {
109 |     if (running) {
110 |         return elapsed_ns + running_nanoseconds();
111 |     } else {
112 |         return elapsed_ns;
113 |     }
114 | }
115 | 
116 | inline double aftimer::elapsed_seconds() const {
117 |     return elapsed_nanoseconds() / double(ns_per_s);
118 | }
119 | 
120 | inline std::string aftimer::elapsed_text() const {
121 |     return hms_str((int)elapsed_seconds());
122 | }
123 | 
124 | /**
125 |  * returns the number of seconds until the job is complete.
126 |  */
127 | inline double aftimer::eta(double fraction_done) const {
128 |     double t = elapsed_seconds();
129 |     if (t <= 0) return -1;             // can't figure it out
130 |     if (fraction_done <= 0) return -1; // can't figure it out
131 |     return (t * 1.0 / fraction_done - t);
132 | }
133 | 
134 | /**
135 |  * Retuns the number of hours:minutes:seconds until the job is done.
136 |  */
137 | inline std::string aftimer::eta_text(double fraction_done) const {
138 |     double e = eta(fraction_done);
139 |     if (e < 0) return std::string("n/a"); // can't figure it out
140 |     return hms_str((long)e);
141 | }
142 | 
143 | /**
144 |  * Returns the time when data is due.
145 |  */
146 | inline std::string aftimer::eta_time(double fraction_done) const {
147 |     time_t t = time_t(eta(fraction_done)) + time(0);
148 |     struct tm tm;
149 |     localtime_r(&t, &tm);
150 |     char buf[64];
151 |     snprintf(buf, sizeof(buf), "%02d:%02d:%02d", tm.tm_hour, tm.tm_min, tm.tm_sec);
152 |     return std::string(buf);
153 | }
154 | 
155 | inline std::string aftimer::eta_date(double fraction_done) const {
156 |     time_t t = time_t(eta(fraction_done)) + time(0);
157 |     struct tm tm;
158 |     localtime_r(&t, &tm);
159 |     char buf[64];
160 |     snprintf(buf, sizeof(buf), "%04d-%02d-%02d %02d:%02d:%02d",
161 |              tm.tm_year+1900, tm.tm_mon+1, tm.tm_mday,
162 |              tm.tm_hour, tm.tm_min, tm.tm_sec);
163 |     return std::string(buf);
164 | }
165 | 
166 | #endif
167 | 


--------------------------------------------------------------------------------
/histogram_def.h:
--------------------------------------------------------------------------------
  1 | #ifndef HISTOGRAM_DEF_H
  2 | #define HISTOGRAM_DEF_H
  3 | 
  4 | #include <cstdio>
  5 | #include <iostream>
  6 | #include <regex>
  7 | #include <string>
  8 | 
  9 | #include "unicode_escape.h"
 10 | 
 11 | /**
 12 |  * histogram_def defines the histograms that will be made by a feature recorder.
 13 |  * If the mhistogram is set, the histogram is generated when features are recorded
 14 |  * and kept in memory. If mhistogram is not set, the histogram is generated when the feature recorder is closed.
 15 |  */
 16 | 
 17 | struct histogram_def {
 18 |     struct flags_t {
 19 |         flags_t(const flags_t& a) {
 20 |             this->lowercase = a.lowercase;
 21 |             this->numeric = a.numeric;
 22 |             this->require_feature = a.require_feature;
 23 |             this->require_context = a.require_context;
 24 |         };
 25 | 
 26 |         flags_t& operator=(const flags_t& a) {
 27 |             this->lowercase = a.lowercase;
 28 |             this->numeric = a.numeric;
 29 |             this->require_feature = a.require_feature;
 30 |             this->require_context = a.require_context;
 31 |             return *this;
 32 |         };
 33 | 
 34 |         bool operator<(const flags_t& a) const {
 35 |             if (this->lowercase < a.lowercase) return true;
 36 |             if (this->lowercase > a.lowercase) return false;
 37 |             if (this->numeric < a.numeric) return true;
 38 |             if (this->numeric > a.numeric) return false;
 39 | 
 40 |             if (this->require_feature < a.require_feature) return true;
 41 |             if (this->require_feature > a.require_feature) return false;
 42 |             if (this->require_context < a.require_context) return true;
 43 |             if (this->require_context > a.require_context) return false;
 44 |             return false;
 45 |         }
 46 | 
 47 |         bool operator==(const flags_t& a) const {
 48 |             return (this->lowercase == a.lowercase) && (this->numeric == a.numeric) && (this->require_feature==a.require_feature) && (this->require_context==a.require_context);
 49 |         }
 50 | 
 51 |         flags_t(){};
 52 |         flags_t(bool lowercase_, bool numeric_) : lowercase(lowercase_), numeric(numeric_) {}
 53 |         bool lowercase       {false}; // make all flags lowercase
 54 |         bool numeric         {false};   // extract digits only
 55 |         bool require_feature {true};  // require text is applied to feature
 56 |         bool require_context {false};  // require text is applied to context
 57 |     };
 58 | 
 59 |     /**
 60 |      * @param feature - the feature file to histogram (no .txt)
 61 |      * @param pattern - the regular expression to extract.
 62 |      * @param suffix - the suffix to add to the histogram file after feature name before .txt
 63 |      * @param flags  - any flags (see above)
 64 |      * @param require- require this string on the line (usually in context)
 65 |      */
 66 | 
 67 |     histogram_def(const std::string& name_,
 68 |                   const std::string& feature_, // which feature file to use
 69 |                   const std::string& pattern_, // which pattern to abstract
 70 |                   const std::string& require_, // text required on the line
 71 |                   const std::string& suffix_,  // which suffix to add to the feature file name for the histogram
 72 |                   const struct flags_t& flags_);
 73 |     std::string name{};    // name of the hsitogram
 74 |     std::string feature{}; // feature file to extract
 75 |     std::string
 76 |         pattern{}; // regular expression used to extract feature substring from feature. "" means use the entire feature
 77 |     mutable std::regex reg{}; // the compiled regular expression.
 78 |     std::string require{};    // text required somewhere on the feature line. Sort of like grep. used for IP histograms
 79 |     std::string suffix{};     // suffix to append to histogram report name
 80 | 
 81 |     /* flags */
 82 |     struct flags_t flags {};
 83 | 
 84 |     /* default copy construction and assignment */
 85 |     histogram_def(const histogram_def& a) {
 86 |         this->name = a.name;
 87 |         this->feature = a.feature;
 88 |         this->pattern = a.pattern;
 89 |         this->reg = a.reg;
 90 |         this->require = a.require;
 91 |         this->suffix = a.suffix;
 92 |         this->flags = a.flags;
 93 |     };
 94 | 
 95 |     /* assignment operator */
 96 |     histogram_def& operator=(const histogram_def& a) {
 97 |         this->name = a.name;
 98 |         this->feature = a.feature;
 99 |         this->pattern = a.pattern;
100 |         this->reg = a.reg;
101 |         this->require = a.require;
102 |         this->suffix = a.suffix;
103 |         this->flags = a.flags;
104 |         return *this;
105 |     }
106 | 
107 |     bool operator==(const histogram_def& a) const {
108 |         return (this->name == a.name) && (this->feature == a.feature) && (this->pattern == a.pattern) &&
109 |                (this->require == a.require) && (this->suffix == a.suffix) && (this->flags == a.flags);
110 |     }
111 | 
112 |     bool operator!=(const histogram_def& a) const { return !(*this == a); }
113 | 
114 |     /* comparator, so we can have a functioning map and set classes.'
115 |      * ignores reg.
116 |      */
117 |     bool operator<(const histogram_def& a) const {
118 |         if (this->name < a.name) return true;
119 |         if (this->name > a.name) return false;
120 |         if (this->feature < a.feature) return true;
121 |         if (this->feature > a.feature) return false;
122 |         if (this->pattern < a.pattern) return true;
123 |         if (this->pattern > a.pattern) return false;
124 |         if (this->require < a.require) return true;
125 |         if (this->require > a.require) return false;
126 |         if (this->suffix < a.suffix) return true;
127 |         if (this->suffix > a.suffix) return false;
128 |         if (this->flags < a.flags) return true;
129 |         return false;
130 |     }
131 | 
132 |     /* Match and extract:
133 |      * If the string matches this histogram, return true and optionally
134 |      * set match to Extract and match: Does this string match
135 |      */
136 | 
137 |     bool match(std::u32string u32key, std::string* displayString, const std::string &context) const;
138 |     bool match(std::string u32key,    std::string* displayString, const std::string &context) const;
139 | };
140 | 
141 | std::ostream& operator<<(std::ostream& os, const histogram_def::flags_t& f);
142 | std::ostream& operator<<(std::ostream& os, const histogram_def& hd);
143 | 
144 | #endif
145 | 


--------------------------------------------------------------------------------
/scanner_config.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * scanner_config.h:
  3 |  *
  4 |  * class to hold the full configuration of the scanner_set and the feature recorders.
  5 |  *
  6 |  * Includes a set of name=value pairs from the command line and the list of all scanners that
  7 |  * are enabled or disabled.
  8 |  *
  9 |  * This class is also used to build the help string.
 10 |  *
 11 |  * All of the scanners get the same config, so the names that the scanners want need to be unique.
 12 |  * We could have adopted a system where each scanner had its own configuration space, but we didn't.
 13 |  * Scanner histograms are added to 'histograms' by machinery.
 14 |  */
 15 | 
 16 | #ifndef _SCANNER_CONFIG_H_
 17 | #define _SCANNER_CONFIG_H_
 18 | 
 19 | #include <cinttypes>
 20 | #include <filesystem>
 21 | #include <map>
 22 | #include <sstream>
 23 | #include <string>
 24 | #include <vector>
 25 | 
 26 | #include "utils.h"
 27 | 
 28 | /* There is only one scanner-config object. It is called for all of the scanners
 29 |  */
 30 | struct scanner_config {
 31 |     /**
 32 |      * Commands whether to enable or disable a scanner.
 33 |      * Typically created from parsing command-line arguments
 34 |      */
 35 |     struct scanner_command {
 36 |         static inline const std::string ALL_SCANNERS = "all";
 37 |         enum command_t { DISABLE, ENABLE };
 38 |         scanner_command(const scanner_command& sc) : scannerName(sc.scannerName), command(sc.command){};
 39 |         scanner_command(const std::string& scannerName_, scanner_command::command_t c)
 40 |             : scannerName(scannerName_), command(c){};
 41 |         std::string scannerName{};
 42 |         command_t command{};
 43 |         /* default copy construction and assignment */
 44 |         scanner_command& operator=(const scanner_command& a) {
 45 |             this->scannerName = a.scannerName;
 46 |             this->command = a.command;
 47 |             return *this;
 48 |         }
 49 |     };
 50 | 
 51 | private:
 52 |     /* The global configuration */
 53 |     typedef std::map<std::string, std::string> config_t; // configuration for scanner passed in
 54 |     config_t namevals{};                                 //  (input) name=val map
 55 |     std::string global_help_options {""};
 56 |     // The commands for those scanners (enable, disable, options, etc.
 57 |     typedef std::vector<struct scanner_config::scanner_command> scanner_commands_t;
 58 |     scanner_commands_t scanner_commands {};
 59 | 
 60 | public:
 61 |     const scanner_commands_t get_scanner_commands() {
 62 |         return static_cast<const scanner_commands_t>(scanner_commands);
 63 |     }
 64 |     void set_config(std::string name, std::string val) {
 65 |         namevals[name] = val;
 66 |     }
 67 |     std::string get_help() const { return global_help_options;}
 68 | 
 69 |     template <typename T> void get_global_config(const std::string& name, T* val, const std::string& help) {
 70 |         std::stringstream s;
 71 |         s << "   -S " << name << "=" << *val << "    " << help << " (" << name << ")\n";
 72 |         global_help_options += s.str(); // add the help in
 73 | 
 74 |         auto it = namevals.find(name);
 75 |         if (it != namevals.end() && val) {
 76 |             set_from_string(val, it->second);
 77 |         }
 78 |     }
 79 | 
 80 |     /* Find options */
 81 |     struct {
 82 |         std::vector<std::filesystem::path> files {};     // accumulates pattern files
 83 |         std::vector<std::string> patterns {};            // accumulates cmdline patterns
 84 |     } FindOpts {};
 85 | 
 86 |     bool find_opts_empty() const {
 87 |         return FindOpts.files.empty() && FindOpts.patterns.empty();
 88 |     }
 89 | 
 90 |     // Find interface
 91 |     const std::vector<std::string> &find_patterns() const        { return FindOpts.patterns; }
 92 |     const std::vector<std::filesystem::path> &find_files() const { return FindOpts.files; }
 93 |     void add_find_pattern(std::string pattern)                   { FindOpts.patterns.push_back(pattern);}
 94 |     void add_find_path(std::filesystem::path path)               { FindOpts.files.push_back(path);}
 95 | 
 96 | 
 97 |     size_t context_window_default{16}; // global option
 98 |     uint64_t offset_add{0}; // add this number to the first offset in every feature file (used for parallelism)
 99 |     std::filesystem::path banner_file{}; // add the contents of this file to the top of every feature file
100 |     static inline const uint32_t DEFAULT_MAX_DEPTH {12};
101 |     static inline const uint32_t DEFAULT_MAX_NGRAM {10};
102 |     virtual ~scanner_config(){};
103 |     scanner_config(){};
104 |     scanner_config(const scanner_config&) = default;
105 |     std::filesystem::path input_fname {NO_INPUT}; // where input comes from
106 |     std::filesystem::path outdir {NO_OUTDIR};     // where output goes
107 |     std::string hash_algorithm {"sha1"};          // which hash algorithm are using; default to SHA1
108 | 
109 |     bool allow_recurse { true };         // can be turned off for testing
110 | 
111 |     inline static const std::string NO_INPUT = "<NO-INPUT>"; // 'filename' indicator that the FRS has no input file
112 |     inline static const std::string NO_OUTDIR = "<NO-OUTDIR>"; // 'dirname' indicator that the FRS produces no file output
113 |     inline static const std::string CARVE_MODE_SUFFIX = "_carve_mode";
114 | 
115 |     std::string get_nameval(std::string name) const {
116 |         auto it = namevals.find(name);
117 |         return it != namevals.end() ? it->second  : "";
118 |     }
119 | 
120 |     int get_carve_mode(const std::string name) const {
121 |         std::string option_name = name + CARVE_MODE_SUFFIX;
122 |         config_t::const_iterator it = namevals.find(option_name);
123 |         if (it == namevals.end()) return -1;
124 |         return std::stoi( std::string(it->second));
125 |     }
126 | 
127 |     /* Set configuration; added to the static config */
128 |     uint32_t max_depth {DEFAULT_MAX_DEPTH};
129 |     uint32_t max_ngram {DEFAULT_MAX_NGRAM};                         // maximum ngram size to scan for
130 | 
131 |     /* Control which scanners are enabled */
132 |     // enable/disable a specific scanner
133 |     void push_scanner_command(const std::string& scannerName, scanner_command::command_t c) {
134 |         scanner_commands.push_back(scanner_command(scannerName, c));
135 |     }
136 |     void enable_all_scanners() {
137 |         push_scanner_command(scanner_command::ALL_SCANNERS, scanner_command::ENABLE);
138 |     }
139 |     void disable_all_scanners() {
140 |         push_scanner_command(scanner_command::ALL_SCANNERS, scanner_command::DISABLE);
141 |     }
142 | };
143 | 
144 | #endif
145 | 


--------------------------------------------------------------------------------
/atomic_unicode_histogram.cpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * atomic_unicode_histogram.cpp:
  3 |  * Maintain a histogram for Unicode strings provided with either UTF-8 or UTF-16 encodings.
  4 |  * Track number of UTF-16 encodings provided.
  5 |  *
  6 |  * Currently, all operations are done on UTF-8 values, because the C++17 regular expression package
  7 |  * does not handle 32-bit regular expressions.
  8 |  */
  9 | 
 10 | #include "unicode_escape.h"
 11 | #include "utf8.h"
 12 | 
 13 | #include <cwctype>
 14 | #include <fstream>
 15 | #include <iostream>
 16 | #include <regex>
 17 | #include <string>
 18 | 
 19 | #include "atomic_unicode_histogram.h"
 20 | 
 21 | std::ostream& operator<<(std::ostream& os, const AtomicUnicodeHistogram::FrequencyReportVector& rep) {
 22 |     for (const auto& it : rep) {
 23 |         os << it;
 24 |     }
 25 |     return os;
 26 | }
 27 | 
 28 | /* Output is in UTF-8 */
 29 | std::ostream& operator<<(std::ostream& os, const AtomicUnicodeHistogram::auh_t::item& e) {
 30 |     os << "n=" << e.value->count << "\t" << validateOrEscapeUTF8(e.key, true, false, false);
 31 |     if (e.value->count16 > 0) os << "\t(utf16=" << e.value->count16 << ")";
 32 |     os << "\n";
 33 |     return os;
 34 | }
 35 | 
 36 | /* Create a histogram report.
 37 |  * @param topN - if >0, return only this many.
 38 |  * Return only the topN.
 39 |  */
 40 | std::vector<AtomicUnicodeHistogram::auh_t::item> AtomicUnicodeHistogram::makeReport(size_t topN)
 41 | {
 42 |     const std::lock_guard<std::mutex> lock(M);
 43 |     std::vector<AtomicUnicodeHistogram::auh_t::item> ret = h.items();
 44 | 
 45 |     std::sort(ret.begin(), ret.end(), AtomicUnicodeHistogram::histogram_compare); // reverse sort
 46 | 
 47 |     /* If we only want some of them, delete the extra */
 48 |     if ((topN > 0) && (topN < ret.size())) {
 49 |         ret.erase( ret.begin()+topN, ret.end());
 50 |     }
 51 |     return ret;
 52 | }
 53 | 
 54 | /**
 55 |  * Takes a string (the key) passed in, figure out what it is, and add it to a unicode histogram.
 56 |  * Typically it is going to be UTF16 or UTF8.
 57 |  * Regular expressions are applied, if requested, to the UTF-8 form of the key.
 58 |  *
 59 |  * @param - key - either a UTF8 or UTF16 string.
 60 |  * If the string appears to be UTF16, convert it to UTF-8 and note that it was converted.
 61 |  *
 62 |  * def.flags.numeric   - extract the digits first and throw away the rest.
 63 |  * def.flags.lowercase - also convert to lowercase using Unicode rules.
 64 |  */
 65 | 
 66 | // https://stackoverflow.com/questions/37989081/how-to-use-unicode-range-in-c-regex
 67 | 
 68 | // debug_histogram_malloc_fail_frequency allows us to simulate low-memory situations for testing the code.
 69 | uint32_t AtomicUnicodeHistogram::debug_histogram_malloc_fail_frequency = 0;
 70 | void AtomicUnicodeHistogram::clear()
 71 | {
 72 |     const std::lock_guard<std::mutex> lock(M);
 73 |     h.clear();
 74 | }
 75 | 
 76 | // low-level add after key has been converted to UTF8
 77 | void AtomicUnicodeHistogram::add0(const std::string& u8key, const std::string &context, bool found_utf16)
 78 | {
 79 |     std::string displayString;
 80 | 
 81 |     if (def.match(u8key, &displayString, context)) {
 82 | 
 83 |         if (debug) std::cerr << "  AtomicUnicodeHistogram::add0 match u8key=" << u8key << std::endl;
 84 | 
 85 |         /* Escape as necessary */
 86 |         displayString = validateOrEscapeUTF8(displayString, true, true, false);
 87 | 
 88 |         /* For debugging low-memory handling logic,
 89 |          * specify DEBUG_MALLOC_FAIL to make malloc occasionally fail (not yet implemented)
 90 |          */
 91 |         if (debug_histogram_malloc_fail_frequency) {
 92 |             const std::lock_guard<std::mutex> lock(M);
 93 |             if ((h.size() % debug_histogram_malloc_fail_frequency) == (debug_histogram_malloc_fail_frequency - 1)) {
 94 |                 throw std::bad_alloc();
 95 |             }
 96 |         }
 97 | 
 98 |         /* Add the key to the histogram. Note that this is threadsafe */
 99 |         const std::lock_guard<std::mutex> lock(M);
100 |         h[displayString].count++;
101 |         if (found_utf16) {
102 |             h[displayString].count16++; // track how many UTF16s were converted
103 |         }
104 |         if (debug) std::cerr << "  AtomicUnicodeHistogram::add0 h[" <<displayString << "].count=" << h[displayString].count << std::endl;
105 |     }
106 | }
107 | 
108 | void AtomicUnicodeHistogram::add_feature_context(const std::string& key_unknown_encoding, const std::string& context)
109 | {
110 |     if (key_unknown_encoding.size() == 0) return; // don't deal with zero-length keys
111 | 
112 |     /* On input, the key may be UTF8 or UTF16. See if we can figure it out */
113 |     bool found_utf16   = false;   // did we find a utf16?
114 |     bool little_endian = false; // was it little_endian?
115 |     std::u32string u32key;      // u32key. Doesn't matter if LE or BE, because we never write it out.
116 | 
117 |     if (looks_like_utf16(key_unknown_encoding, little_endian)) {
118 |         // We have an endian-guessing implementation that converts from 16 to 8, so convert from 16 to 8
119 |         // and then convert it to utf32
120 |         u32key = convert_utf8_to_utf32(convert_utf16_to_utf8(key_unknown_encoding, little_endian));
121 |         found_utf16 = true;
122 |     } else {
123 |         u32key = convert_utf8_to_utf32(key_unknown_encoding);
124 |     }
125 | 
126 |     /* At this point we have UTF-32, which we treat as raw unicode characters.
127 |      *
128 |      * We would like to process lowercase, numeric and regular expressions in utf32 world.
129 |      * Ideally this would be done with ICU, but we do not want to assume we have ICU.
130 |      * https://stackoverflow.com/questions/34433380/lowercase-of-unicode-character
131 |      * https://stackoverflow.com/questions/313970/how-to-convert-stdstring-to-lower-case/24063783
132 |      * https://en.cppreference.com/w/cpp/string/wide/towlower
133 |      * See: http://stackoverflow.com/questions/1081456/wchar-t-vs-wint-t
134 |      *
135 |      * One possibility is the SRELL library, which is included in this repo.
136 |      *
137 |      * Instead, we just convert to UTF-8 and then treat it with the C++17 8-bit regular expression package.
138 |      *
139 |      * See also:
140 |      * https://www.moria.us/articles/wchar-is-a-historical-accident/?
141 |      */
142 | 
143 |     std::string u8key = convert_utf32_to_utf8(u32key);
144 |     add0(u8key, context, found_utf16);
145 | }
146 | 
147 | size_t AtomicUnicodeHistogram::size() const // returns the total number of bytes of the histogram,.
148 | {
149 |     const std::lock_guard<std::mutex> lock(M);
150 |     return h.size();
151 | }
152 | 
153 | size_t AtomicUnicodeHistogram::bytes() const // returns the total number of bytes of the histogram,.
154 | {
155 |     const std::lock_guard<std::mutex> lock(M);
156 |     return sizeof(*this) + h.bytes();
157 | }
158 | 


--------------------------------------------------------------------------------
/pos0.h:
--------------------------------------------------------------------------------
  1 | #ifndef _FPOS0_H_
  2 | #define _FPOS0_H_
  3 | 
  4 | #include <exception>
  5 | #include <algorithm>
  6 | #include <cinttypes>
  7 | #include <cctype>
  8 | #include <sstream>
  9 | #include <string>
 10 | #include <filesystem>
 11 | 
 12 | /****************************************************************
 13 |  *** pos0_t
 14 |  ****************************************************************/
 15 | 
 16 | /** \addtogroup bulk_extractor_APIs
 17 |  * @{
 18 |  */
 19 | /** \file */
 20 | /**
 21 |  * \class pos0_t
 22 |  * The pos0_t structure is used to record the forensic path of the
 23 |  * first byte of an sbuf. The forensic path can include strings associated
 24 |  * with decompressors and ordinals associated with offsets.
 25 |  *
 26 |  * e.g., 1000-GZIP-300-BASE64-30 means go 1000 bytes into the stream,
 27 |  *       unzip, go 300 bytes into the decompressed stream, un-BASE64, and
 28 |  *       go 30 bytes into that.
 29 |  *
 30 |  * pos0_t uses a string to hold the base path and the offset into that path
 31 |  * in a 64-bit number.
 32 |  */
 33 | 
 34 | inline int64_t stoi64(std::string str) {
 35 |     int64_t val(0);
 36 |     std::istringstream ss(str);
 37 |     ss >> val;
 38 |     return val;
 39 | }
 40 | 
 41 | class pos0_t {
 42 |     mutable int depth_ {-1};               // if -1, it needs to be calculated. 0 is top.
 43 | 
 44 | public:
 45 |     inline static const std::string U10001C = "\xf4\x80\x80\x9c"; // default delimeter character in bulk_extractor
 46 |     static std::string map_file_delimiter;                        // character placed
 47 |     static void set_map_file_delimiter(const std::string new_delim) { map_file_delimiter = new_delim; }
 48 |     const std::string path{}; /* forensic path of decoders*/
 49 |     const uint64_t offset{0}; /* location of buf[0] */
 50 | 
 51 |     explicit pos0_t() {}                                                 // the beginning of a nothing
 52 |     explicit pos0_t(std::string s, uint64_t o = 0) : path(s), offset(o) {}        // s can be a full path
 53 |     explicit pos0_t(std::filesystem::path fn, std::string s, uint64_t o = 0) :
 54 |         path(fn.string() + pos0_t::map_file_delimiter + s), offset(o) {}
 55 |     pos0_t(const pos0_t& obj) : path(obj.path), offset(obj.offset) {}    // copy operator
 56 | 
 57 |     /* Every new layer is indicated by a "-" followed by a letter.
 58 |      * This threadsafe, but it may need to be computed twice if two
 59 |      * computations are happening at the same time.
 60 |      */
 61 |     static unsigned int calc_depth(const std::string& s)  {
 62 |         if (s.size()<2) {
 63 |             return 0;
 64 |         }
 65 |         unsigned int cdepth = 0;
 66 |         for( size_t i = 0; i<s.size()-1; i++ ){
 67 |             if (s[i]=='-' && isupper(s[i+1])) {
 68 |                 cdepth += 1;
 69 |             }
 70 |         }
 71 |         return cdepth;
 72 |     }
 73 |     unsigned int depth() const {
 74 |         if (depth_ == -1 ){
 75 |             depth_ = calc_depth(path);
 76 |         }
 77 |         return depth_;
 78 |     }
 79 | 
 80 |     std::string str() const { // convert to a string, with offset included
 81 |         std::stringstream ss;
 82 |         if (path.size() > 0) { ss << path << "-"; }
 83 |         ss << offset;
 84 |         return ss.str();
 85 |     }
 86 |     bool isRecursive() const { // is there a path?
 87 |         return path.size() > 0;
 88 |     }
 89 |     bool contains(const std::string &name) const { // does it contain this name?
 90 |         return (path.find(name) != std::string::npos);
 91 |     }
 92 | 
 93 |     std::string firstPart() const { // the first part of the path
 94 |         size_t p = path.find('-');
 95 |         if (p == std::string::npos) return std::string("");
 96 |         return path.substr(0, p);
 97 |     }
 98 |     std::string lastAddedPart() const { // the last part of the path, before the offset
 99 |         size_t p = path.rfind('-');
100 |         if (p == std::string::npos) return std::string("");
101 |         return path.substr(p + 1);
102 |     }
103 |     std::string alphaPart() const { // return the non-numeric parts, with /'s between each
104 |         std::string desc;
105 |         bool inalpha = false;
106 |         /* Now get the std::string part of pos0 */
107 |         for (const auto &it : path) {
108 |             if ((it) == '-') {
109 |                 if (desc.size() > 0 && desc.at(desc.size() - 1) != '/') desc += '/';
110 |                 inalpha = false;
111 |             }
112 |             if (isalpha(it) || (inalpha && isdigit(it))) {
113 |                 desc += it;
114 |                 inalpha = true;
115 |             }
116 |         }
117 |         return desc;
118 |     }
119 |     uint64_t imageOffset() const { // return the offset from start of disk
120 |         if (path.size() > 0) return stoi64(path);
121 |         return offset;
122 |     }
123 | 
124 |     /**
125 |      * Return a new position that's been shifted by an offset
126 |      */
127 |     pos0_t shift(int64_t s) const {
128 |         if (s == 0) return *this;
129 |         size_t p = path.find('-');
130 |         if (p == std::string::npos) { // no path
131 |             return pos0_t("", offset + s);
132 |         }
133 |         /* Figure out the value of the shift */
134 |         int64_t baseOffset = stoi64(path.substr(0, p - 1));
135 |         std::stringstream ss;
136 |         ss << (baseOffset + s) << path.substr(p);
137 |         return pos0_t(ss.str(), offset);
138 |     }
139 | };
140 | 
141 | /** iostream support for the pos0_t */
142 | inline std::ostream& operator<<(std::ostream& os, const class pos0_t& pos0) {
143 |     os << "(" << pos0.path << "|" << pos0.offset << ")";
144 |     return os;
145 | }
146 | 
147 | /** Append a string (subdir).
148 |  * The current offset is a prefix to the subdir.
149 |  */
150 | inline class pos0_t operator+(pos0_t pos, const std::string& subdir) {
151 |     std::stringstream ss;
152 |     ss << pos.path << (pos.path.size() > 0 ? "-" : "") << pos.offset << "-" << subdir;
153 |     return pos0_t(ss.str(), 0);
154 | };
155 | 
156 | /** Adding an offset */
157 | inline class pos0_t operator+(pos0_t pos, size_t delta) {
158 |     return pos0_t(pos.path, pos.offset + delta);
159 | };
160 | 
161 | /** Subtracting an offset */
162 | inline class pos0_t operator-(pos0_t pos, size_t delta) {
163 |     if (delta > pos.offset) {
164 |         throw std::runtime_error("attempt to subtract a delta from an pos0_t that is larger that pos.offset");
165 |     }
166 |     return pos0_t(pos.path, pos.offset - delta);
167 | };
168 | 
169 | /** \name Comparison operations
170 |  * @{
171 |  */
172 | inline bool operator<(const class pos0_t& pos0, const class pos0_t& pos1) {
173 |     if (pos0.path.size() == 0 && pos1.path.size() == 0) return pos0.offset < pos1.offset;
174 |     if (pos0.path == pos1.path) return pos0.offset < pos1.offset;
175 |     return pos0.path < pos1.path;
176 | };
177 | 
178 | inline bool operator>(const class pos0_t& pos0, const class pos0_t& pos1) {
179 |     if (pos0.path.size() == 0 && pos1.path.size() == 0) return pos0.offset > pos1.offset;
180 |     if (pos0.path == pos1.path) return pos0.offset > pos1.offset;
181 |     return pos0.path > pos1.path;
182 | };
183 | 
184 | inline bool operator==(const class pos0_t& pos0, const class pos0_t& pos1) {
185 |     return pos0.path == pos1.path && pos0.offset == pos1.offset;
186 | };
187 | 
188 | inline bool operator!=(const class pos0_t& pos0, const class pos0_t& pos1) { return !(pos0 == pos1); };
189 | /** @} */
190 | #endif
191 | 


--------------------------------------------------------------------------------
/feature_recorder_sql.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Feature recorder mods for writing features into an SQLite3 database.
  3 |  */
  4 | 
  5 | /* http://blog.quibb.org/2010/08/fast-bulk-inserts-into-sqlite/ */
  6 | 
  7 | #include "config.h"
  8 | 
  9 | #ifdef HAVE_SQLITE3_H
 10 | 
 11 | #include <cstdio>
 12 | #include <cstdlib>
 13 | #include <cstring>
 14 | #include <unistd.h>
 15 | 
 16 | #include "feature_recorder_set.h"
 17 | #include "feature_recorder_sql.h"
 18 | #include "sbuf.h"
 19 | 
/*
 * Construct an SQL feature recorder by forwarding the feature recorder set
 * and the definition to the feature_recorder base class.
 * The table-creation / prepared-statement setup below is compiled out
 * (#if 0), so at present the constructor does nothing beyond the base-class
 * construction and the disabled check.
 */
feature_recorder_sql::feature_recorder_sql(class feature_recorder_set& fs_, const feature_recorder_def def_)
    : feature_recorder(fs_, def_) {
    /*
     * If the feature recorder set is disabled, just return.
     */
    if (fs.flags.disabled) return;
    /* write to a database? Create tables if necessary and create a prepared statement */

#if 0
    /* Disabled: create the table for this recorder and prepare its INSERT statement. */
    char buf[1024];
    fs.db_create_table(name);
    snprintf( buf, sizeof(buf), db_insert_stmt,name.c_str() );
    bs = new besql_stmt( fs.db3, buf );
#endif
}
 35 | 
 36 | feature_recorder_sql::~feature_recorder_sql() {}
 37 | 
 38 | #if 0
 39 | #define DB_INSERT_STMT                                                                                                 \
 40 |     "INSERT INTO f_%s (offset,path,feature_eutf8,feature_utf8,context_eutf8) VALUES (?1, ?2, ?3, ?4, ?5)"
 41 | const char *feature_recorder::db_insert_stmt = DB_INSERT_STMT;
 42 | 
 43 | void feature_recorder::besql_stmt::insert_feature(const pos0_t &pos,
 44 |                                                         const std::string &feature,
 45 |                                                         const std::string &feature8, const std::string &context)
 46 | {
 47 |     assert(stmt!=0);
 48 |     const std::lock_guard<std::mutex> lock(Mstmt);           // grab a lock
 49 |     const std::string &path = pos.str();
 50 |     sqlite3_bind_int64(stmt, 1, pos.imageOffset()); // offset
 51 |     sqlite3_bind_text(stmt, 2, path.data(), path.size(), SQLITE_STATIC); // path
 52 |     sqlite3_bind_text(stmt, 3, feature.data(), feature.size(), SQLITE_STATIC);
 53 |     sqlite3_bind_text(stmt, 4, feature8.data(), feature8.size(), SQLITE_STATIC);
 54 |     sqlite3_bind_text(stmt, 5, context.data(), context.size(), SQLITE_STATIC);
 55 |     if (sqlite3_step(stmt) != SQLITE_DONE) {
 56 |         fprintf(stderr,"sqlite3_step failed\n");
 57 |     }
 58 |     sqlite3_reset(stmt);
 59 | };
 60 | 
 61 | feature_recorder::besql_stmt::besql_stmt(sqlite3 *db3,const char *sql):Mstmt(),stmt()
 62 | {
 63 |     assert(db3!=0);
 64 |     assert(sql!=0);
 65 |     sqlite3_prepare_v2(db3,sql, strlen(sql), &stmt, NULL);
 66 |     assert(stmt!=0);
 67 | }
 68 | 
 69 | feature_recorder::besql_stmt::~besql_stmt()
 70 | {
 71 |     assert(stmt!=0);
 72 |     sqlite3_finalize(stmt);
 73 |     stmt = 0;
 74 | }
 75 | 
 76 | /* Hook for writing feature to SQLite3 database */
 77 | void feature_recorder::write0_sqlite3(const pos0_t &pos0,const std::string &feature,const std::string &context)
 78 | {
 79 |     /**
 80 |      * Note: this is not very efficient, passing through a quoted feature and then unquoting it.
 81 |      * We could make this more efficient.
 82 |      */
 83 |     std::string *feature8 = AtomicUnicodeHistogram::convert_utf16_to_utf8(feature_recorder::unquote_string(feature));
 84 |     assert(bs!=0);
 85 |     bs->insert_feature(pos0,feature,
 86 |                          feature8 ? *feature8 : feature,
 87 |                          flag_set(feature_recorder::FLAG_NO_CONTEXT) ? "" : context);
 88 |     if (feature8) delete feature8;
 89 | }
 90 | 
 91 | /*** SQL Routines Follow ***
 92 |  *
 93 |  * Time results with ubnist1 on R4:
 94 |  * no SQL - 79 seconds
 95 |  * no pragmas - 651 seconds
 96 |  * "PRAGMA synchronous =  OFF", - 146 second
 97 |  * "PRAGMA synchronous =  OFF", "PRAGMA journal_mode=MEMORY", - 79 seconds
 98 |  *
 99 |  * Time with domexusers:
100 |  * no SQL -
101 |  */
102 | 
103 | #define SQLITE_EXTENSION ".sqlite"
104 | #ifndef SQLITE_DETERMINISTIC
105 | #define SQLITE_DETERMINISTIC 0
106 | #endif
107 | 
108 | /* This creates the base histogram. Note that the SQL fails if the histogram exists */
/* Null-terminated list of statements: one table h_<name> plus an index on each of its columns.
 * Each %s is a printf-style placeholder (presumably filled in by db_send_sql -- confirm).
 */
static const char *schema_hist[] = {
    "CREATE TABLE h_%s (count INTEGER(12), feature_utf8 TEXT)",
    "CREATE INDEX h_%s_idx1 ON h_%s(count)",
    "CREATE INDEX h_%s_idx2 ON h_%s(feature_utf8)",
    0};

/* This performs the histogram operation: count the rows of f_<name> per feature_utf8 into h_<name> */
static const char *schema_hist1[] = {
    "INSERT INTO h_%s select COUNT(*),feature_utf8 from f_%s GROUP BY feature_utf8",
    0};

/* Summarize an existing histogram table through the SQL function BEHIST, dropping rows where it returns '' */
static const char *schema_hist2[] = {
    "INSERT INTO h_%s select sum(count),BEHIST(feature_utf8) from h_%s where BEHIST(feature_utf8)!='' GROUP BY BEHIST(feature_utf8)",
    0};
123 | 
124 | 
125 | 
126 | void feature_recorder::dump_histogram_sqlite3(const histogram_def &def,void *user,feature_recorder::dump_callback_t cb) const
127 | {
128 |     /* First check to see if there exists a feature histogram summary. If not, make it */
129 |     std::string query = "SELECT name FROM sqlite_master WHERE type='table' AND name='h_" + def.feature +"'";
130 |     char *errmsg=0;
131 |     int rowcount=0;
132 |     if ( sqlite3_exec(fs.db3,query.c_str(),callback_counter,&rowcount,&errmsg)){
133 |         std::cerr << "sqlite3: " << errmsg << "\n";
134 |         return;
135 |     }
136 |     if (rowcount==0){
137 |         const char *feature = def.feature.c_str();
138 |         fs.db_send_sql( fs.db3, schema_hist, feature, feature); // creates the histogram
139 |         fs.db_send_sql( fs.db3, schema_hist1, feature, feature); // creates the histogram
140 |     }
141 |     /* Now create the summarized histogram for the regex, if it is not existing, but only if we have
142 |      * sqlite3_create_function_v2
143 |      */
144 |     if (def.pattern.size()>0){
145 |         /* Create the database where we will add the histogram */
146 |         std::string hname = def.feature + "_" + def.suffix;
147 | 
148 |         /* Remove any "-" characters if present */
149 |         for(size_t i=0;i<hname.size();i++){
150 |             if (hname[i]=='-') hname[i]='_';
151 |         }
152 | 
153 |         if(debug) std::cerr << "CREATING TABLE = " << hname << "\n";
154 |         if (sqlite3_create_function_v2(fs.db3,"BEHIST",1,SQLITE_UTF8|SQLITE_DETERMINISTIC,
155 |                                        (void *)&def,dump_hist,0,0,0)) {
156 |             std::cerr << "could not register function BEHIST\n";
157 |             return;
158 |         }
159 |         const char *fn = def.feature.c_str();
160 |         const char *hn = hname.c_str();
161 |         fs.db_send_sql(fs.db3,schema_hist, hn , hn); // create the table
162 |         fs.db_send_sql(fs.db3,schema_hist2, hn , fn); // select into it from a function of the old histogram table
163 | 
164 |         /* erase the user defined function */
165 |         if (sqlite3_create_function_v2(fs.db3,"BEHIST",1,SQLITE_UTF8|SQLITE_DETERMINISTIC,
166 |                                        (void *)&def,0,0,0,0)) {
167 |             std::cerr << "could not remove function BEHIST\n";
168 |             return;
169 |         }
170 |     }
171 | }
172 | 
173 | void feature_recorder::write(const pos0_t &pos0, const std::string &feature, const std::string &context)
174 | {
175 |     if ( fs.flag_set(feature_recorder_set::ENABLE_SQLITE3_RECORDERS ) &&
176 |          this->flag_notset(feature_recorder::FLAG_NO_FEATURES_SQL) ) {
177 |         write0_sqlite3( pos0, feature, context);
178 |     }
179 | }
180 | #endif
181 | 
182 | #endif
183 | 


--------------------------------------------------------------------------------
/pcap_fake.cpp:
--------------------------------------------------------------------------------
  1 | /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
  2 | 
  3 | /*
  4 |  * pcap_fake.cpp
  5 |  * A fake libpcap implementation that can only read files without a filter.
  6 |  */
  7 | 
  8 | // config.h is needed solely to find out if we need pcap_fake.h or not.
  9 | #include "config.h"
 10 | 
 11 | #ifndef HAVE_LIBPCAP
 12 | #include "pcap_fake.h"
 13 | 
 14 | #include <fcntl.h>
 15 | #include <iostream>
 16 | #include <cstdlib>
 17 | #include <cstring>
 18 | 
 19 | #ifdef _WIN32
 20 | #define SET_BINMODE(f) _setmode(_fileno(f), _O_BINARY)
 21 | #else
 22 | #define SET_BINMODE(f) /* ignore */
 23 | #endif
 24 | 
 25 | /* pcap_fake's struct pcap just keeps track of the file that was opened and
 26 |  * whether or not it was byteswapped.
 27 |  */
struct pcap {
    FILE* fp;    // input file we are reading from
    int swapped; // nonzero when the file's magic number was byte-swapped; record headers are then swapped on read
    uint32_t linktype; // link type copied from the capture file header; returned by pcap_datalink()
    bool error;      // an error occurred
    bool break_loop; // break_loop was called; checked by pcap_loop() before each packet
    bool must_close; // pcap_close() calls fclose(fp) only when this is set
    char err_buf[128]; // storage for the message returned by pcap_geterr()
    uint8_t* pktbuf;   // malloc'ed packet buffer (sized from the header's snaplen); freed by pcap_close()
};
 38 | 
 39 | char* pcap_geterr(pcap_t* p) {
 40 |     snprintf(p->err_buf, sizeof(p->err_buf), "not implemented in pcap_fake");
 41 |     return p->err_buf;
 42 | }
 43 | 
 44 | /**
 45 |  * pcap_open_offline()
 46 |  * -- "The name "-" is a synonym for stdin" (pcap manual)
 47 |  * -- allocate the pcap_t structure
 48 |  * -- open a pcap capture file.
 49 |  */
 50 | pcap_t* pcap_open_offline(const char* fname, char* errbuf) {
 51 |     FILE* fp = strcmp(fname, "-") == 0 ? stdin : fopen(fname, "rb");
 52 |     if (!fp) {
 53 |         snprintf(errbuf, PCAP_ERRBUF_SIZE, "%s:%s", fname, strerror(errno));
 54 |         return 0;
 55 |     }
 56 |     pcap_t* p = pcap_fopen_offline(fp, errbuf);
 57 |     if (p && p->fp != stdin) p->must_close = true;
 58 |     return p;
 59 | }
 60 | 
 61 | char* pcap_lookupdev(char*) // not implemented
 62 | {
 63 |     fprintf(stderr, "pcap_fake.cpp:pcap_lookupdev: tcpflow was compiled without LIBPCAP. Will not live capture.\n");
 64 |     return 0;
 65 | }
 66 | 
 67 | pcap_t* pcap_open_live(const char*, int, int, int, char*) {
 68 |     fprintf(stderr, "pcap_fake.cpp:pcap_open_live: tcpflow was compiled without LIBPCAP. Will not live capture.\n");
 69 |     return 0;
 70 | }
 71 | 
 72 | inline uint32_t swap4(uint32_t x) {
 73 |     return (((x & 0xff000000) >> 24) | ((x & 0x00ff0000) >> 8) | ((x & 0x0000ff00) << 8) | ((x & 0x000000ff) << 24));
 74 | }
 75 | 
 76 | inline uint32_t swap2(uint16_t x) { return (((x & 0xff00) >> 8) | ((x & 0x00ff) << 8)); }
 77 | 
 78 | pcap_t* pcap_fopen_offline(FILE* fp, char* errbuf) {
 79 |     SET_BINMODE(fp);
 80 |     bool swapped = false;
 81 |     struct pcap_file_header header;
 82 |     if (fread(&header, sizeof(header), 1, fp) != 1) {
 83 |         snprintf(errbuf, PCAP_ERRBUF_SIZE, "Cannot read pcap header");
 84 |         return 0; // cannot read header
 85 |     }
 86 |     if (header.magic == 0xd4c3b2a1) { // check for swap
 87 |         header.magic = swap4(header.magic);
 88 |         header.version_major = swap2(header.version_major);
 89 |         header.version_minor = swap2(header.version_minor);
 90 |         header.thiszone = swap4(header.thiszone);
 91 |         header.sigfigs = swap4(header.sigfigs);
 92 |         header.snaplen = swap4(header.snaplen);
 93 |         header.linktype = swap4(header.linktype);
 94 |         swapped = true;
 95 |     }
 96 |     if (header.magic != 0xa1b2c3d4) {
 97 |         snprintf(errbuf, PCAP_ERRBUF_SIZE, "Cannot decode pcap header 0x%x; swapped=%d", header.magic, swapped);
 98 |         return 0;
 99 |     }
100 |     if (header.version_major != PCAP_VERSION_MAJOR || header.version_minor != PCAP_VERSION_MINOR) {
101 |         snprintf(errbuf, PCAP_ERRBUF_SIZE, "Cannot read pcap version %d.%d", header.version_major,
102 |                  header.version_minor);
103 |         return 0;
104 |     }
105 | 
106 |     pcap_t* ret = (pcap_t*)calloc(1, sizeof(pcap_t));
107 |     if (ret == 0) {
108 |         snprintf(errbuf, PCAP_ERRBUF_SIZE, "Cannot calloc %lu bytes", sizeof(pcap_t));
109 |         return 0;
110 |     }
111 |     ret->pktbuf = (uint8_t*)malloc(header.snaplen);
112 |     if (ret->pktbuf == 0) { // did we get the snaplen?
113 |         std::cerr << "Couldn't get header snaplen";
114 |         free(ret);
115 |         return 0;
116 |     }
117 |     // DEBUG(100) ("pcap_fake.cpp DEBUG: header.magic = %x", header.magic);
118 |     // DEBUG(100) ("pcap_fake.cpp DEBUG: header.version_major = %d", header.version_major);
119 |     // DEBUG(100) ("pcap_fake.cpp DEBUG: header.version_minor = %d", header.version_minor);
120 |     // DEBUG(100) ("pcap_fake.cpp DEBUG: header.thiszone = %d", header.thiszone);
121 |     // DEBUG(100) ("pcap_fake.cpp DEBUG: header.sigfigs = %d", header.sigfigs);
122 |     // DEBUG(100) ("pcap_fake.cpp DEBUG: header.snaplen = %d", header.snaplen);
123 |     // DEBUG(100) ("pcap_fake.cpp DEBUG: header.linktype = %d",header.linktype);
124 |     // DEBUG(100) ("pcap_fake.cpp DEBUG: ret->pktbuf = %s". ret->pktbuf);
125 |     ret->fp = fp;
126 |     ret->swapped = swapped;
127 |     ret->linktype = header.linktype;
128 |     return ret;
129 | }
130 | 
131 | /*
132 |  * These are not implemented in pcap_fake
133 |  */
134 | 
135 | int pcap_compile(pcap_t* p, struct bpf_program* program, const char* expression, int optimize, uint32_t mask) {
136 |     if (strlen(expression) == 0) {
137 |         program->valid = true;
138 |         return 0; // we can compile the empty expression
139 |     }
140 |     return -1; // we cannot compile otherwise
141 | }
142 | 
143 | int pcap_datalink(pcap_t* p) { return p->linktype; }
144 | 
145 | int pcap_setfilter(pcap_t* p, struct bpf_program* prog) {
146 |     if (prog->valid) return 0;
147 |     return -1;
148 | }
149 | 
150 | int pcap_loop(pcap_t* p, int cnt, pcap_handler callback, uint8_t* user) {
151 |     while (cnt != 0 && !feof(p->fp) && p->break_loop == false) {
152 |         uint32_t tv_sec;
153 |         uint32_t tv_usec;
154 | 
155 |         struct pcap_pkthdr hdr;
156 | 
157 |         /* Note: struct timeval is 16 bytes on MacOS and not 8 bytes,
158 |          * so we manually read and set up the structure
159 |          */
160 |         if (fread(&tv_sec, sizeof(uint32_t), 1, p->fp) != 1) break;
161 |         if (fread(&tv_usec, sizeof(uint32_t), 1, p->fp) != 1) break;
162 |         hdr.ts.tv_sec = tv_sec;
163 |         hdr.ts.tv_usec = tv_usec;
164 | 
165 |         if (fread(&hdr.caplen, sizeof(uint32_t), 1, p->fp) != 1) break;
166 |         if (fread(&hdr.len, sizeof(uint32_t), 1, p->fp) != 1) break;
167 | 
168 |         /* Swap the header if necessary */
169 |         if (p->swapped) {
170 |             hdr.ts.tv_sec = swap4(hdr.ts.tv_sec);
171 |             hdr.ts.tv_usec = swap4(hdr.ts.tv_usec);
172 |             hdr.caplen = swap4(hdr.caplen);
173 |             hdr.len = swap4(hdr.len);
174 |         }
175 | 
176 |         /* Read the packet */
177 |         if (fread(p->pktbuf, hdr.caplen, 1, p->fp) != 1) break; // no more to read
178 | 
179 |         // DEBUG(100) ("pcap_fake: read tv_sec.tv_usec=%d.%06d  caplen=%d  len=%d",
180 |         // (int)hdr.ts.tv_sec,(int)hdr.ts.tv_usec,hdr.caplen,hdr.len);
181 | 
182 |         /* Process the packet */
183 |         (*callback)(user, &hdr, p->pktbuf);
184 | 
185 |         /* And loop */
186 |         if (cnt > 0) cnt--; // decrease the packet count
187 |     }
188 |     return 0;
189 | }
190 | 
191 | void pcap_break_loop(pcap_t* p) { p->break_loop = true; }
192 | 
193 | void pcap_close(pcap_t* p) // close the file
194 | {
195 |     if (p->must_close) fclose(p->fp);
196 |     free(p->pktbuf);
197 |     free(p);
198 | }
199 | 
200 | #endif
201 | 


--------------------------------------------------------------------------------
/atomic_map.h:
--------------------------------------------------------------------------------
  1 | /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
  2 | 
  3 | /**
  4 |  * defines atomic_map and atomic_set.
  5 |  * This is a nice lightweight atomic set when not much else is needed.
  6 |  *
 * 2020-07-06 - slg - Upgraded to C++17.
  8 |  */
  9 | 
 10 | #ifndef ATOMIC_MAP_H
 11 | #define ATOMIC_MAP_H
 12 | 
 13 | #include <algorithm>
 14 | #include <map>
 15 | #include <mutex>
 16 | #include <set>
 17 | #include <unordered_map>
 18 | #include <unordered_set>
 19 | #include <vector>
 20 | #include <iostream>
 21 | 
 22 | /*
 23 |  * Sample usage:
 24 |  * struct {int a, int b, int c} mycounter_t;
 25 |  * atomic_map<key, mycounter_t>.
 26 |  * Creates a defaultdict, and automatically cleans up memory.
 27 |  * Could be reimplemented to use smart pointers.
 28 |  */
 29 | 
 30 | template <class T1, class T2> class atomic_map {
 31 |     // T1 - key. For example, std::string
 32 |     // T2 - value. Should be a pointer.
 33 |     // Mutex M protects mymap.
 34 |     // It is mutable to allow modification in const methods
 35 |     mutable std::mutex M{};
 36 |     std::map<T1, T2 *> mymap{};
 37 | 
 38 | public:
 39 |     atomic_map() {}
 40 |     ~atomic_map() {
 41 |         /* delete everything in the map. Could do this with a unique_ptr? */
 42 |         clear();
 43 |     }
 44 |     class KeyError : public std::exception {
 45 |         T1 key;
 46 |     public:
 47 |         KeyError(T1 key_) : key(key_) {}
 48 |         const char* what() const noexcept override { return "did not convert key_ to a string"; }
 49 |     };
 50 |     /*
 51 |      * Create the behavior of a Python defaultdict:
 52 |      * If the object is not in the map, add it.
 53 |      * then return a reference to the object that is in the map.
 54 |      */
 55 |     T2 &operator[](const T1& key) {
 56 |         const std::lock_guard<std::mutex> lock(M);
 57 |         auto it = mymap.find(key);
 58 |         if (it == mymap.end()) {
 59 |             mymap[key] = new T2();
 60 |             return *(mymap[key]);
 61 |         }
 62 |         return *(it->second);
 63 |     }
 64 |     /*
 65 |      * Get behavior throws a key error if not present, and is const.
 66 |      */
 67 |     T2 &get(const T1& key) const {
 68 |         const std::lock_guard<std::mutex> lock(M);
 69 |         auto it = mymap.find(key);
 70 |         if (it == mymap.end()) {
 71 |             throw KeyError(key);
 72 |         }
 73 |         return *(it->second);
 74 |     }
 75 |     /*
 76 |      * insert. We want this in some cases. Fail if it already exists
 77 |      */
 78 |     void insert(const T1 &key, T2 *value) {
 79 |         const std::lock_guard<std::mutex> lock(M);
 80 |         auto it = mymap.find(key);
 81 |         if (it != mymap.end()) {
 82 |             throw KeyError(key);
 83 |         }
 84 |         mymap[key] = value;
 85 |     }
 86 | 
 87 |     /* We can't just pass-through to find, because we need to lock the mutext */
 88 |     typename std::map<T1, T2 *>::const_iterator find(const T1& key) const {
 89 |         const std::lock_guard<std::mutex> lock(M);
 90 |         return mymap.find(key);
 91 |     }
 92 |     /* We can't allow iteration through the map, since that would not be threadsafe, but we can allow the caller to get end(). */
 93 | #if 0
 94 |     typename std::map<T1, T2 *>::const_iterator begin() const {
 95 |         const std::lock_guard<std::mutex> lock(M);
 96 |         return mymap.begin();
 97 |     }
 98 | #endif
 99 |     typename std::map<T1, T2 *>::const_iterator end() const {
100 |         const std::lock_guard<std::mutex> lock(M);
101 |         return mymap.end();
102 |     }
103 | 
104 |     void clear() {
105 |         /* First delete all of the elements, then clear the map.  This
106 |          * might be better done with unique_ptr(). However, then we
107 |          * couldn't return a pointer, so we would need to use
108 |          * shared_ptr(), which would incur a higher cost.
109 |          */
110 |         for (const auto &it : mymap) {
111 |             delete it.second;
112 |         }
113 |         mymap.clear();
114 |     }
115 |     /* Number of elements */
116 |     size_t size() const {
117 |         const std::lock_guard<std::mutex> lock(M);
118 |         return mymap.size();
119 |     }
120 |     /* implement this later */
121 |     /* bytes */
122 |     size_t bytes() const {
123 |         const std::lock_guard<std::mutex> lock(M);
124 |         size_t count = sizeof(*this);
125 |         for (const auto &it : mymap) {
126 |             count += sizeof(it.first) + sizeof(it.second) + it.first.size() + it.second->bytes();
127 |         }
128 |         return count;
129 |     }
130 | 
131 |     bool contains(T1 key) const {
132 |         const std::lock_guard<std::mutex> lock(M);
133 |         return mymap.find(key) != mymap.end();
134 |     }
135 |     /* Like python .keys() */
136 |     typename std::vector<T1> keys() const {
137 |         const std::lock_guard<std::mutex> lock(M);
138 |         std::vector<T1>ret;
139 |         for (const auto &it : mymap) {
140 |             ret.push_back( it.first );
141 |         }
142 |         return ret;
143 |     }
144 | 
145 |     /* This is only threadsafe if the it.second is an object, and not a pointer*/
146 |     /* Like python .values(). It should actually return objects. */
147 |     typename std::vector<T2 *> values() const {
148 |         const std::lock_guard<std::mutex> lock(M);
149 |         std::vector<T2 *>ret;
150 |         for (const auto &it : mymap) {
151 |             ret.push_back( it.second );
152 |         }
153 |         return ret;
154 |     }
155 | 
156 | 
157 |     /* like Python .items() */
158 |     /* This is used for dumping the contents in a mostly threadsafe manner.
159 |      * The item that is return is a reference to what's in the atomic_map, so it better not be deleted,
160 |      * and if you want multiple threads to access it, the elements should be atomic.
161 |      * There is no reference counting on the pointer, so be careful!
162 |      * It would be useful to have a priority queue to get the topN.
163 |      */
164 |     struct item {
165 |         item(const item& s): key(s.key), value(s.value){};
166 |         item(item &&that) noexcept : key(that.key), value(that.value) {}
167 |         item& operator=(const item& s) { this->key = s.key; this->value = s.value; return *this;}
168 | 
169 |         item(T1 key_, T2 *value_) : key(key_), value(value_){};
170 |         T1 key{};                      // reference to the key in the histogram
171 |         T2 *value{};                   // a pointer to the histogram's object
172 |         // these comparisions only look at the keys
173 |         bool operator==(const item& a) const { return (this->key == a.key); }
174 |         bool operator!=(const item& a) const { return !(*this == a); }
175 |         bool operator<(const item& a) const {
176 |             if (this->key < a.key) return true;
177 |             return false;
178 |         }
179 |         static bool compare(const item& e1, const item& e2) { return e1 < e2; }
180 |         virtual ~item(){};
181 |         size_t bytes() const {
182 |             return sizeof(*this) + value->bytes();
183 |         } // number of bytes used by object
184 |     };
185 | 
186 |     std::vector<item> items() const {
187 |         std::vector<item> ret;
188 |         /* Protect access to mymap with mutex */
189 |         const std::lock_guard<std::mutex> lock(M);
190 |         for ( auto &pair:mymap ){
191 |             ret.push_back( item(pair.first, pair.second));
192 |         }
193 |         return ret;
194 |     }
195 |     void write(std::ostream &os) const {
196 |         const std::lock_guard<std::mutex> lock(M);
197 |         for (const auto &it : mymap) {
198 |             os << " " << it.first << ": " << (it.second) << "\n";
199 |         }
200 |     }
201 | };
202 | 
203 | #endif
204 | 


--------------------------------------------------------------------------------
/utils.h:
--------------------------------------------------------------------------------
  1 | /****************************************************************
  2 |  *** utils.h
  3 |  ***
  4 |  *** To use utils.c/utils.h, be sure this is in your configure.ac file:
  5 |       m4_include([be20_api/be20_configure.m4])
  6 |  ***
  7 |  ****************************************************************/
  8 | 
  9 | #ifndef UTILS_H
 10 | #define UTILS_H
 11 | 
 12 | #ifndef PACKAGE_NAME
 13 | #error utils.h requires that autoconf-generated config.h be included first
 14 | #endif
 15 | 
 16 | #include <array>
 17 | #include <cstdio>
 18 | #include <exception>
 19 | #include <filesystem>
 20 | #include <fstream>
 21 | #include <iostream>
 22 | #include <memory>
 23 | #include <random>
 24 | #include <sstream>
 25 | #include <stdexcept>
 26 | #include <string>
 27 | #include <unistd.h>
 28 | #include <vector>
 29 | 
 30 | bool getenv_debug(const char *name);    // look for an environment variable and return TRUE if it is set and not 0 or FALSE
 31 | bool starts_with(const std::string& buf, const std::string& with);
 32 | bool ends_with(const std::string& buf, const std::string& with);
 33 | bool ends_with(const std::wstring& buf, const std::wstring& with);
 34 | std::vector<std::string>& split(const std::string& s, char delim, std::vector<std::string>& elems);
 35 | std::vector<std::string> split(const std::string& s, char delim);
 36 | 
 37 | /* Read all of the lines of a file and return them as a vector */
 38 | std::vector<std::string> getLines(const std::filesystem::path path);
 39 | std::string getLast(const std::vector<std::string> &v); // returns the last line if v has more than one line, otherwise ''
 40 | 
 41 | inline void truncate_at(std::string& line, char ch) {
 42 |     size_t pos = line.find(ch);
 43 |     if (pos != std::string::npos) line.resize(pos);
 44 | };
 45 | 
 46 | inline void set_from_string(int *ret, std::string v) { *ret = std::stoi(v); };
 47 | inline void set_from_string(unsigned int *ret, std::string v) { *ret = std::stoul(v); };
 48 | inline void set_from_string(uint64_t *ret, std::string v) { *ret = std::stoull(v); };
 49 | inline void set_from_string(uint8_t *ret, std::string v) { *ret = std::stoul(v); };
 50 | 
 51 | inline void set_from_string(std::string *ret, std::string v) { *ret =  v; };
 52 | inline void set_from_string(bool *ret, std::string v) {
 53 |     *ret = (v.size()>0 && (v[0]=='Y' || v[0]=='y' || v[0]=='T' || v[0]=='t' || v[0]=='1'));
 54 | };
 55 | 
 56 | 
 57 | 
 58 | #ifndef HAVE_LOCALTIME_R
 59 | #ifdef __MINGW32__
 60 | #undef localtime_r
 61 | #endif
 62 | void localtime_r(time_t* t, struct tm* tm);
 63 | #endif
 64 | 
 65 | #ifndef HAVE_GMTIME_R
 66 | #ifdef __MINGW32__
 67 | #undef gmtime_r
 68 | #endif
 69 | void gmtime_r(time_t* t, struct tm* tm);
 70 | #endif
 71 | 
 72 | int64_t get_filesize(int fd);
 73 | 
 74 | #ifndef HAVE_ISHEXNUMBER
 75 | inline int ishexnumber(int c) {
 76 |     switch (c) {
 77 |     case '0':
 78 |     case '1':
 79 |     case '2':
 80 |     case '3':
 81 |     case '4':
 82 |     case '5':
 83 |     case '6':
 84 |     case '7':
 85 |     case '8':
 86 |     case '9':
 87 |     case 'A':
 88 |     case 'B':
 89 |     case 'C':
 90 |     case 'D':
 91 |     case 'E':
 92 |     case 'F':
 93 |     case 'a':
 94 |     case 'b':
 95 |     case 'c':
 96 |     case 'd':
 97 |     case 'e':
 98 |     case 'f': return 1;
 99 |     }
100 |     return 0;
101 | }
102 | #endif
103 | 
104 | #ifndef HAVE_ISXDIGIT
/* Fallback for platforms without isxdigit(): 1 for 0-9, a-f and A-F, otherwise 0 */
inline int isxdigit(int c) {
    if (c >= '0' && c <= '9') return 1;
    if (c >= 'a' && c <= 'f') return 1;
    if (c >= 'A' && c <= 'F') return 1;
    return 0;
}
106 | #endif
107 | 
/* Useful functions for scanners */
#define ONE_HUNDRED_NANO_SEC_TO_SECONDS 10000000
#define SECONDS_BETWEEN_WIN32_EPOCH_AND_UNIX_EPOCH 11644473600LL
/*
 * 11644473600 is the number of seconds between the Win32 epoch
 * and the Unix epoch.
 *
 * http://arstechnica.com/civis/viewtopic.php?f=20&t=111992
 * gmtime_r() is not available on every platform. This header declares a
 * replacement when HAVE_GMTIME_R is not defined (see utils.cpp).
 */

/**
 * Convert a Microsoft timestamp (a count of 100-nanosecond units since the
 * Win32 epoch) to an ISO 8601 UTC string, "YYYY-MM-DDTHH:MM:SSZ".
 * The sub-second part is discarded.
 */
inline std::string microsoftDateToISODate(const uint64_t& time) {
    /* Subtract in signed arithmetic so that dates before the Unix epoch give
     * a plain negative time_t instead of relying on unsigned wrap-around.
     */
    const time_t tmp = static_cast<time_t>(time / ONE_HUNDRED_NANO_SEC_TO_SECONDS) -
                       static_cast<time_t>(SECONDS_BETWEEN_WIN32_EPOCH_AND_UNIX_EPOCH);

    /* Zero-initialized: the replacement gmtime_r() returns void, so a failed
     * conversion cannot be detected here and must not leave garbage behind.
     */
    struct tm time_tm {};
    gmtime_r(&tmp, &time_tm);
    char buf[256] = "";
    strftime(buf, sizeof(buf), "%Y-%m-%dT%H:%M:%SZ", &time_tm); // Zulu time
    return std::string(buf);
}
128 | 
/**
 * Convert a Unix timestamp (seconds since 1970-01-01 UTC) to an ISO 8601 UTC
 * string, "YYYY-MM-DDTHH:MM:SSZ".
 */
inline std::string unixTimeToISODate(const uint64_t t) {
    /* Zero-initialized: gmtime_r() can fail for values whose year does not
     * fit, and the replacement gmtime_r() declared in this header returns
     * void, so failure cannot be detected here. Starting from zeros keeps
     * strftime() away from uninitialized memory in that case.
     */
    struct tm time_tm {};
    const time_t tmp = t;
    gmtime_r(&tmp, &time_tm);
    char buf[256] = "";
    strftime(buf, sizeof(buf), "%Y-%m-%dT%H:%M:%SZ", &time_tm); // Zulu time
    return std::string(buf);
}
138 | 
/* Many internal windows and Linux structures require a valid printable name in ASCII.
 * Returns true when every character lies in the range 0x20..0x7e: no control
 * characters, no DEL, no bytes with the high bit set. An empty name is accepted.
 */
inline bool validASCIIName(const std::string name) {
    for (const char raw : name) {
        const unsigned char byte = static_cast<unsigned char>(raw);
        const bool printable = (byte >= 0x20) && (byte <= 0x7e);
        if (!printable) return false;
    }
    return true;
}
148 | 
// https://stackoverflow.com/questions/3379956/how-to-create-a-temporary-directory-in-c
/**
 * Create a new, uniquely named directory "be_tmp<random hex>" below the
 * system temporary directory and return its path.
 *
 * @param max_tries how many random names are tried before giving up
 * @return path of the directory that was created
 * @throws std::runtime_error when no directory could be created within max_tries attempts
 */
inline std::filesystem::path NamedTemporaryDirectory(unsigned long long max_tries = 1000) {
    std::random_device dev;
    std::mt19937 prng(dev());
    std::uniform_int_distribution<uint64_t> rand(0);
    std::filesystem::path path;
    /* The counter has the same type as max_tries, so it cannot wrap around
     * before reaching a limit larger than UINT_MAX.
     */
    for (unsigned long long attempt = 0; attempt < max_tries; attempt++) {
        std::stringstream ss;
        ss << "be_tmp" << std::hex << rand(prng);
        path = std::filesystem::temp_directory_path() / ss.str();
        /* create_directory() returns false when the name already exists: try another one */
        if (std::filesystem::create_directory(path)) {
            return path;
        }
    }
    throw std::runtime_error("could not create NamedTemporaryDirectory");
}
165 | 
/**
 * Return false only when path is a directory that contains at least one entry.
 * An empty directory, and anything that is not a directory (including a path
 * that does not exist), yields true.
 */
inline bool directory_empty(std::filesystem::path path) {
    namespace fs = std::filesystem;
    if (!fs::is_directory(path)) {
        return true;
    }
    /* A default-constructed directory_iterator is the end iterator */
    const fs::directory_iterator first_entry(path);
    const fs::directory_iterator no_more_entries;
    return first_entry == no_more_entries;
}
176 | 
177 | uint64_t scaled_stoi64(const std::string &str);
178 | 
// https://stackoverflow.com/questions/478898/how-do-i-execute-a-command-and-get-the-output-of-the-command-within-c-using-po
/**
 * Run cmd through popen() and return everything it wrote to its standard output.
 *
 * @param cmd command line handed to popen()
 * @return the collected output
 * @throws std::runtime_error when popen() fails
 */
inline std::string subprocess_call(const char* cmd) {
    using pipe_closer = int (*)(FILE *);
    std::unique_ptr<FILE, pipe_closer> child(popen(cmd, "r"), pclose); // pclose() runs when child goes out of scope
    if (child.get() == nullptr) {
        throw std::runtime_error("popen() failed!");
    }

    std::array<char, 4096> chunk;
    std::string output;
    for (;;) {
        if (fgets(chunk.data(), chunk.size(), child.get()) == nullptr) break; // end of output
        output += chunk.data();
    }
    return output;
}
192 | 
193 | 
194 | #ifndef HAVE_STRPTIME
// https://stackoverflow.com/questions/321849/strptime-equivalent-on-windows
/**
 * Replacement for strptime() on platforms that lack it, built on std::get_time.
 *
 * @param s  text to parse
 * @param f  format string; std::get_time accepts the same conversions as strptime
 * @param tm receives the parsed fields
 * @return pointer to the first character of s that was not consumed, or
 *         nullptr when the text does not match the format
 *
 * NOTE(review): std::get_time and setlocale need <iomanip> and <clocale>;
 * neither is among the headers this file includes directly -- confirm that
 * they are pulled in on the platforms that use this fallback.
 */
inline char* strptime(const char* s, const char* f, struct tm* tm) {
    // Isn't the C++ standard lib nice? std::get_time is defined such that its
    // format parameters are the exact same as strptime. Of course, we have to
    // create a string stream first, and imbue it with the current C locale, and
    // we also have to make sure we return the right things if it fails, or
    // if it succeeds, but this is still far simpler an implementation than any
    // of the versions in any of the C standard libraries.
    std::istringstream input(s);
    input.imbue(std::locale(setlocale(LC_ALL, nullptr)));
    input >> std::get_time(tm, f);
    if (input.fail()) {
        return nullptr;
    }
    // When the whole input was consumed the stream is at end-of-file, and
    // tellg() then reports -1 instead of a position. That case means
    // "everything was parsed": point at the terminating NUL.
    if (input.eof()) {
        return const_cast<char*>(s) + std::char_traits<char>::length(s);
    }
    const std::streamoff consumed = input.tellg();
    if (consumed < 0) {
        return nullptr;
    }
    return const_cast<char*>(s) + consumed;
}
211 | #endif
212 | 
213 | 
214 | 
215 | #endif
216 | 


--------------------------------------------------------------------------------
/threadpool.cpp:
--------------------------------------------------------------------------------
  1 | #include "config.h"
  2 | #include "threadpool.h"
  3 | #include "scanner_set.h"
  4 | 
  5 | thread_pool::thread_pool(scanner_set &ss_): ss(ss_)
  6 | {
  7 | }
  8 | 
  9 | void thread_pool::launch_workers(size_t num_workers)
 10 | {
 11 |     for (size_t i=0; i < num_workers; i++){
 12 |         std::unique_lock<std::mutex> lock(M);
 13 |         class worker *w = new worker(*this,i);
 14 |         workers.insert(w);
 15 |         threads.insert(new std::thread( &worker::start_worker, static_cast<void *>(w) ));
 16 |     }
 17 | }
 18 | 
 19 | 
 20 | thread_pool::~thread_pool()
 21 | {
 22 |     /* Join and delete every std::thread that launch_workers() created.
 23 |      * (An earlier version sent the termination message from here, and a later one
 24 |      * left the threads running because of problems with the malloc library.)
 25 |      * NOTE(review): each join() blocks until that worker's run() returns -- presumably
 26 |      * thread_pool::join() must already have told the workers to exit; confirm. */
 27 |     for (auto &it : threads ){
 28 |         it->join();
 29 |         delete it;
 30 |     }
 31 | }
 32 | 
 33 | /*
 34 |  * Block the caller until the work queue is empty and no worker is processing a task.
 35 |  */
 36 | void thread_pool::wait_for_tasks()
 37 | {
 38 |     std::unique_lock<std::mutex> lock(M);   // taken first: work_queue must only be read while holding M
 39 |     if (debug) std::cerr << "thread_pool::wait_for_tasks  work_queue.size()=" << work_queue.size() << std::endl;
 40 |     if (debug) std::cerr << "thread_pool::wait_for_tasks  got lock work_queue.size()=" << work_queue.size() << " working_workers=" << working_workers << std::endl;
 41 |     // Loop until nothing is queued and nothing is in flight; the condition is re-checked after every wakeup.
 42 |     while (work_queue.size() > 0 || working_workers > 0) {
 43 |         if (debug) std::cerr << "thread_pool::wait_for_tasks work_queue.size()==" << work_queue.size() << "  working_workers=" << working_workers << std::endl;
 44 |         TO_WORKER.notify_one();         // wake up a worker in case one is sleeping
 45 |         TO_MAIN.wait( lock );           // releases M while asleep; wait for a message from a worker
 46 |     }
 47 |     if (debug) std::cerr << "thread_pool::wait_for_tasks  done. work_queue.size()=" << work_queue.size() <<  " working_workers=" << working_workers << std::endl;
 48 | }
 49 | 
 50 | 
 51 | void thread_pool::join()
 52 | {   // Shut the pool down: drain the queue, queue one exit request per worker, then wait for the workers to deregister.
 53 |     wait_for_tasks();    /* Wait until the work queue is empty and no worker is busy */
 54 |     /* Next, queue one null task per registered worker; worker::run() exits when it dequeues one. */
 55 |     size_t num_threads = get_worker_count(); // get the count with lock
 56 |     for(size_t i=0;i < num_threads;i++){
 57 |         if (debug) std::cerr << "thread_pool::join: pushing null task #" << i << std::endl;
 58 |         push_task(nullptr);             // tell a thread to die
 59 |     }
 60 | 
 61 |     // Poll (sleeping shutdown_spin_lock_poll_ms between checks) until every worker has removed itself from workers.
 62 |     while (get_worker_count()>0){
 63 |         std::this_thread::sleep_for( std::chrono::milliseconds( shutdown_spin_lock_poll_ms ));
 64 |         if (debug) {
 65 |             debug_pool(std::cerr);
 66 |         }
 67 |     }
 68 | }
 69 | 
 70 | void thread_pool::main_thread_wait()
 71 | {
 72 |     std::unique_lock<std::mutex> guard(M);
 73 |     main_wait_timer.start();            // the time spent blocked is accumulated in main_wait_timer
 74 |     // Block until TO_MAIN is notified. There is no predicate, so callers must re-check their own condition.
 75 |     TO_MAIN.wait( guard );
 76 |     main_wait_timer.stop();
 77 | }
 78 | 
 79 | 
 80 | /*
 81 |  * Queue one work unit (sbuf, scanner) for the workers; a null sbuf tells the worker that dequeues it to exit.
 82 |  * Original note: this may be called from any thread, but right now it only works if called by main thread.
 83 |  */
 84 | 
 85 | void thread_pool::push_task(const sbuf_t *sbuf, scanner_t *scanner)
 86 | {
 87 |     if (debug) {
 88 |         std::cerr << "thread_pool::push_task( ";
 89 |         if (sbuf) {
 90 |             std::cerr << *sbuf;
 91 |         } else {
 92 |             std::cerr << "nullptr";
 93 |         }
 94 |         std::cerr << " , scanner=" << scanner << ") ";
 95 |     }
 96 |     std::unique_lock<std::mutex> lock(M);
 97 |     /* In the main thread, and only for units without a specific scanner, wait for a free worker
 98 |      * before queueing. We don't do this in the worker threads because we want them to clear.
 99 |      */
100 |     if (main_thread == std::this_thread::get_id() && scanner==nullptr) {
101 |         while (freethreads==0){               // if there are no free threads, wait.
102 |             main_wait_timer.start();
103 |             //TO_WORKER.notify_one();         // if a worker is sleeping, wake it up
104 |             TO_MAIN.wait( lock );
105 |             main_wait_timer.stop();
106 |         }
107 |     }
108 | 
109 |     /* Queue the unit; worker::run() copies it out and deletes the heap object. */
110 |     work_queue.push( new work_unit(sbuf, scanner) );
111 |     if (debug) std::cerr << "added work unit to queue. size=" << work_queue.size() << std::endl;
112 |     TO_WORKER.notify_one();
113 | };
114 | 
115 | 
116 | void thread_pool::push_task(const sbuf_t *sbuf)
117 | {
118 |     this->push_task(sbuf, static_cast<scanner_t *>(nullptr));   // no scanner: worker::run() then calls process_sbuf(sbuf)
119 | }
120 | 
121 | 
122 | int thread_pool::get_free_count() const
123 | {
124 |     const std::lock_guard<std::mutex> guard{M};   // read the free-thread counter while holding the pool mutex
125 |     return freethreads;
126 | }
127 | 
128 | size_t thread_pool::get_worker_count() const
129 | {
130 |     const std::lock_guard<std::mutex> guard{M};   // workers is modified under M by launch_workers() and worker::run()
131 |     return workers.size();
132 | }
133 | 
134 | size_t thread_pool::get_tasks_queued() const
135 | {
136 |     const std::lock_guard<std::mutex> guard{M};   // work_queue is pushed and popped under M
137 |     return work_queue.size();
138 | }
139 | 
140 | 
141 | void thread_pool::debug_pool(std::ostream &os) const
142 | {
143 |     const size_t n_workers = get_worker_count();   // each getter takes M separately,
144 |     const int    n_free    = get_free_count();     // so the three values are not one
145 |     const size_t n_queued  = get_tasks_queued();   // consistent snapshot of the pool
146 |     os << " worker_count: " << n_workers << " free_count: " << n_free << " tasks_queued: " << n_queued << std::endl;
147 | }
148 | 
149 | /* Thread entry point: run the worker passed in arg, then delete it.
150 |  * The worker was allocated with new in thread_pool::launch_workers(). */
151 | void * worker::start_worker(void *arg)
152 | {
153 |     worker *self = static_cast<worker *>(arg);
154 |     void *result = self->run();
155 |     delete self;
156 |     return result;
157 | }
158 | 
159 | 
160 | /* Run the worker: take work units from the pool's queue one at a time and process them
161 |  * until a unit with a null sbuf arrives; then deregister from the pool and return nullptr.
162 |  * We used to throw internal errors, but this caused problems with some versions of GCC; now we simply return.
163 |  */
164 | void *worker::run()
165 | {
166 |     if (tp.debug) std::cerr << "worker " << std::this_thread::get_id() << " starting " << std::endl;
167 |     tp.freethreads++;           // this thread is free (NOTE(review): updated without holding tp.M -- confirm freethreads is atomic)
168 |     while(true){
169 |         /* Take the lock, then sleep on TO_WORKER until the queue is non-empty.
170 |          * The lock is held again each time the wait returns.
171 |          */
172 |         thread_pool::work_unit wu;
173 |         {
174 |             std::unique_lock<std::mutex> lock( tp.M );
175 |             if (tp.debug) std::cerr << "worker " << std::this_thread::get_id() << " has lock " << std::endl;
176 |             worker_wait_timer.start();  // waiting for work
177 |             while ( tp.work_queue.size()==0 ){   // wait until something is in the task queue
178 |                 if (tp.debug) std::cerr << "worker " << std::this_thread::get_id() << " waiting " << std::endl;
179 |                 /* I didn't get any work; go to sleep */
180 |                 //std::cerr << std::this_thread::get_id() << " #1 tp.tasks.size()=" << tp.tasks.size() << std::endl;
181 |                 tp.ss.thread_set_status("waiting");
182 |                 tp.TO_MAIN.notify_one(); // if main is sleeping, wake it up
183 |                 tp.TO_WORKER.wait( lock );
184 |                 //std::cerr << std::this_thread::get_id() << " #2 tp.tasks.size()=" << tp.tasks.size() << std::endl;
185 |             }
186 |             worker_wait_timer.stop();   // no longer waiting
187 |             tp.ss.thread_set_status("working");
188 | 
189 |             /* Worker still has the lock */
190 |             thread_pool::work_unit *wup = tp.work_queue.front();    // get the task
191 |             tp.work_queue.pop();           // remove it
192 |             wu = *wup;                     // copy it out so the heap object can be freed right away
193 |             delete wup;
194 |             tp.freethreads--;           // no longer free
195 |             tp.working_workers++;       // a worker is working (also counted for the exit unit; undone after the loop)
196 |         }
197 | 	if (wu.sbuf==nullptr) {                  // special code to exit thread
198 |             //tp.TO_MAIN.notify_one();          // tell the master that one is gone
199 |             if (tp.debug) std::cerr << std::this_thread::get_id() << "got wu.sbuf=nullptr" << std::endl;
200 |             break;
201 |         }
202 |         /* dispatch the work unit.
203 |          * if wu.scanner is not set, process_sbuf will run all scanners in sequence, or schedule each.
204 |          * if wu.scanner is set, process_sbuf will just run that one scanner.
205 |          */
206 |         if (wu.scanner) {
207 |             tp.ss.process_sbuf( wu.sbuf, wu.scanner);
208 |         }
209 |         else {
210 |             tp.ss.process_sbuf( wu.sbuf);
211 |         }
212 |         tp.ss.release_sbuf(wu.sbuf);
213 |         tp.working_workers--;       // NOTE(review): decremented without holding tp.M -- presumably atomic; confirm
214 |         {
215 |             std::unique_lock<std::mutex> lock( tp.M );
216 |             tp.freethreads++;        // and now the thread is free again!
217 |             tp.TO_MAIN.notify_one(); // tell the master that we are free!
218 |         }
219 |     }
220 |     tp.ss.thread_set_status("exiting");
221 |     if (tp.debug) std::cerr << std::this_thread::get_id() << " exiting "<< std::endl;
222 |     {
223 |         std::unique_lock<std::mutex> lock(tp.M);
224 |         tp.workers.erase(this);     // thread_pool::join() polls workers.size() to detect that this worker is gone
225 |         tp.working_workers--;       // undo the increment made when the exit unit was dequeued
226 |     }
227 |     tp.total_worker_wait_ns += worker_wait_timer.running_nanoseconds();
228 |     tp.ss.thread_set_status("exited");
229 |     return nullptr;
230 | }
231 | 


--------------------------------------------------------------------------------
/feature_recorder_set.h:
--------------------------------------------------------------------------------
  1 | /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
  2 | #ifndef FEATURE_RECORDER_SET_H
  3 | #define FEATURE_RECORDER_SET_H
  4 | 
  5 | #include <exception>
  6 | #include <filesystem>
  7 | 
  8 | #if defined(HAVE_SQLITE3_H)
  9 | #include <sqlite3.h>
 10 | #endif
 11 | 
 12 | #include "atomic_map.h"
 13 | #include "atomic_set.h"
 14 | #include "feature_recorder.h"
 15 | #include "sbuf.h"
 16 | #include "scanner_config.h"
 17 | 
 18 | /** \addtogroup internal_interfaces
 19 |  * @{
 20 |  */
 21 | /** \file */
 22 | 
 23 | /**
 24 |  * \class feature_recorder_set
 25 |  * The feature_recorder_set is an object that controls output. It knows where the output goes (outdir),
 26 |  * the various feature recorders that write to that output, and provides for synchronization.
 27 |  * It also has the factory method for new feature_recorders. Therefore if you want a different feature_recorder,
 28 |  * this set should be subclassed as well.
 29 |  *
 30 |  * NOTE: plugins can only call virtual functions!
 31 |  *
 32 |  */
 33 | 
 34 | /* A map of feature recorders, keyed by name, with atomic access. */
 35 | /* TODO: This should probably be a unique_ptr */
 36 | using feature_recorder_map_t = atomic_map<std::string, class feature_recorder>;
 37 | inline std::ostream& operator<<(std::ostream& os, const feature_recorder_map_t& m) {
 38 |     m.write(os);   // the map writes itself to the stream
 39 |     return os;
 40 | }
 41 | 
 42 | class word_and_context_list;
 43 | class feature_recorder_set {
 44 | private:
 45 |     // neither copying nor assignment is implemented
 46 |     feature_recorder_set(const feature_recorder_set& fs) = delete;
 47 |     feature_recorder_set& operator=(const feature_recorder_set& fs) = delete;
 48 | 
 49 |     friend class feature_recorder;
 50 | 
 51 |     //const std::string input_fname{}; // input file; copy for convenience.
 52 |     //const std::string outdir{};      // where output goes; must know.
 53 | 
 54 |     /* map of feature recorders, name->feature recorder. It is
 55 |      * read-write when BE is running single-threaded. After we go into
 56 |      * multi-threaded mode, it is read-only.
 57 |      */
 58 |     feature_recorder_map_t frm{};
 59 |     bool frm_frozen {false};            // once the frm is frozen, it is read-only.
 60 |     feature_recorder* stop_list_recorder{nullptr}; // where stopped features get written (if there is one)
 61 | #if defined(HAVE_SQLITE3_H) and defined(HAVE_LIBSQLITE3)
 62 |     /* If we are compiled with SQLite3, this is the handle to the open database */
 63 |     sqlite3* db3{};
 64 | #endif
 65 | 
 66 | public:
 67 |     void frm_freeze() { assert(frm_frozen==false); frm_frozen=true;}   // marks the map frozen; asserts that it was not frozen already
 68 |     size_t feature_recorder_count() const { return frm.size(); }
 69 |     /* Flags for feature recorders. This used to be a bitmask, but Stroustrup (2013) recommends just having
 70 |      * a bunch of bools.
 71 |      */
 72 |     struct flags_t {
 73 |         bool disabled{false};                   // do not record anything! This is just used for a path-printer
 74 |         bool pedantic{false};                   // make sure that all features written are valid utf-8
 75 |         bool no_alert{false};                   // no alert recorder
 76 |         bool only_alert{false};                 //  always return the alert recorder
 77 |         bool create_stop_list_recorders{false}; // formerly the bitmask value CREATE_STOP_LIST_RECORDERS = 0x04
 78 |         bool debug{false};                      // enable debug printing
 79 |         bool record_files{true};                // record to files
 80 |         bool record_sql{false};                 // record to SQL
 81 |     } flags;
 82 | 
 83 |     static flags_t flags_disabled() {           // return a default flags_t with only `disabled` set
 84 |         flags_t f;
 85 |         f.disabled = true;
 86 |         return f;
 87 |     }
 88 | 
 89 |     /** Constructor:
 90 |      * create an empty feature recorder set. If disabled, create a disabled recorder.
 91 |      * @param flags_ = config flags
 92 |      * @param sc = scanner configuration; the `sc` member below is a copy, and it
 93 |      *        supplies input_fname and outdir (see get_input_fname() / get_outdir()).
 94 |      * NOTE(review): earlier revisions of this comment listed hash_algorithm, input_fname_
 95 |      * and outdir_ parameters; none of them is in the current signature.
 96 |      */
 97 |     feature_recorder_set(const flags_t& flags_, const scanner_config& sc);
 98 |     virtual ~feature_recorder_set();
 99 | 
100 |     /* Configuration. This is a copy; it should be a reference, but that caused an AddressSanitizer error. */
101 |     const scanner_config sc;
102 | 
103 |     /* Read-only functions for the scanner-config file management variables */
104 |     virtual std::filesystem::path get_input_fname() const { return sc.input_fname; }
105 |     virtual std::filesystem::path get_outdir() const { return sc.outdir; }
106 | 
107 |     /* the feature recorder set automatically hashes all of the sbufs that it processes. */
108 |     typedef std::string (*hash_func_t)(const uint8_t* buf, size_t bufsize);
109 |     struct hash_def {
110 |         hash_def(std::string name_, hash_func_t func_) : name(name_), func(func_){};
111 |         std::string name; // name of hash
112 |         hash_func_t func; // hash function
113 |         static std::string md5_hasher(const uint8_t* buf, size_t bufsize);
114 |         static std::string sha1_hasher(const uint8_t* buf, size_t bufsize);
115 |         static std::string sha256_hasher(const uint8_t* buf, size_t bufsize);
116 |         static hash_func_t hash_func_for_name(const std::string& name);
117 |     };
118 | 
119 |     const word_and_context_list* alert_list{}; /* should be flagged */
120 |     const word_and_context_list* stop_list{};  /* should be ignored */
121 | 
122 |     /** hashing system */
123 |     const hash_def hasher; // name and function that perform hashing; const, so fixed at construction
124 | 
125 |     static const std::string ALERT_RECORDER_NAME; // the name of the alert recorder
126 |     // static const std::string   DISABLED_RECORDER_NAME; // the fake disabled feature recorder
127 | 
128 |     void set_stop_list(const word_and_context_list* alist) { stop_list = alist; }
129 |     void set_alert_list(const word_and_context_list* alist) { alert_list = alist; }
130 | 
131 |     /** Initialize a feature_recorder_set. Previously this was a constructor, but it turns out that
132 |      * virtual functions for the create_name_factory aren't honored in constructors.
133 |      * NOTE(review): no init() is declared in this class as it stands; this comment looks stale.
134 |      * init() is called after all of the scanners have been loaded. It
135 |      * tells each feature file about its histograms (among other things)
136 |      */
137 | 
138 |     /* feature_recorder_set flags */
139 |     /* Flags are now implemented as booleans per stroustrup 2013 */
140 | 
141 |     int64_t offset_add{0};         // added to every reported offset, for use with hadoop
142 |     std::string banner_filename{}; // banner for top of every file
143 | 
144 |     /* histogram support */
145 |     void histogram_add(const histogram_def& def); // adds it to a local set or to the specific feature recorder
146 |     size_t histogram_count() const;               // counts histograms in all feature recorders
147 | 
148 |     void set_carve_defaults();
149 | 
150 |     // called when scanner_set shuts down:
151 |     void feature_recorders_shutdown();
152 |     void histograms_generate(); // make the histograms in the output directory (and optionally in the database)
153 | 
154 |     //typedef  void (*xml_notifier_t)(const std::string &xmlstring);
155 | 
156 |     /* support for creating and finding feature recorders
157 |      * Previously called create_name().
158 |      * functions must be virtual so they can be called by plug-in.
159 |      * All return a reference to the named (or created) feature recorder, or else throw the exception indicated
160 |      */
161 |     class NoSuchFeatureRecorder : public std::exception {
162 |         std::string m_error{};
163 | 
164 |     public:
165 |         NoSuchFeatureRecorder(std::string_view error) : m_error(error) {}
166 |         const char* what() const noexcept override { return m_error.c_str(); }
167 |     };
168 | 
169 |     class FeatureRecorderAlreadyExists : public std::exception {
170 |         std::string m_error{};
171 | 
172 |     public:
173 |         FeatureRecorderAlreadyExists(std::string_view error) : m_error(error) {}
174 |         const char* what() const noexcept override { return m_error.c_str(); }
175 |     };
176 | 
177 |     class FeatureRecorderNullName : public std::exception {
178 |     public:
179 |         FeatureRecorderNullName() {}
180 |         const char* what() const noexcept override { return "FeatureRecorderNullName"; }
181 |     };
182 | 
183 |     /* create a feature recorder, and return it as well */
184 |     virtual void create_alert_recorder();
185 |     virtual feature_recorder& create_feature_recorder(feature_recorder_def def); // create a feature recorder
186 |     virtual feature_recorder& create_feature_recorder(std::string name);         // convenience function
187 | 
188 |     // Look up an existing recorder without creating one
189 |     virtual feature_recorder& named_feature_recorder(const std::string name) const;       // returns the named feature recorder
190 |     virtual feature_recorder& get_alert_recorder() const;       // returns the alert recorder
191 |     virtual std::vector<std::string> feature_file_list() const; // returns a list of feature file names
192 | 
193 |     void dump_name_count_stats(class dfxml_writer& writer) const; // dumps the standard dfxml
194 | 
195 |     void info_feature_recorders( std::ostream &os) const;
196 | 
197 |     /****************************************************************
198 |      *** DB interface
199 |      ****************************************************************/
200 | 
201 | #if 0
202 | #if defined(HAVE_SQLITE3_H) and defined(HAVE_LIBSQLITE3)
203 |     virtual  void db_send_sql(sqlite3 *db3,const char **stmts, ...) ;
204 |     virtual  sqlite3 *db_create_empty(const std::string &name) ;
205 |     void     db_create_table(const std::string &name) ;
206 |     void     db_create() ;
207 |     void     db_transaction_begin() ;
208 |     void     db_transaction_commit() ;               // commit current transaction
209 |     void     db_close() ;                            //
210 | #endif
211 | #endif
212 |     /****************************************************************
213 |      *** External Functions
214 |      ****************************************************************/
215 | };
216 | 
217 | #endif
218 | 


--------------------------------------------------------------------------------