├── doc
│   ├── source
│   │   ├── README.md
│   │   ├── web.css
│   │   ├── print.css
│   │   ├── index.md
│   │   └── stringsext--manpage.md
│   ├── shell.nix
│   ├── make--all
│   ├── make--rustdoc-link
│   ├── make--index
│   ├── make--stringsext--manpage
│   ├── markdown2man
│   ├── markdown2html
│   └── markdown2pdf
├── tests
│   └── functional
│       ├── expected_output3
│       ├── input2
│       ├── expected_output1
│       ├── run-tests
│       ├── expected_output2
│       └── input1
├── scripts
│   ├── 14-make-doc
│   ├── 13-make-win-cross-compile
│   ├── 11-test
│   ├── 10-clear-targets
│   ├── 15-build-packages
│   ├── 16-clean-targets-keep-binaries
│   ├── 02-make-all-podman
│   ├── 17-make-dist
│   └── 12-make-targets
├── .envrc
├── .gitignore
├── .dockerignore
├── LICENSE.md
├── .direnv
│   └── bin
│       └── nix-direnv-reload
├── LICENSE-MIT
├── shell.nix
├── Cargo.toml
├── src
│   ├── help.rs
│   ├── options.rs
│   ├── finding.rs
│   ├── input.rs
│   ├── main.rs
│   ├── scanner.rs
│   ├── finding_collection.rs
│   └── mission.rs
├── README.md
├── .azure-pipelines.yml
├── LICENSE-APACHE
└── Cargo.lock

/doc/source/README.md:
--------------------------------------------------------------------------------
1 | ../../README.md
--------------------------------------------------------------------------------
/tests/functional/expected_output3:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------
/scripts/14-make-doc:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | cd ../doc
3 | ./make--all
4 | 
5 | cargo rustdoc -p stringsext
--------------------------------------------------------------------------------
/tests/functional/input2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/getreu/stringsext/HEAD/tests/functional/input2
--------------------------------------------------------------------------------
/scripts/13-make-win-cross-compile:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | cargo build --release --target x86_64-pc-windows-gnu
--------------------------------------------------------------------------------
/.envrc:
--------------------------------------------------------------------------------
1 | # [nix-community/nix-direnv](https://github.com/nix-community/nix-direnv)
2 | use nix
3 | # use flake
4 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | doc/build/
2 | target
3 | *.sw?
4 | *.html
5 | *.1
6 | *.1.gz
7 | .vscode/
8 | tests/functional/output
9 | .idea
10 | 
--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | doc/build/
2 | target
3 | *.sw?
4 | *.html
5 | *.1
6 | *.1.gz
7 | .vscode/
8 | tests/functional/output
9 | .idea
10 | 
--------------------------------------------------------------------------------
/doc/shell.nix:
--------------------------------------------------------------------------------
1 | { pkgs ?
import {} }: 2 | pkgs.mkShell { 3 | nativeBuildInputs = with pkgs; [ 4 | pandoc 5 | python311Packages.weasyprint 6 | ]; 7 | 8 | } 9 | 10 | -------------------------------------------------------------------------------- /doc/make--all: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Change to this script directory 4 | cd "${0%/*}" 5 | 6 | rm -r ./build 7 | 8 | ./make--index 9 | ./make--stringsext--manpage 10 | ./make--rustdoc-link 11 | 12 | -------------------------------------------------------------------------------- /scripts/11-test: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Unit testing 4 | 5 | if cargo test 6 | then 7 | echo Cargo test succeeded. 8 | else 9 | ( echo Cargo test failed. && exit 1 ) 10 | fi 11 | 12 | 13 | ../tests/functional/run-tests 14 | -------------------------------------------------------------------------------- /doc/source/web.css: -------------------------------------------------------------------------------- 1 | 13 | -------------------------------------------------------------------------------- /doc/make--rustdoc-link: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | mkdir --parents ./build/html 4 | mkdir --parents ./build/html/_downloads 5 | mkdir --parents ./build/man/man1 6 | 7 | cd ./build/html/_downloads 8 | ln -sf ../../../../target/* . 9 | ln -sf ../../../build/man/man1/stringsext.1.gz . 10 | 11 | 12 | -------------------------------------------------------------------------------- /scripts/10-clear-targets: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Change to this script directory 3 | cd $(dirname "$0") 4 | 5 | ### Better do these manually, to make sure that all dependencies work with 6 | ### the current `rustc` version. 7 | # cargo update 8 | # cargo upgrade 9 | 10 | rm -r ../target 11 | rm -r ../doc/build 12 | 13 | -------------------------------------------------------------------------------- /doc/make--index: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | mkdir --parents ./build/html 4 | mkdir --parents ./build/html/_downloads 5 | mkdir --parents ./build/man/man1 6 | 7 | cp ../README.md ./source/index.md 8 | sed -i 's/http.*\.getreu\.net//g' ./source/index.md 9 | 10 | ./markdown2html ./source/index.md ./build/html/index.html 11 | #rm index.md 12 | -------------------------------------------------------------------------------- /doc/source/print.css: -------------------------------------------------------------------------------- 1 | @page { 2 | @bottom-center { 3 | content: counter(page) "/" counter(pages); 4 | } 5 | size: A4 portrait; 6 | } 7 | h1, h2, h3, h4 { 8 | font-weight: normal; 9 | color: #049; 10 | } 11 | h1 { font-size: 140%; } 12 | h2 { font-size: 120%; } 13 | h3 { font-size: 110%; } 14 | h4 { font-size: 105%; } 15 | 16 | -------------------------------------------------------------------------------- /scripts/15-build-packages: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | cd .. 
3 | 4 | # [cargo-deb 2.7.0 - Docs.rs](https://docs.rs/crate/cargo-deb/latest) 5 | # See section "Cross-compilation" 6 | 7 | # Cargo deb saves the built package in 8 | cargo deb --target=x86_64-unknown-linux-gnu 9 | 10 | #rustup target add i686-unknown-linux-gnu 11 | CARGO_TARGET_I686_UNKNOWN_LINUX_GNU_LINKER=/usr/bin/i686-linux-gnu-gcc \ 12 | cargo deb --target=i686-unknown-linux-gnu 13 | -------------------------------------------------------------------------------- /doc/make--stringsext--manpage: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | mkdir --parents ./build/html 4 | mkdir --parents ./build/html/_downloads 5 | mkdir --parents ./build/man/man1 6 | 7 | # Html page 8 | ./markdown2man ./source/stringsext--manpage.md ./build/man/man1/stringsext.1 9 | ./markdown2pdf ./source/stringsext--manpage.md ./build/pdf/stringsext--manpage.pdf 10 | ./markdown2html ./source/stringsext--manpage.md ./build/html/stringsext--manpage.html 11 | 12 | cd ./build/html/ 13 | ln -s stringsext--manpage.html stringsext--man.html 14 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # License 2 | 3 | Licensed under either of 4 | 5 | - Apache License, Version 2.0 ([LICENSE-APACHE](LICENSE-APACHE) or 6 | http://www.apache.org/licenses/LICENSE-2.0) 7 | - MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) 8 | 9 | at your option. 10 | 11 | ## Contribution 12 | 13 | Unless you explicitly state otherwise, any contribution intentionally submitted 14 | for inclusion in the work by you, as defined in the Apache-2.0 license, shall be 15 | dual licensed as above, without any additional terms or conditions. 16 | -------------------------------------------------------------------------------- /tests/functional/expected_output1: -------------------------------------------------------------------------------- 1 | 2 | 60 (a UTF-8) d My Cheese? is 3 | >60+ (a UTF-8) a simple parable 4 | 80+ (a UTF-8) that reveals pr 5 | >80+ (a UTF-8) ofound truths 6 | 500 (a UTF-8) ed My Cheese? 7 | fe0+ (a UTF-8) asked. 9 | 1580 (a UTF-8) heese?" 10 | 15e0 (a UTF-8) y? Maybe we can 11 | >15e0+ (a UTF-8) get something fr 12 | 1600+ (a UTF-8) om it." 13 | 2f40 (a UTF-8) My Cheese? throu 14 | >2f40+ (a UTF-8) gh the adv 15 | 2f60+ (a UTF-8) entures of Sniff 16 | >2f60+ (a UTF-8) , Scurry, Hem an 17 | 2f80+ (a UTF-8) d Haw, as a way 18 | -------------------------------------------------------------------------------- /scripts/16-clean-targets-keep-binaries: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #set -x 3 | cd .. 
4 | find target -mindepth 2 -not -path "*/release/stringsext" \ 5 | -not -path "*/release/stringsext.exe" \ 6 | -not -path "*/debian" \ 7 | -not -path "*/debian/stringsext*.deb" \ 8 | -not -path "*/release/*.md" \ 9 | -not -path "*/release" \ 10 | -not -path "target/doc*" \ 11 | -exec rm -r {} \; 12 | 13 | rm -r target/debug 14 | rm -r target/release 15 | rm -r target/rls 16 | rm target/.rustc_info.json 17 | -------------------------------------------------------------------------------- /scripts/02-make-all-podman: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -x 3 | # Change to this script directory 4 | cd $(dirname "$0") 5 | 6 | ### You might want to (re-)build the Docker image before: 7 | #../build-env/rebuild-dev-image 8 | 9 | 10 | ../build-env/run-script ./10-clear-targets 11 | ../build-env/run-script ./11-test 12 | ../build-env/run-script ./12-make-targets 13 | ../build-env/run-script ./13-make-win-cross-compile 14 | ./14-make-doc 15 | ../build-env/run-script ./15-build-packages 16 | ../build-env/run-script ./16-clean-targets-keep-binaries 17 | ../build-env/run-script ./17-make-dist 18 | -------------------------------------------------------------------------------- /scripts/17-make-dist: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Author: Jens Getreu 3 | 4 | pack () { 5 | #set -x 6 | InPath="$1" 7 | InFile="${InPath##*/}" 8 | InBase="${InFile%.*}" 9 | InDir="${InPath%/*}" 10 | if [ "$InDir" = "$InPath" ] ; then 11 | InDir="." 12 | fi 13 | 14 | OutPath="$2" 15 | OutFile="${OutPath##*/}" 16 | OutBase="${OutFile%.*}" 17 | OutDir="${OutPath%/*}" 18 | if [ "$OutDir" = "$OutPath" ] ; then 19 | OutDir="." 20 | fi 21 | 22 | 23 | # process 24 | 25 | ZipFile="$OutBase.zip" 26 | 27 | mkdir -p "$OutDir" 28 | 29 | zip -r "$OutDir/$ZipFile" "$InPath" 30 | 31 | } 32 | 33 | 34 | ### Main 35 | # usage: 36 | # pack FILE [FILE] 37 | # pack reports ./package/report.zip 38 | 39 | #if [[ -n "${2/[ ]*\n/}" ]] ; then 40 | # OutPath="$2" 41 | #else 42 | # OutPath="${1%.*}.html" # $2 is empty 43 | #fi 44 | #pack "$1" "$OutPath" 45 | 46 | pack "../doc/build/" "../doc/build/dist/stringsext.zip" 47 | 48 | -------------------------------------------------------------------------------- /doc/markdown2man: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Author: Jens Getreu 3 | 4 | 5 | # sudo apt install pandoc 6 | 7 | # parse args 8 | 9 | render () { 10 | #set -x 11 | InPath="$1" 12 | InFile="${InPath##*/}" 13 | InBase="${InFile%.*}" 14 | InDir="${InPath%/*}" 15 | if [ "$InDir" = "$InPath" ] ; then 16 | InDir="." 17 | fi 18 | 19 | OutPath="$2" 20 | OutFile="${OutPath##*/}" 21 | OutBase="${OutFile%.*}" 22 | OutDir="${OutPath%/*}" 23 | if [ "$OutDir" = "$OutPath" ] ; then 24 | OutDir="." 
25 | fi 26 | 27 | 28 | # process 29 | 30 | ManFile="$OutBase.1" 31 | 32 | mkdir -p "$OutDir" 33 | 34 | pandoc -s -t man -o "$OutDir/$ManFile" "$InPath" 35 | 36 | gzip -k -f "$OutDir/$ManFile" 37 | } 38 | 39 | 40 | ### Main 41 | # usage: 42 | # render FILE [FILE] 43 | # render report.md ./rendition/report.1 44 | 45 | if [[ -n "${2/[ ]*\n/}" ]] ; then 46 | OutPath="$2" 47 | else 48 | OutPath="${1%.*}.html" # $2 is empty 49 | fi 50 | render "$1" "$OutPath" 51 | 52 | -------------------------------------------------------------------------------- /.direnv/bin/nix-direnv-reload: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | if [[ ! -d "/home/getreu-dev/projects/WEB-SERVER-CONTENT/blog.getreu.net/projects/stringsext" ]]; then 4 | echo "Cannot find source directory; Did you move it?" 5 | echo "(Looking for "/home/getreu-dev/projects/WEB-SERVER-CONTENT/blog.getreu.net/projects/stringsext")" 6 | echo 'Cannot force reload with this script - use "direnv reload" manually and then try again' 7 | exit 1 8 | fi 9 | 10 | # rebuild the cache forcefully 11 | _nix_direnv_force_reload=1 direnv exec "/home/getreu-dev/projects/WEB-SERVER-CONTENT/blog.getreu.net/projects/stringsext" true 12 | 13 | # Update the mtime for .envrc. 14 | # This will cause direnv to reload again - but without re-building. 15 | touch "/home/getreu-dev/projects/WEB-SERVER-CONTENT/blog.getreu.net/projects/stringsext/.envrc" 16 | 17 | # Also update the timestamp of whatever profile_rc we have. 18 | # This makes sure that we know we are up to date. 19 | touch -r "/home/getreu-dev/projects/WEB-SERVER-CONTENT/blog.getreu.net/projects/stringsext/.envrc" "/home/getreu-dev/projects/WEB-SERVER-CONTENT/blog.getreu.net/projects/stringsext/.direnv"/*.rc 20 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Jens Getreu 2 | 3 | Permission is hereby granted, free of charge, to any 4 | person obtaining a copy of this software and associated 5 | documentation files (the "Software"), to deal in the 6 | Software without restriction, including without 7 | limitation the rights to use, copy, modify, merge, 8 | publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software 10 | is furnished to do so, subject to the following 11 | conditions: 12 | 13 | The above copyright notice and this permission notice 14 | shall be included in all copies or substantial portions 15 | of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 18 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 19 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 20 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT 21 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 22 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 23 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 24 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | DEALINGS IN THE SOFTWARE. 26 | -------------------------------------------------------------------------------- /tests/functional/run-tests: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Functional tests 4 | 5 | # Make this script directory the current directory 6 | cd $(dirname "$0") 7 | 8 | cargo build 9 | 10 | # We search for `?`. 
11 | ../../target/debug/stringsext -q 16 -g 63 -tx -a All-Ctrl -u Common \ 12 | -e UTF-8 -e utf-16le -e utf-16be -- input1 > output 13 | 14 | if diff output expected_output1 15 | then 16 | echo Commandline test 1 succeeded. 17 | else 18 | ( echo Commandline test 1 failed. && exit 2 ) 19 | fi 20 | 21 | # We search for `:`. 22 | ../../target/debug/stringsext -n 10 -q 32 -g 58 -tx -a All-Ctrl -u Common \ 23 | -e UTF-8 -e utf-16le -e utf-16be -- input1 input2 > output 24 | 25 | if diff output expected_output2 26 | then 27 | echo Commandline test 2 succeeded. 28 | else 29 | ( echo Commandline test 2 failed. && exit 3 ) 30 | fi 31 | 32 | # We search for nothing. Do we get nothing? 33 | ../../target/debug/stringsext -q 32 -tx -a None -u None \ 34 | -e UTF-8 -e utf-16le -e utf-16be -- input1 input2 > output 35 | 36 | if diff output expected_output3 37 | then 38 | echo Commandline test 3 succeeded. 39 | else 40 | ( echo Commandline test 3 failed. && exit 4 ) 41 | fi 42 | 43 | -------------------------------------------------------------------------------- /scripts/12-make-targets: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -x 3 | # Change to this script directory 4 | cd $(dirname "$0") 5 | 6 | #rustup target add x86_64-unknown-linux-gnu 7 | # Musl needs: `sudo apt install musl-tools` 8 | #rustup target add x86_64-unknown-linux-musl 9 | # Windows needs: `sudo apt install binutils-mingw-w64 mingw-w64` 10 | #rustup target add x86_64-pc-windows-gnu 11 | # Binary for Raspberry Pi (32 bit) 12 | #rustup target add armv7-unknown-linux-gnueabihf 13 | # needs: `sudo apt install crossbuild-essential-armhf` 14 | # Binary for Raspberry Pi (arm64, 64 bit) 15 | #rustup target add aarch64-unknown-linux-gnu 16 | # needs: `sudo apt install crossbuild-essential-arm64` 17 | 18 | 19 | 20 | CARGO_TARGET_ARMV7_UNKNOWN_LINUX_GNUEABIHF_LINKER=/usr/bin/arm-linux-gnueabihf-gcc \ 21 | CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_LINKER=/usr/bin/aarch64-linux-gnu-gcc \ 22 | CARGO_TARGET_I686_UNKNOWN_LINUX_GNU_LINKER=/usr/bin/i686-linux-gnu-gcc \ 23 | cargo build \ 24 | --target i686-unknown-linux-gnu \ 25 | --target i686-unknown-linux-musl \ 26 | --target x86_64-unknown-linux-gnu \ 27 | --target x86_64-unknown-linux-musl \ 28 | --target x86_64-pc-windows-gnu \ 29 | --target armv7-unknown-linux-gnueabihf \ 30 | --target aarch64-unknown-linux-gnu \ 31 | --release 32 | 33 | -------------------------------------------------------------------------------- /shell.nix: -------------------------------------------------------------------------------- 1 | # { pkgs ? import {} }: 2 | # pkgs.mkShell { 3 | # nativeBuildInputs = with pkgs; [ 4 | # cargo 5 | # cargo-edit 6 | # rustc 7 | # rustfmt 8 | # clippy 9 | # rust-analyzer 10 | # ]; 11 | # 12 | # # Certain Rust tools won't work without this 13 | # # This can also be fixed by using oxalica/rust-overlay and specifying the rust-src extension 14 | # # See https://discourse.nixos.org/t/rust-src-not-found-and-other-misadventures-of-developing-rust-on-nixos/11570/3?u=samuela. for more details. 
15 | # RUST_SRC_PATH = "${pkgs.rust.packages.stable.rustPlatform.rustLibSrc}"; 16 | # } 17 | 18 | /* 19 | based on 20 | https://discourse.nixos.org/t/how-can-i-set-up-my-rust-programming-environment/4501/9 21 | */ 22 | let 23 | rust_overlay = import (builtins.fetchTarball "https://github.com/oxalica/rust-overlay/archive/master.tar.gz"); 24 | pkgs = import { overlays = [ rust_overlay ]; }; 25 | # rustVersion = "latest"; 26 | rustVersion = "1.80.1"; 27 | rust = pkgs.rust-bin.stable.${rustVersion}.default.override { 28 | extensions = [ 29 | "rust-src" # for rust-analyzer 30 | "rust-analyzer" 31 | ]; 32 | }; 33 | in 34 | pkgs.mkShell { 35 | buildInputs = [ 36 | rust 37 | ] ++ (with pkgs; [ 38 | pkg-config 39 | # other dependencies 40 | #gtk3 41 | #wrapGAppsHook 42 | ]); 43 | RUST_BACKTRACE = 1; 44 | } 45 | -------------------------------------------------------------------------------- /doc/markdown2html: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Author: Jens Getreu 3 | 4 | # apt install pandoc docbook-xsl-ns xsltproc 5 | render () { 6 | ### parse args 7 | 8 | #set -x 9 | InPath="$1" 10 | InFile="${InPath##*/}" 11 | InFileExt="${InPath##*.}" 12 | InBase="${InFile%.*}" 13 | InDir="${InPath%/*}" 14 | if [ "$InDir" = "$InPath" ] ; then 15 | InDir="." 16 | fi 17 | 18 | OutPath="$2" 19 | OutFile="${OutPath##*/}" 20 | OutBase="${OutFile%.*}" 21 | OutDir="${OutPath%/*}" 22 | if [ "$OutDir" = "$OutPath" ] ; then 23 | OutDir="." 24 | fi 25 | 26 | 27 | ### Prepare 28 | 29 | mkdir -p "$OutDir" 30 | 31 | cp -r -L "$InDir/assets/" "$OutDir" 32 | cp "$InDir/web.css" "$OutDir" 33 | cp "$InPath" "$OutDir" 34 | CssPath="web.css" 35 | HtmlPath="$OutBase.html" 36 | 37 | ### Generate HTML 38 | cd "$OutDir" 39 | pandoc -s --to=html --from=markdown+yaml_metadata_block \ 40 | --toc --number-sections -H "$CssPath" \ 41 | -o "$HtmlPath" "$InFile" 42 | 43 | # Remove temp files 44 | rm "$InFile" 45 | } 46 | 47 | 48 | 49 | ### Main 50 | # usage: 51 | # render FILE [FILE] 52 | # render report.md ./rendition/report.html 53 | 54 | if [[ -n "${2/[ ]*\n/}" ]] ; then 55 | OutPath="$2" 56 | else 57 | OutPath="${1%.*}.html" # $2 is empty 58 | fi 59 | render "$1" "$OutPath" 60 | 61 | 62 | -------------------------------------------------------------------------------- /doc/markdown2pdf: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Author: Jens Getreu 3 | 4 | # apt install pandoc weasyprint 5 | render () { 6 | ### parse args 7 | 8 | #set -x 9 | InPath="$1" 10 | InFile="${InPath##*/}" 11 | InFileExt="${InPath##*.}" 12 | InBase="${InFile%.*}" 13 | InDir="${InPath%/*}" 14 | if [ "$InDir" = "$InPath" ] ; then 15 | InDir="." 16 | fi 17 | 18 | OutPath="$2" 19 | OutFile="${OutPath##*/}" 20 | OutBase="${OutFile%.*}" 21 | OutDir="${OutPath%/*}" 22 | if [ "$OutDir" = "$OutPath" ] ; then 23 | OutDir="." 
24 | fi 25 | 26 | 27 | ### Prepare 28 | 29 | mkdir -p "$OutDir" 30 | 31 | cp -r -L "$InDir/assets/" "$OutDir" 32 | cp "$InDir/print.css" "$OutDir" 33 | cp "$InPath" "$OutDir" 34 | pushd "$OutDir" 35 | 36 | ### Generate XML 37 | 38 | # unfortunately the chain does not honor --number-section yet 39 | 40 | pandoc "$InFile" --from=markdown+yaml_metadata_block --to=pdf \ 41 | --pdf-engine=weasyprint --pdf-engine-opt="-s" \ 42 | --pdf-engine-opt="print.css" --number-sections -o "$OutFile" 43 | rm -r "assets" 44 | rm "$InFile" 45 | rm "print.css" 46 | popd 47 | 48 | } 49 | 50 | 51 | 52 | ### Main 53 | # usage: 54 | # render FILE [FILE] 55 | # render report.md ./rendition/report.html 56 | 57 | if [[ -n "${2/[ ]*\n/}" ]] ; then 58 | OutPath="$2" 59 | else 60 | OutPath="${1%.*}.html" # $2 is empty 61 | fi 62 | render "$1" "$OutPath" 63 | 64 | 65 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "stringsext" 3 | version = "2.3.5" 4 | authors = ["Jens Getreu "] 5 | edition = "2021" 6 | readme = "README.md" 7 | description = "find multi-byte-encoded strings in binary data" 8 | license = "MIT/Apache-2.0" 9 | categories = ["command-line-utilities", "encoding", "text-processing", 10 | "filesystem"] 11 | documentation = "https://blog.getreu.net/projects/stringsext/stringsext--manpage.html" 12 | homepage = "https://blog.getreu.net/projects/stringsext/" 13 | repository = "https://github.com/getreu/stringsext" 14 | rust-version = "1.80.1" 15 | 16 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 17 | 18 | [dependencies] 19 | encoding_rs= "0.8.34" 20 | lazy_static = "1.5.0" 21 | serde = "1.0.210" 22 | serde_derive = "1.0.210" 23 | itertools = "0.13.0" 24 | scoped_threadpool = "0.1.9" 25 | anyhow = "1.0.89" 26 | clap = "4.5.19" 27 | structopt = "0.3.26" 28 | pin-project = "1.1.5" 29 | 30 | # Metadata for Debian packages with cargo-deb 31 | 32 | [package.metadata.deb] 33 | maintainer = "Jens Getreu, " 34 | copyright = "2015-2020, Jens Getreu " 35 | depends = "$auto" 36 | extended-description = """\ 37 | stringsext is a Unicode enhancement of the GNU strings tool with additional\ 38 | functionalities: stringsext recognizes Cyrillic, Arabic, CJKV characters and\ 39 | other scripts in all supported multi-byte-encodings, while GNU strings fails\ 40 | in finding any of these scripts in UTF-16 and many other encodings. """ 41 | section = "utilities" 42 | priority = "optional" 43 | assets = [ 44 | ["doc/build/html/index.html", "usr/share/doc/stringsext/", "644"], 45 | ["doc/build/man/man1/stringsext.1.gz", "/usr/share/man/man1/", "644"], 46 | ["target/release/stringsext", "usr/bin/", "755"] 47 | ] 48 | -------------------------------------------------------------------------------- /src/help.rs: -------------------------------------------------------------------------------- 1 | //! Help the user with command-line-arguments. 2 | 3 | use crate::mission::ASCII_FILTER_ALIASSE; 4 | use crate::mission::UNICODE_BLOCK_FILTER_ALIASSE; 5 | use crate::mission::{Missions, MISSIONS}; 6 | use crate::options::ARGS; 7 | use crate::options::ASCII_ENC_LABEL; 8 | use crate::AUTHOR; 9 | use crate::VERSION; 10 | use std::process; 11 | use std::str; 12 | 13 | /// Function called at the beginning of `stringsext`. When help is printed to the 14 | /// user, the program exits. 
15 | 16 | pub fn help() { 17 | if ARGS.version { 18 | println!("Version {}, {}", VERSION.unwrap_or("unknown"), AUTHOR); 19 | process::exit(0); 20 | }; 21 | 22 | if ARGS.debug_option { 23 | println!("GIVEN COMMANDLINE-ARGUMENTS\n"); 24 | println!("Input files\n-----------"); 25 | for (n, name) in ARGS.inputs.iter().enumerate() { 26 | println!("{} = {:?}", char::from((n + 65) as u8), name); 27 | } 28 | 29 | println!("\nEncoding and filter definitions\n-------------------------------"); 30 | for (n, name) in ARGS.encoding.iter().enumerate() { 31 | println!("{} = {}", char::from((n + 97) as u8), name); 32 | } 33 | 34 | println!("\n\nPARSED COMMANDLINE-ARGUMENTS\n"); 35 | 36 | let ms: &'static Missions = &MISSIONS; 37 | for (i, m) in ms.v.iter().enumerate() { 38 | println!( 39 | "Scanner ({})\n-----------\n{:#?}\n", 40 | char::from((i + 97) as u8), 41 | m 42 | ); 43 | } 44 | process::exit(0); 45 | }; 46 | 47 | if ARGS.list_encodings { 48 | // Is there a way to programmatically query a list from `Encoding`? 49 | // This list is taken from the `Encoding` source file (2019-12-11) 50 | // and may not be up to date. 51 | println!("LIST OF AVAILABLE ENCODINGS AND PREDEFINED FILTERS\n"); 52 | println!("Format: --encoding=[ENC_NAME],[MIN],[AF,UBF],[GREP]\n\n"); 53 | println!("ENC_NAME (Encoding)="); 54 | let list: [&'static str; 41] = [ 55 | ASCII_ENC_LABEL, 56 | "Big5", 57 | "EUC-JP", 58 | "EUC-KR", 59 | "GBK", 60 | "IBM866", 61 | "ISO-2022-JP", 62 | "ISO-8859-10", 63 | "ISO-8859-13", 64 | "ISO-8859-14", 65 | "ISO-8859-15", 66 | "ISO-8859-16", 67 | "ISO-8859-2", 68 | "ISO-8859-3", 69 | "ISO-8859-4", 70 | "ISO-8859-5", 71 | "ISO-8859-6", 72 | "ISO-8859-7", 73 | "ISO-8859-8", 74 | "ISO-8859-8-I", 75 | "KOI8-R", 76 | "KOI8-U", 77 | "Shift_JIS", 78 | "UTF-16BE", 79 | "UTF-16LE", 80 | "UTF-8", 81 | "gb18030", 82 | "macintosh", 83 | "replacement", 84 | "windows-1250", 85 | "windows-1251", 86 | "windows-1252", 87 | "windows-1253", 88 | "windows-1254", 89 | "windows-1255", 90 | "windows-1256", 91 | "windows-1257", 92 | "windows-1258", 93 | "windows-874", 94 | "x-mac-cyrillic", 95 | "x-user-defined", 96 | ]; 97 | 98 | // Available encodings 99 | for e in list.iter() { 100 | println!("\t{}", e); 101 | } 102 | println!("\tWarning: this list may be outdated."); 103 | println!( 104 | "\tPlease consult the library `encoding_rs` documentation \ 105 | for more available encodings.\n\n" 106 | ); 107 | 108 | println!("MIN = "); 109 | println!("\tOnly strings with at least characters are printed.\n\n"); 110 | 111 | println!("AF (ASCII-Filter) = or "); 112 | for (e, b, c) in &ASCII_FILTER_ALIASSE { 113 | let b = format!("{:#x}", b); 114 | println!( 115 | "\t{} = {:>35} ({})", 116 | str::from_utf8(e).unwrap(), 117 | b, 118 | str::from_utf8(c).unwrap().trim() 119 | ); 120 | } 121 | println!( 122 | "\tUse predefined filter names above or your own filter starting with `0x...`.\n\n" 123 | ); 124 | 125 | println!("UBF (Unicode-Block-Filter) = or "); 126 | for (e, b, c) in &UNICODE_BLOCK_FILTER_ALIASSE { 127 | let b = format!("{:#x}", b); 128 | println!( 129 | "\t{} = {:>18} ({})", 130 | str::from_utf8(e).unwrap(), 131 | b, 132 | str::from_utf8(c).unwrap().trim() 133 | ); 134 | } 135 | println!( 136 | "\tUse predefined filter names above or your own filter starting with `0x...`.\n\n" 137 | ); 138 | 139 | println!("GREP = "); 140 | println!("\tPrint only lines having at least one character with ."); 141 | println!("\tUseful values are `47` (/) or `92` (\\) for path search."); 142 | println!("\t can be decimal or hexadecimal and 
must be < 128."); 143 | 144 | process::exit(0); 145 | } 146 | } 147 | -------------------------------------------------------------------------------- /src/options.rs: -------------------------------------------------------------------------------- 1 | //! This module deals with command-line arguments and directly related data 2 | //! structures. 3 | 4 | use crate::input::ByteCounter; 5 | use lazy_static::lazy_static; 6 | use std::path::PathBuf; 7 | use std::str::FromStr; 8 | use structopt::StructOpt; 9 | 10 | /// Encoding name literal used when simulating non-built-in 11 | /// ASCII-decoder. 12 | pub const ASCII_ENC_LABEL: &str = "ascii"; 13 | 14 | /// If no command-line argument `--chars_min` is given 15 | /// and none is specified in `--encoding` use this. 16 | /// Must be one of `--list-encodings`. 17 | pub const ENCODING_DEFAULT: &str = "UTF-8"; 18 | 19 | /// Default value, when no `--chars-min` command-line-argument 20 | /// is given. Must be `u8`. 21 | pub const CHARS_MIN_DEFAULT: u8 = 4; 22 | 23 | /// Default value, when no `--counter-offset` command-line-argument 24 | /// is given. 25 | pub const COUNTER_OFFSET_DEFAULT: ByteCounter = 0; 26 | 27 | /// Default value when no `--output-line-len` 28 | /// command-line-argument is given. 29 | pub const OUTPUT_LINE_CHAR_NB_MAX_DEFAULT: usize = 64; 30 | 31 | /// There must be space for at least 3 long Unicode characters, 32 | /// to guarantee progress in streaming. You want much longer lines. 33 | pub const OUTPUT_LINE_CHAR_NB_MIN: usize = 6; 34 | 35 | #[derive(Debug, PartialEq, StructOpt)] 36 | #[structopt( 37 | name = "stringsext", 38 | about = "Find multi-byte encoded strings in binary data." 39 | )] 40 | /// This structure holds the command-line-options and is populated by `docopt`. 41 | /// See man-page and the output of `--list-encodings` and `--help` for more 42 | /// information about their meaning. 
43 | pub struct Args { 44 | /// filter applied after decoding (see 45 | /// `--list-encodings` for AF examples) 46 | #[structopt(long, short = "a")] 47 | pub ascii_filter: Option, 48 | /// never print byte-counter, encoding or filter 49 | #[structopt(long, short = "c")] 50 | pub no_metadata: bool, 51 | #[structopt(long, short = "d")] 52 | /// show how command-line-options are interpreted 53 | pub debug_option: bool, 54 | /// paths to files to scan (or `-` for stdin) 55 | #[structopt(name = "FILE", parse(from_os_str))] 56 | pub inputs: Vec, 57 | /// set (multiple) encodings to search for 58 | #[structopt(long, short = "e")] 59 | pub encoding: Vec, 60 | /// grep for characters with ASCII-code in output lines 61 | #[structopt(long, short = "g")] 62 | pub grep_char: Option, 63 | #[structopt(long, short = "l")] 64 | /// list predefined encoding and filter names for ENC 65 | pub list_encodings: bool, 66 | #[structopt(long, short = "n")] 67 | /// minimum characters of printed strings 68 | pub chars_min: Option, 69 | #[structopt(long, short = "r")] 70 | /// require chars in finding to be in the same Unicode-block 71 | pub same_unicode_block: bool, 72 | #[structopt(long, short = "p", parse(from_os_str))] 73 | /// print not to stdout but in file 74 | pub output: Option, 75 | /// output line length in Unicode-codepoints 76 | #[structopt(long, short = "q")] 77 | pub output_line_len: Option, 78 | /// start counting input bytes with NUM 79 | #[structopt(long, short = "s")] 80 | pub counter_offset: Option, 81 | // enable byte-counter with radix `o`, `x` or `d` 82 | #[structopt(long, short = "t")] 83 | pub radix: Option, 84 | /// filter applied after decoding 85 | /// (see `--list-encodings` for UBF examples) 86 | #[structopt(long, short = "u")] 87 | pub unicode_block_filter: Option, 88 | /// print version and exit 89 | #[structopt(long, short = "V")] 90 | pub version: bool, 91 | } 92 | 93 | #[derive(Debug, Hash, Clone, Eq, PartialEq, Copy)] 94 | /// radix of the `byte-counter` when printed 95 | pub enum Radix { 96 | // octal 97 | O, 98 | // hexadecimal 99 | X, 100 | // decimal 101 | D, 102 | } 103 | 104 | impl FromStr for Radix { 105 | type Err = String; 106 | fn from_str(rad: &str) -> Result { 107 | match &*rad.to_ascii_lowercase() { 108 | "o" => Ok(Radix::O), 109 | "x" => Ok(Radix::X), 110 | "d" => Ok(Radix::D), 111 | _ => Err(String::from("can not convert radix variant")), 112 | } 113 | } 114 | } 115 | 116 | lazy_static! { 117 | /// Structure to hold the parsed command-line arguments. 118 | pub static ref ARGS : Args = Args::from_args(); 119 | } 120 | 121 | #[cfg(test)] 122 | mod tests { 123 | 124 | /// Are the command-line option read and processed correctly? 125 | #[test] 126 | fn test_arg_parser() { 127 | use super::{Args, Radix}; 128 | use std::path::PathBuf; 129 | use structopt::StructOpt; 130 | 131 | // The argv. Normally you"d just use `parse` which will automatically 132 | // use `std::env::args()`. 
133 | let argv = vec![ 134 | "stringsext", 135 | "-d", 136 | "-n", 137 | "10", 138 | "-g", 139 | "64", 140 | "-e", 141 | "ascii", 142 | "-e", 143 | "utf-8", 144 | "-V", 145 | "-l", 146 | "-s", 147 | "1500", 148 | "-p", 149 | "outfile", 150 | "-q", 151 | "40", 152 | "-t", 153 | "o", 154 | "-r", 155 | "infile1", 156 | "infile2", 157 | ]; 158 | let args = Args::from_iter(argv); 159 | 160 | assert_eq!(args.inputs[0], PathBuf::from("infile1")); 161 | assert_eq!(args.inputs[1], PathBuf::from("infile2")); 162 | assert!(args.debug_option); 163 | assert_eq!( 164 | args.encoding, 165 | vec!["ascii".to_string(), "utf-8".to_string()] 166 | ); 167 | assert!(args.version); 168 | assert!(args.list_encodings); 169 | assert_eq!(args.chars_min, Some("10".to_string())); 170 | assert!(args.same_unicode_block); 171 | assert_eq!(args.grep_char, Some("64".to_string())); 172 | assert_eq!(args.radix, Some(Radix::O)); 173 | assert_eq!(args.counter_offset, Some("1500".to_string())); 174 | assert_eq!(args.output, Some(PathBuf::from("outfile"))); 175 | assert_eq!(args.output_line_len, Some("40".to_string())); 176 | assert!(!args.no_metadata); 177 | } 178 | } 179 | -------------------------------------------------------------------------------- /doc/source/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: stringsext - search for multi-byte encoded strings in binary data 3 | --- 4 | 5 | [![Cargo](https://img.shields.io/crates/v/stringsext.svg)]( 6 | https://crates.io/crates/stringsext) 7 | [![Documentation](https://docs.rs/stringsext/badge.svg)]( 8 | https://docs.rs/stringsext) 9 | [![License](https://img.shields.io/badge/license-MIT%2FApache--2.0-blue.svg)]( 10 | https://gitlab.com/getreu/stringsext) 11 | 12 | 13 | **stringsext** is a Unicode enhancement of the *GNU strings* tool with 14 | additional functionalities: **stringsext** recognizes Cyrillic, Arabic, CJKV 15 | characters and other scripts in all supported multi-byte-encodings, while 16 | *GNU strings* fails in finding any of these scripts in UTF-16 and many other 17 | encodings. 18 | 19 | **stringsext** prints all graphic character sequences in *FILE* or 20 | *stdin* that are at least *MIN* bytes long. 21 | 22 | Unlike *GNU strings* **stringsext** can be configured to search for 23 | valid characters not only in ASCII but also in many other input 24 | encodings, e.g.: UTF-8, UTF-16BE, UTF-16LE, BIG5-2003, EUC-JP, KOI8-R 25 | and many others. The option **\--list-encodings** shows a list of valid 26 | encoding names based on the WHATWG Encoding Standard. When more than one 27 | encoding is specified, the scan is performed in different threads 28 | simultaneously. 29 | 30 | When searching for UTF-16 encoded strings, 96% of all possible two byte 31 | sequences, interpreted as UTF-16 code unit, relate directly to Unicode 32 | codepoints. As a result, the probability of encountering valid Unicode 33 | characters in a random byte stream, interpreted as UTF-16, is also 96%. 34 | In order to reduce this big number of false positives, **stringsext** 35 | provides a parametrizable Unicode-block-filter. See **\--encodings** 36 | and **\--same-unicode-block** options in the manual page for more details. 37 | 38 | **stringsext** is mainly useful for extracting Unicode content out of 39 | non-text files. 40 | 41 | When invoked with `stringsext -e ascii` **stringsext** can be used 42 | as *GNU strings* replacement. 
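
For instance, the following sketch (`FILE` is a placeholder for the binary to
examine, and the minimum length of 8 characters is only an example) roughly
corresponds to a `strings -n 8 FILE` run:

```
stringsext -e ascii -n 8 -- FILE
```
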
43 | 44 | 45 | # Screenshot 46 | 47 | ``` 48 | stringsext -tx -e utf-8 -e utf-16le -e utf-16be \ 49 | -n 10 -a None -u African /dev/disk/by-uuid/567a8410 50 | 51 | 3de2fff0+ (b UTF-16LE) ݒݓݔݕݖݗݙݪ 52 | 3de30000+ (b UTF-16LE) ݫݱݶݷݸݹݺ 53 | <3de36528 (a UTF-8) فيأنمامعكلأورديافىهولملكاولهبسالإنهيأيقدهلثمبهلوليبلايبكشيام 54 | >3de36528+ (a UTF-8) أمنتبيلنحبهممشوش 55 | <3de3a708 (a UTF-8) علىإلىهذاآخرعددالىهذهصورغيركانولابينعرضذلكهنايومقالعليانالكن 56 | >3de3a708+ (a UTF-8) حتىقبلوحةاخرفقطعبدركنإذاكمااحدإلافيهبعضكيفبح 57 | 3de3a780+ (a UTF-8) ثومنوهوأناجدالهاسلمعندليسعبرصلىمنذبهاأنهمثلكنتالاحيثمصرشرححو 58 | 3de3a7f8+ (a UTF-8) لوفياذالكلمرةانتالفأبوخاصأنتانهاليعضووقدابنخيربنتلكمشاءوهياب 59 | 3de3a870+ (a UTF-8) وقصصومارقمأحدنحنعدمرأياحةكتبدونيجبمنهتحتجهةسنةيتمكرةغزةنفسبي 60 | 3de3a8e8+ (a UTF-8) تللهلناتلكقلبلماعنهأولشيءنورأمافيكبكلذاترتببأنهمسانكبيعفقدحس 61 | 3de3a960+ (a UTF-8) نلهمشعرأهلشهرقطرطلب 62 | 3df4cca8 (c UTF-16BE) փօև։֋֍֏֑֛֚֓֕֗֙֜֝֞׹ 63 | <3df4cd20 (c UTF-16BE) ־ֿ׀ׁׂ׃ׅׄ׆ׇ׈׉׊׋ 64 | ``` 65 | 66 | 67 | # Documentation 68 | 69 | User documentation 70 | 71 | * [Manual page (html)](/projects/stringsext/stringsext--manpage.html) 72 | 73 | * [Manual page (pdf)](/_downloads/stringsext--manpage.pdf) 74 | 75 | * [Blogposts about Stringsext](/tags/stringsext/) 76 | 77 | * [Paper about Stringsext](https://commons.erau.edu/jdfsl/vol14/iss2/4) 78 | 79 | Developer documentation 80 | 81 | * [API documentation](/projects/stringsext/_downloads/doc/stringsext/) 82 | 83 | * [Forensic Tool Development with Rust](/projects/forensic-tool-development-with-rust) 84 | 85 | # Source code 86 | 87 | Repository 88 | 89 | * [Stringsext on Gitlab](https://gitlab.com/getreu/stringsext) 90 | 91 | * [Stringsext on Github (mirror)](https://github.com/getreu/stringsext) 92 | 93 | # Distribution 94 | 95 | * Binaries for Ubuntu-Linux 18.04, Windows, MacOS (see below for 96 | Debian binaries) 97 | 98 | 1. Open: [Releases - getreu/stringsext](https://github.com/getreu/stringsext/releases) 99 | 100 | 2. Open the latest release. 101 | 102 | 3. Open *assets*. 103 | 104 | 4. Download the packed executable for your operating system. 105 | 106 | 5. Installation: see below. 
107 | 108 | * Binaries and packages (usually built from latest commit): 109 | 110 | - Executable for Windows: 111 | 112 | [x86_64-pc-windows-gnu/release/stringsext.exe](/projects/stringsext/_downloads/x86_64-pc-windows-gnu/release/stringsext.exe) 113 | 114 | - Binary for Debian 10 Buster: 115 | 116 | [x86_64-unknown-linux-gnu/release/stringsext](/projects/stringsext/_downloads/x86_64-unknown-linux-gnu/release/stringsext) 117 | 118 | [x86_64-unknown-linux-musl/release/stringsext](/projects/stringsext/_downloads/x86_64-unknown-linux-musl/release/stringsext) 119 | 120 | [i686-unknown-linux-gnu/release/stringsext](/projects/stringsext/_downloads/i686-unknown-linux-gnu/release/stringsext) 121 | 122 | [i686-unknown-linux-musl/release/stringsext](/projects/stringsext/_downloads/i686-unknown-linux-musl/release/stringsext) 123 | 124 | - Package for Debian 10 Buster: 125 | 126 | [x86_64-unknown-linux-gnu/debian/stringsext_2.3.5_amd64.deb](/projects/stringsext/_downloads/x86_64-unknown-linux-gnu/debian/stringsext_2.3.5_amd64.deb) 127 | 128 | [i686-unknown-linux-gnu/debian/stringsext_2.3.5_i386.deb](/projects/stringsext/_downloads/i686-unknown-linux-gnu/debian/stringsext_2.3.5_i386.deb) 129 | 130 | - Package Raspberry Pi 32 bit: 131 | 132 | [armv7-unknown-linux-gnueabihf/release/stringsext](/projects/stringsext/_downloads/armv7-unknown-linux-gnueabihf/release/stringsext) 133 | 134 | - Package Raspberry Pi 64 bit: 135 | 136 | [aarch64-unknown-linux-gnu/release/stringsext](/projects/stringsext/_downloads/aarch64-unknown-linux-gnu/release/stringsext) 137 | 138 | 139 | * Installable Unix man-page: 140 | 141 | - [stringsext.1.gz](/projects/stringsext/_downloads/stringsext.1.gz) 142 | 143 | * Zipfile with all binaries and documentation: 144 | 145 | - [stringsext all](/_downloads/stringsext.zip) 146 | 147 | 148 | 149 | # Building and installing 150 | 151 | 1. [Install Rust](https://www.rust-lang.org/tools/install), e.g. 152 | 153 | curl https://sh.rustup.rs -sSf | sh 154 | 155 | 2. Download, compile and install: 156 | 157 | cargo install stringsext 158 | sudo cp ~/.cargo/bin/stringsext /usr/local/bin 159 | 160 | This project follows [Semantic Versioning](https://semver.org/). 161 | 162 | 163 | 164 | # About 165 | 166 | Author 167 | 168 | * Jens Getreu 169 | 170 | Copyright 171 | 172 | * Apache 2 license or MIT license 173 | 174 | Build status 175 | 176 | * ![status](https://travis-ci.org/getreu/stringsext.svg?branch=master) 177 | 178 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: stringsext - search for multi-byte encoded strings in binary data 3 | --- 4 | 5 | [![Cargo](https://img.shields.io/crates/v/stringsext.svg)]( 6 | https://crates.io/crates/stringsext) 7 | [![Documentation](https://docs.rs/stringsext/badge.svg)]( 8 | https://docs.rs/stringsext) 9 | [![License](https://img.shields.io/badge/license-MIT%2FApache--2.0-blue.svg)]( 10 | https://gitlab.com/getreu/stringsext) 11 | 12 | 13 | **stringsext** is a Unicode enhancement of the *GNU strings* tool with 14 | additional functionalities: **stringsext** recognizes Cyrillic, Arabic, CJKV 15 | characters and other scripts in all supported multi-byte-encodings, while 16 | *GNU strings* fails in finding any of these scripts in UTF-16 and many other 17 | encodings. 18 | 19 | **stringsext** prints all graphic character sequences in *FILE* or 20 | *stdin* that are at least *MIN* bytes long. 
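
As a minimal sketch (the file name `FILE` is a placeholder and the minimum
length of 10 characters is just an example), such a scan can be started with:

```
stringsext -e UTF-8 -n 10 -- FILE
```

To read from *stdin* instead, pass `-` as file name, e.g.
`cat FILE | stringsext -e UTF-8 -n 10 -- -`.
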
21 | 22 | Unlike *GNU strings* **stringsext** can be configured to search for 23 | valid characters not only in ASCII but also in many other input 24 | encodings, e.g.: UTF-8, UTF-16BE, UTF-16LE, BIG5-2003, EUC-JP, KOI8-R 25 | and many others. The option **\--list-encodings** shows a list of valid 26 | encoding names based on the WHATWG Encoding Standard. When more than one 27 | encoding is specified, the scan is performed in different threads 28 | simultaneously. 29 | 30 | When searching for UTF-16 encoded strings, 96% of all possible two byte 31 | sequences, interpreted as UTF-16 code unit, relate directly to Unicode 32 | codepoints. As a result, the probability of encountering valid Unicode 33 | characters in a random byte stream, interpreted as UTF-16, is also 96%. 34 | In order to reduce this big number of false positives, **stringsext** 35 | provides a parametrizable Unicode-block-filter. See **\--encodings** 36 | and **\--same-unicode-block** options in the manual page for more details. 37 | 38 | **stringsext** is mainly useful for extracting Unicode content out of 39 | non-text files. 40 | 41 | When invoked with `stringsext -e ascii` **stringsext** can be used 42 | as *GNU strings* replacement. 43 | 44 | 45 | # Screenshot 46 | 47 | ``` 48 | stringsext -tx -e utf-8 -e utf-16le -e utf-16be \ 49 | -n 10 -a None -u African /dev/disk/by-uuid/567a8410 50 | 51 | 3de2fff0+ (b UTF-16LE) ݒݓݔݕݖݗݙݪ 52 | 3de30000+ (b UTF-16LE) ݫݱݶݷݸݹݺ 53 | <3de36528 (a UTF-8) فيأنمامعكلأورديافىهولملكاولهبسالإنهيأيقدهلثمبهلوليبلايبكشيام 54 | >3de36528+ (a UTF-8) أمنتبيلنحبهممشوش 55 | <3de3a708 (a UTF-8) علىإلىهذاآخرعددالىهذهصورغيركانولابينعرضذلكهنايومقالعليانالكن 56 | >3de3a708+ (a UTF-8) حتىقبلوحةاخرفقطعبدركنإذاكمااحدإلافيهبعضكيفبح 57 | 3de3a780+ (a UTF-8) ثومنوهوأناجدالهاسلمعندليسعبرصلىمنذبهاأنهمثلكنتالاحيثمصرشرححو 58 | 3de3a7f8+ (a UTF-8) لوفياذالكلمرةانتالفأبوخاصأنتانهاليعضووقدابنخيربنتلكمشاءوهياب 59 | 3de3a870+ (a UTF-8) وقصصومارقمأحدنحنعدمرأياحةكتبدونيجبمنهتحتجهةسنةيتمكرةغزةنفسبي 60 | 3de3a8e8+ (a UTF-8) تللهلناتلكقلبلماعنهأولشيءنورأمافيكبكلذاترتببأنهمسانكبيعفقدحس 61 | 3de3a960+ (a UTF-8) نلهمشعرأهلشهرقطرطلب 62 | 3df4cca8 (c UTF-16BE) փօև։֋֍֏֑֛֚֓֕֗֙֜֝֞׹ 63 | <3df4cd20 (c UTF-16BE) ־ֿ׀ׁׂ׃ׅׄ׆ׇ׈׉׊׋ 64 | ``` 65 | 66 | 67 | # Documentation 68 | 69 | User documentation 70 | 71 | * [Manual page (html)](https://blog.getreu.net/projects/stringsext/stringsext--manpage.html) 72 | 73 | * [Manual page (pdf)](https://blog.getreu.net/_downloads/stringsext--manpage.pdf) 74 | 75 | * [Blogposts about Stringsext](https://blog.getreu.net/tags/stringsext/) 76 | 77 | * [Paper about Stringsext](https://commons.erau.edu/jdfsl/vol14/iss2/4) 78 | 79 | Developer documentation 80 | 81 | * [API documentation](https://blog.getreu.net/projects/stringsext/_downloads/doc/stringsext/) 82 | 83 | * [Forensic Tool Development with Rust](https://blog.getreu.net/projects/forensic-tool-development-with-rust) 84 | 85 | # Source code 86 | 87 | Repository 88 | 89 | * [Stringsext on Gitlab](https://gitlab.com/getreu/stringsext) 90 | 91 | * [Stringsext on Github (mirror)](https://github.com/getreu/stringsext) 92 | 93 | # Distribution 94 | 95 | * Binaries for Ubuntu-Linux 18.04, Windows, MacOS (see below for 96 | Debian binaries) 97 | 98 | 1. Open: [Releases - getreu/stringsext](https://github.com/getreu/stringsext/releases) 99 | 100 | 2. Open the latest release. 101 | 102 | 3. Open *assets*. 103 | 104 | 4. Download the packed executable for your operating system. 105 | 106 | 5. Installation: see below. 
107 | 108 | * Binaries and packages (usually built from latest commit): 109 | 110 | - Executable for Windows: 111 | 112 | [x86_64-pc-windows-gnu/release/stringsext.exe](https://blog.getreu.net/projects/stringsext/_downloads/x86_64-pc-windows-gnu/release/stringsext.exe) 113 | 114 | - Binary for Debian 10 Buster: 115 | 116 | [x86_64-unknown-linux-gnu/release/stringsext](https://blog.getreu.net/projects/stringsext/_downloads/x86_64-unknown-linux-gnu/release/stringsext) 117 | 118 | [x86_64-unknown-linux-musl/release/stringsext](https://blog.getreu.net/projects/stringsext/_downloads/x86_64-unknown-linux-musl/release/stringsext) 119 | 120 | [i686-unknown-linux-gnu/release/stringsext](https://blog.getreu.net/projects/stringsext/_downloads/i686-unknown-linux-gnu/release/stringsext) 121 | 122 | [i686-unknown-linux-musl/release/stringsext](https://blog.getreu.net/projects/stringsext/_downloads/i686-unknown-linux-musl/release/stringsext) 123 | 124 | - Package for Debian 10 Buster: 125 | 126 | [x86_64-unknown-linux-gnu/debian/stringsext_2.3.5_amd64.deb](https://blog.getreu.net/projects/stringsext/_downloads/x86_64-unknown-linux-gnu/debian/stringsext_2.3.5_amd64.deb) 127 | 128 | [i686-unknown-linux-gnu/debian/stringsext_2.3.5_i386.deb](https://blog.getreu.net/projects/stringsext/_downloads/i686-unknown-linux-gnu/debian/stringsext_2.3.5_i386.deb) 129 | 130 | - Package Raspberry Pi 32 bit: 131 | 132 | [armv7-unknown-linux-gnueabihf/release/stringsext](https://blog.getreu.net/projects/stringsext/_downloads/armv7-unknown-linux-gnueabihf/release/stringsext) 133 | 134 | - Package Raspberry Pi 64 bit: 135 | 136 | [aarch64-unknown-linux-gnu/release/stringsext](https://blog.getreu.net/projects/stringsext/_downloads/aarch64-unknown-linux-gnu/release/stringsext) 137 | 138 | 139 | * Installable Unix man-page: 140 | 141 | - [stringsext.1.gz](https://blog.getreu.net/projects/stringsext/_downloads/stringsext.1.gz) 142 | 143 | * Zipfile with all binaries and documentation: 144 | 145 | - [stringsext all](https://blog.getreu.net/_downloads/stringsext.zip) 146 | 147 | 148 | 149 | # Building and installing 150 | 151 | 1. [Install Rust](https://www.rust-lang.org/tools/install), e.g. 152 | 153 | curl https://sh.rustup.rs -sSf | sh 154 | 155 | 2. Download, compile and install: 156 | 157 | cargo install stringsext 158 | sudo cp ~/.cargo/bin/stringsext /usr/local/bin 159 | 160 | This project follows [Semantic Versioning](https://semver.org/). 161 | 162 | 163 | 164 | # About 165 | 166 | Author 167 | 168 | * Jens Getreu 169 | 170 | Copyright 171 | 172 | * Apache 2 license or MIT license 173 | 174 | Build status 175 | 176 | * ![status](https://travis-ci.org/getreu/stringsext.svg?branch=master) 177 | 178 | -------------------------------------------------------------------------------- /src/finding.rs: -------------------------------------------------------------------------------- 1 | //! Store string-findings and prepare them for output. 2 | 3 | extern crate encoding_rs; 4 | 5 | use crate::input::ByteCounter; 6 | use crate::mission::Mission; 7 | use crate::options::Radix; 8 | use crate::options::ARGS; 9 | use crate::options::ASCII_ENC_LABEL; 10 | use std::io::Write; 11 | use std::str; 12 | 13 | /// `OUTPUT_BUF_LEN` needs to be long enough to hold all findings that are 14 | /// decoded to UTF-8 in `scan::scan()`. 
To estimate the space needed to receive 15 | /// all decodings in UTF-8, the worst case - Asian like `EUC_JP` - has to be 16 | /// taken into consideration: Therefor, in order to avoid output buffer overflow, 17 | /// `OUTPUT_BUF_LEN` should be at least twice as big as `INPUT_BUF_LEN`. You can 18 | /// also check the minimum length with 19 | /// `Decoder::max_utf8_buffer_length_without_replacement`. Unfortunately this can 20 | /// not be done programmatically, because `output_buffer` is a statically 21 | /// allocated array. 22 | #[cfg(not(test))] 23 | pub const OUTPUT_BUF_LEN: usize = 0x9192; 24 | #[cfg(test)] 25 | pub const OUTPUT_BUF_LEN: usize = 0x40; 26 | 27 | /// Extra space in bytes for `ByteCounter` and encoding-name when `Finding::print()` 28 | /// prints a `Finding`. 29 | pub const OUTPUT_LINE_METADATA_LEN: usize = 40; 30 | 31 | #[derive(Debug, Eq, PartialEq)] 32 | /// Used to express the precision of `Finding::position` when the algorithm can 33 | /// not determine its exact position. 34 | pub enum Precision { 35 | /// The finding is located somewhere before `Finding::position`. It is 36 | /// guarantied, that the finding is not farer than 2*`--output-line-len` 37 | /// bytes (or the previous finding from the same scanner) away. 38 | Before, 39 | /// The algorithm could determine the exact position of the `Finding` at 40 | /// `Finding::position`. 41 | Exact, 42 | /// The finding is located some `[1..2* --output_line_len]` bytes after 43 | /// `Finding::position` or - in any case - always before the next 44 | /// `Finding::position`. 45 | After, 46 | } 47 | 48 | /// `Finding` represents a valid result string decoded to UTF-8 with it's 49 | /// original location and its original encoding in the input stream. 50 | #[derive(Debug)] 51 | pub struct Finding<'a> { 52 | /// A label identifying the origin of the input data: If the origin of the data 53 | /// is `stdin`: `None`, otherwise: `Some(1)` for input coming from the first 54 | /// file, `Some(2)` for input from the second file, `Some(3)` for ... 55 | pub input_file_id: Option, 56 | /// `Mission` associated with this finding. We need a reference to the 57 | /// corresponding `Mission` object here, in order to get additional information, 58 | /// e.g. the label of the encoding, when we print this `Finding`. 59 | pub mission: &'static Mission, 60 | /// The byte number position of this `Finding` in the input stream. 61 | pub position: ByteCounter, 62 | /// In some cases the `position` can not be determined exactly. Therefor, 63 | /// `position_precision` indicates how well the finding is localized. In case 64 | /// that the position is not exactly known, we indicate if the finding is 65 | /// somewhere before or after `position`. 66 | pub position_precision: Precision, 67 | /// Whatever the original encoding was, the result string `s` is always stored as 68 | /// UTF-8. `s` is a `&str` pointing into `FindingCollection::output_buffer`. 69 | pub s: &'a str, 70 | /// This flag indicates that `s` holds only the second part of a cut finding 71 | /// from the previous `scanner::scan()` run. This can happen when a finding from 72 | /// the previous run has hit the`input_buffer`-boundary. 73 | pub s_completes_previous_s: bool, 74 | } 75 | 76 | impl Eq for Finding<'_> {} 77 | 78 | /// Useful to compare findings for debugging or testing. 
79 | impl PartialEq for Finding<'_> { 80 | fn eq(&self, other: &Self) -> bool { 81 | (self.position == other.position) 82 | && (self.position_precision == other.position_precision) 83 | && (self.mission.encoding.name() == other.mission.encoding.name()) 84 | && (self.mission.filter == other.mission.filter) 85 | && (self.s == other.s) 86 | } 87 | } 88 | 89 | /// When `itertools::kmerge()` merges `FindingCollections` into an iterator over 90 | /// `Finding` s, it needs to compare `Finding` s. Therefor, we must implement 91 | /// `PartialOrd`. 92 | impl PartialOrd for Finding<'_> { 93 | fn partial_cmp(&self, other: &Self) -> Option { 94 | if self.position != other.position { 95 | self.position.partial_cmp(&other.position) 96 | } else if self.mission.mission_id != other.mission.mission_id { 97 | self.mission 98 | .mission_id 99 | .partial_cmp(&other.mission.mission_id) 100 | } else if self.mission.filter.ubf != other.mission.filter.ubf { 101 | self.mission 102 | .filter 103 | .ubf 104 | .partial_cmp(&other.mission.filter.ubf) 105 | } else { 106 | self.mission.filter.af.partial_cmp(&other.mission.filter.af) 107 | } 108 | } 109 | } 110 | 111 | impl<'a> Finding<'a> { 112 | pub fn print(&self, out: &mut dyn Write) -> Result<(), Box> { 113 | out.write_all(b"\n")?; 114 | if !ARGS.no_metadata { 115 | if ARGS.inputs.len() > 1 { 116 | if let Some(i) = self.input_file_id { 117 | // map 1 -> 'A', 2 -> 'B', 3 -> 'C' 118 | out.write_all(&[i + 64_u8, b' '])?; 119 | } 120 | }; 121 | 122 | if ARGS.radix.is_some() { 123 | match &self.position_precision { 124 | Precision::After => out.write_all(b">")?, 125 | Precision::Exact => out.write_all(b" ")?, 126 | Precision::Before => out.write_all(b"<")?, 127 | }; 128 | match ARGS.radix { 129 | Some(Radix::X) => out.write_fmt(format_args!("{:0x}", self.position,))?, 130 | Some(Radix::D) => out.write_fmt(format_args!("{:0}", self.position,))?, 131 | Some(Radix::O) => out.write_fmt(format_args!("{:0o}", self.position,))?, 132 | None => {} 133 | }; 134 | if self.s_completes_previous_s { 135 | out.write_all(b"+\t")? 136 | } else { 137 | out.write_all(b" \t")? 138 | }; 139 | } 140 | 141 | if ARGS.encoding.len() > 1 { 142 | // map 0 -> 'a', 1 -> 'b', 2 -> 'c' ... 143 | out.write_all(&[b'(', self.mission.mission_id + 97_u8, b' '])?; 144 | out.write_all(if self.mission.print_encoding_as_ascii { 145 | ASCII_ENC_LABEL.as_bytes() 146 | } else { 147 | self.mission.encoding.name().as_bytes() 148 | })?; 149 | // After ")" send two tabs. 150 | out.write_all(b")\t")?; 151 | }; 152 | }; 153 | out.write_all(self.s.as_bytes())?; 154 | Ok(()) 155 | } 156 | } 157 | -------------------------------------------------------------------------------- /src/input.rs: -------------------------------------------------------------------------------- 1 | //! Cut the input stream in chunks for batch processing. 2 | 3 | use crate::as_mut_slice_no_borrow_check; 4 | use crate::options::ARGS; 5 | use std::fs::File; 6 | use std::io; 7 | use std::io::Read; 8 | use std::iter::Peekable; 9 | use std::path::Path; 10 | use std::path::PathBuf; 11 | use std::slice; 12 | use std::slice::Iter; 13 | 14 | /// This is the type used to count bytes in the input stream. Maybe in a future 15 | /// version we raise this to `u128`. 16 | pub type ByteCounter = u64; 17 | 18 | /// This is the size of `input_buffer` in bytes. It should be aligned with a 19 | /// multiple of the memory page size, which is - depending on the hardware - `n * 20 | /// 4096` bytes. 
21 | #[cfg(not(test))] 22 | pub const INPUT_BUF_LEN: usize = 4096; 23 | 24 | #[cfg(test)] 25 | pub const INPUT_BUF_LEN: usize = 0x20; 26 | 27 | /// Struct to store the `Slicer`-iterator state. The iterator fills the 28 | /// `input-buffer` with bytes coming from files, whose names are given in the 29 | /// vector `ARGS.inputs`. When one file is exhausted, the iterator switches 30 | /// automatically and transparently to the next file in `ARGS.inputs`. When no 31 | /// data is left in any file, `next()` returns `None`. 32 | 33 | pub struct Slicer<'a> { 34 | /// An iterator over `ARGS.inputs` wrapped in an option. If the option is 35 | /// `Some()`, then the input should be read from files, whose filenames are 36 | /// delivered with the iterator's `next()`. If the option is `None`, then the 37 | /// data comes from `std::stdin`. 38 | filename_iter: Option>>, 39 | 40 | /// The reader associated with the current file. 41 | reader: Box, 42 | 43 | /// An index identifying the source of the input: 44 | /// The input comes from: 45 | /// * 0: `stdin`, 46 | /// * 1: the first file in `ARGS.inputs`, 47 | /// * 2: the second file in `ARGS.inputs`, 48 | /// * 3: ... 49 | current_input_idx: usize, 50 | 51 | /// Is true, when this is the last iteration. After this, comes 52 | /// only `None`. 53 | current_input_is_last: bool, 54 | 55 | /// Buffer to store all incoming bytes from the readers. The input is 56 | /// streamed in this buffer first, before being analysed later in batches. 57 | input_buffer: [u8; INPUT_BUF_LEN], 58 | } 59 | 60 | impl Slicer<'_> { 61 | #[inline] 62 | pub fn new() -> Self { 63 | if (ARGS.inputs.is_empty()) 64 | || ((ARGS.inputs.len() == 1) && ARGS.inputs[0] == Path::new("-")) 65 | { 66 | Self { 67 | filename_iter: None, 68 | reader: Box::new(io::stdin()) as Box, 69 | current_input_idx: 0, 70 | current_input_is_last: true, 71 | input_buffer: [0u8; INPUT_BUF_LEN], 72 | } 73 | } else { 74 | let mut filename_iter = ARGS.inputs.iter().peekable(); 75 | // `unwrap()` is save because we know `if` above, that there is at least one 76 | // filename. 77 | let filename = filename_iter.next().unwrap(); 78 | let reader = match File::open(Path::new(filename)) { 79 | Ok(file) => Box::new(file) as Box, 80 | Err(e) => { 81 | eprintln!("Error: can not read file`{:?}`: {}", filename, e); 82 | Box::new(io::empty()) as Box 83 | } 84 | }; 85 | let current_input_is_last = filename_iter.peek().is_none(); 86 | 87 | Self { 88 | filename_iter: Some(filename_iter), 89 | // Just to start with something, will be overwritten 90 | // immediately. 91 | reader, 92 | // Convention here: `0` means "not started". 93 | current_input_idx: 1, 94 | // There might be more than one file. 95 | current_input_is_last, 96 | input_buffer: [0u8; INPUT_BUF_LEN], 97 | } 98 | } 99 | } 100 | } 101 | 102 | /// Iterator over the input stream coming from `std::stdin` or from files whose 103 | /// names are listed in `ARGS.inputs`. 104 | impl<'a> Iterator for Slicer<'a> { 105 | /// The iterator's `next()` returns a tuple `(&[u8], Option, bool)` with 3 members: 106 | /// * First member `&[u8]`: \ 107 | /// a slice of input bytes comprising all valid bytes in `input_buffer`. 108 | /// * Second member `Option`:\ 109 | /// A label identifying the origin of the bytes in `&[u8]`:\ 110 | /// * `None`: the origin of the input is `stdin`, 111 | /// * `Some(1)`: the bytes come from the first file in `ARGS.inputs`, 112 | /// * `Some(2)`: the bytes come from the second file in `ARGS.inputs`, 113 | /// * `Some(3)`: ... 
114 | /// * Third member `bool`:\ 115 | /// * `true`: this chunk of input data is the very last one. All further 116 | /// `next()` will return `None`. 117 | /// * `false`: More input data will come with the next `next()`. 118 | type Item = (&'a [u8], Option<u8>, bool); 119 | /// Returns the next slice of input. 120 | fn next(&mut self) -> Option<Self::Item> { 121 | let input_buffer_slice = as_mut_slice_no_borrow_check!(self.input_buffer); 122 | // Fill the input buffer. 123 | let no_bytes_received = self.reader.read(input_buffer_slice).unwrap_or_else(|_| { 124 | panic!( 125 | "Error: Could not read input stream no. {}", 126 | self.current_input_idx 127 | ) 128 | }); 129 | let result = &input_buffer_slice[..no_bytes_received]; 130 | let this_stream_ended = no_bytes_received == 0; 131 | let input_ended = self.current_input_is_last && this_stream_ended; 132 | 133 | // More files to open? 134 | if this_stream_ended { 135 | if self.current_input_is_last { 136 | // Early return 137 | return None; 138 | } else { 139 | // We can safely do the first `unwrap()` because 140 | // `!self.current_input_is_last` can only be true 141 | // if `self.filename_iter` is not `None`. 142 | // We can safely do the second `unwrap()` here, because we have already 143 | // `peek()`ed and know there is at least one more filename. 144 | let filename = self.filename_iter.as_mut().unwrap().next().unwrap(); 145 | self.current_input_idx += 1; 146 | // The next run needs to know if there is more. 147 | self.current_input_is_last = self.filename_iter.as_mut().unwrap().peek().is_none(); 148 | let reader = match File::open(Path::new(filename)) { 149 | Ok(file) => Box::new(file) as Box<dyn Read>, 150 | Err(e) => { 151 | eprintln!("Error: can not read file: {}", e); 152 | Box::new(io::empty()) as Box<dyn Read> 153 | } 154 | }; 155 | // Store the reader for the `next()` run. 156 | self.reader = reader; 157 | } 158 | }; 159 | 160 | // Change type for output. 161 | let current_file_id = match self.current_input_idx { 162 | 0 => None, 163 | // Map 1 -> "A", 2 -> "B", ... 
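// (Descriptive note, not in the original source: the file id is only stored here as a
// number; the letter shown in the output is produced later in `Finding::print()`,
// which adds 64 to this id, i.e. 1 -> 'A', 2 -> 'B', ...)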
164 | c => Some(c as u8), 165 | }; 166 | Some((result, current_file_id, input_ended)) 167 | } 168 | } 169 | -------------------------------------------------------------------------------- /.azure-pipelines.yml: -------------------------------------------------------------------------------- 1 | trigger: 2 | branches: 3 | include: ['*'] 4 | tags: 5 | include: ['*'] 6 | 7 | stages: 8 | - stage: Tests 9 | jobs: 10 | - job: 11 | strategy: 12 | matrix: 13 | windows-stable: 14 | imageName: 'windows-latest' 15 | rustup_toolchain: stable 16 | target: 'x86_64-pc-windows-msvc' 17 | mac-stable: 18 | imageName: 'macOS-latest' 19 | rustup_toolchain: stable 20 | target: 'x86_64-apple-darwin' 21 | linux-stable: 22 | imageName: 'ubuntu-latest' 23 | rustup_toolchain: stable 24 | target: 'x86_64-unknown-linux-gnu' 25 | pool: 26 | vmImage: $(imageName) 27 | steps: 28 | - script: | 29 | curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain $RUSTUP_TOOLCHAIN 30 | echo "##vso[task.setvariable variable=PATH;]$PATH:$HOME/.cargo/bin" 31 | displayName: Install rust 32 | condition: ne( variables['Agent.OS'], 'Windows_NT' ) 33 | - script: | 34 | curl -sSf -o rustup-init.exe https://win.rustup.rs 35 | rustup-init.exe -y --default-toolchain %RUSTUP_TOOLCHAIN% --default-host x86_64-pc-windows-msvc 36 | echo "##vso[task.setvariable variable=PATH;]%PATH%;%USERPROFILE%\.cargo\bin" 37 | displayName: Windows install rust 38 | condition: eq( variables['Agent.OS'], 'Windows_NT' ) 39 | - script: cargo build --all 40 | displayName: Cargo build 41 | - script: cargo test --all 42 | displayName: Cargo test 43 | 44 | 45 | - stage: Release 46 | dependsOn: Tests 47 | condition: startsWith(variables['Build.SourceBranch'], 'refs/tags/') 48 | jobs: 49 | - job: 50 | strategy: 51 | matrix: 52 | windows-stable: 53 | imageName: 'windows-latest' 54 | rustup_toolchain: stable 55 | target: 'x86_64-pc-windows-msvc' 56 | mac-stable: 57 | imageName: 'macOS-latest' 58 | rustup_toolchain: stable 59 | target: 'x86_64-apple-darwin' 60 | linux-stable: 61 | imageName: 'ubuntu-latest' 62 | rustup_toolchain: stable 63 | target: 'x86_64-unknown-linux-gnu' 64 | pool: 65 | vmImage: $(imageName) 66 | steps: 67 | - script: | 68 | curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain $RUSTUP_TOOLCHAIN 69 | echo "##vso[task.setvariable variable=PATH;]$PATH:$HOME/.cargo/bin" 70 | displayName: Install rust 71 | condition: ne( variables['Agent.OS'], 'Windows_NT' ) 72 | - script: | 73 | cargo install cargo-deb 74 | condition: eq( variables['Agent.OS'], 'Linux' ) 75 | displayName: Install build dependencies 76 | - script: | 77 | set CARGO_HOME=%USERPROFILE%\.cargo 78 | curl -sSf -o rustup-init.exe https://win.rustup.rs 79 | rustup-init.exe -y --default-toolchain %RUSTUP_TOOLCHAIN% --default-host x86_64-pc-windows-msvc 80 | set PATH=%PATH%;%USERPROFILE%\.cargo\bin 81 | echo "##vso[task.setvariable variable=PATH;]%PATH%;%USERPROFILE%\.cargo\bin" 82 | displayName: Windows install rust 83 | condition: eq( variables['Agent.OS'], 'Windows_NT' ) 84 | - script: | 85 | rustup target add $TARGET 86 | cargo build --release --target $TARGET 87 | strip '$(Build.SourcesDirectory)/target/$(TARGET)/release/stringsext' 88 | condition: ne( variables['Agent.OS'], 'Windows_NT' ) 89 | displayName: Build 90 | - script: | 91 | sudo apt-get install pandoc docbook5-xml docbook-xsl-ns xsltproc fop xmlto libxml2-utils xmlstarlet 92 | cd '$(Build.SourcesDirectory)/doc/' 93 | './make--all' 94 | condition: eq( variables['Agent.OS'], 'Linux' ) 95 | displayName: Build 
documentation 96 | - script: | 97 | cargo deb --target $TARGET 98 | condition: eq( variables['Agent.OS'], 'Linux' ) 99 | displayName: Build deb package 100 | - script: | 101 | rustup target add %TARGET% 102 | cargo build --release --target %TARGET% 103 | condition: eq( variables['Agent.OS'], 'Windows_NT' ) 104 | displayName: Build on Windows 105 | - task: CopyFiles@2 106 | displayName: Copy documentation assets 107 | condition: eq( variables['Agent.OS'], 'Linux' ) 108 | inputs: 109 | sourceFolder: '$(Build.SourcesDirectory)/doc/build/pdf' 110 | contents: '*.pdf' 111 | targetFolder: '$(Build.ArtifactStagingDirectory)/' 112 | - task: CopyFiles@2 113 | displayName: Copy assets 114 | condition: ne( variables['Agent.OS'], 'Windows_NT' ) 115 | inputs: 116 | sourceFolder: '$(Build.SourcesDirectory)/target/$(TARGET)/release' 117 | contents: stringsext 118 | targetFolder: '$(Build.BinariesDirectory)/' 119 | - task: CopyFiles@2 120 | displayName: Copy deb package 121 | condition: eq( variables['Agent.OS'], 'Linux' ) 122 | inputs: 123 | sourceFolder: '$(Build.SourcesDirectory)/target/$(TARGET)/debian' 124 | contents: '*.deb' 125 | targetFolder: '$(Build.ArtifactStagingDirectory)/' 126 | - task: CopyFiles@2 127 | displayName: Copy assets on Windows 128 | condition: eq( variables['Agent.OS'], 'Windows_NT' ) 129 | inputs: 130 | sourceFolder: '$(Build.SourcesDirectory)/target/$(TARGET)/release' 131 | contents: stringsext.exe 132 | targetFolder: '$(Build.BinariesDirectory)/' 133 | 134 | - task: ArchiveFiles@2 135 | displayName: Gather assets 136 | condition: ne( variables['Agent.OS'], 'Windows_NT' ) 137 | inputs: 138 | rootFolderOrFile: '$(Build.BinariesDirectory)/stringsext' 139 | archiveType: 'tar' 140 | tarCompression: 'gz' 141 | archiveFile: '$(Build.ArtifactStagingDirectory)/stringsext-$(Build.SourceBranchName)-$(TARGET).tar.gz' 142 | - task: ArchiveFiles@2 143 | displayName: Gather assets 144 | condition: eq( variables['Agent.OS'], 'Windows_NT' ) 145 | inputs: 146 | rootFolderOrFile: '$(Build.BinariesDirectory)/stringsext.exe' 147 | archiveType: 'zip' 148 | archiveFile: '$(Build.ArtifactStagingDirectory)/stringsext-$(Build.SourceBranchName)-$(TARGET).zip' 149 | 150 | - task: GithubRelease@0 151 | condition: eq( variables['Agent.OS'], 'Linux' ) 152 | inputs: 153 | gitHubConnection: 'github.com_getreu' 154 | repositoryName: 'getreu/stringsext' 155 | action: 'edit' 156 | target: '$(build.sourceVersion)' 157 | tagSource: 'manual' 158 | tag: '$(Build.SourceBranchName)' 159 | assets: '$(Build.ArtifactStagingDirectory)/*' 160 | title: '$(Build.SourceBranchName)' 161 | assetUploadMode: 'replace' 162 | addChangeLog: false 163 | - task: GithubRelease@0 164 | condition: eq( variables['Agent.OS'], 'Darwin' ) 165 | inputs: 166 | gitHubConnection: 'github.com_getreu' 167 | repositoryName: 'getreu/stringsext' 168 | action: 'edit' 169 | target: '$(build.sourceVersion)' 170 | tagSource: 'manual' 171 | tag: '$(Build.SourceBranchName)' 172 | assets: '$(Build.ArtifactStagingDirectory)/stringsext-$(Build.SourceBranchName)-$(TARGET).tar.gz' 173 | title: '$(Build.SourceBranchName)' 174 | assetUploadMode: 'replace' 175 | addChangeLog: false 176 | - task: GithubRelease@0 177 | condition: eq( variables['Agent.OS'], 'Windows_NT' ) 178 | inputs: 179 | gitHubConnection: 'github.com_getreu' 180 | repositoryName: 'getreu/stringsext' 181 | action: 'edit' 182 | target: '$(build.sourceVersion)' 183 | tagSource: 'manual' 184 | tag: '$(Build.SourceBranchName)' 185 | assets: 
'$(Build.ArtifactStagingDirectory)/stringsext-$(Build.SourceBranchName)-$(TARGET).zip' 186 | title: '$(Build.SourceBranchName)' 187 | assetUploadMode: 'replace' 188 | addChangeLog: false 189 | -------------------------------------------------------------------------------- /tests/functional/expected_output2: -------------------------------------------------------------------------------- 1 | 2 | A 500 (a UTF-8) A Gathering: Chicago 3 | A <540 (a UTF-8) Who Moved My Cheese?: The Story 4 | A <580 (a UTF-8) The Mice: Sniff & Scurry 5 | A >580 (a UTF-8) The Little people: Hem & Haw 6 | A <640 (a UTF-8) A Discussion: Later That Same Da 7 | A >640+ (a UTF-8) y 8 | A <700 (a UTF-8) the mice: "Sniff" and "Scurry;' 9 | A >700+ (a UTF-8) and 10 | A >700 (a UTF-8) the Little people: "Hem" and "Ha 11 | A >700+ (a UTF-8) w" 12 | A 900 (a UTF-8) hing in common: 13 | A 1a80 (a UTF-8) common: every morning, they each 14 | A >1a80+ (a UTF-8) pu 15 | A 1ac0+ (a UTF-8) t on their jogging suits and run 16 | A >1ac0+ (a UTF-8) ning 17 | A 27c0 (a UTF-8) them smile. One read: 18 | A 2ec0 (a UTF-8) Who Moved My Cheese? The Movie: 19 | A 2fc0 (a UTF-8) Aft A-Mawng Change Profile: 20 | A 3200 (a UTF-8) To learn more, visit: 21 | B <10272 (b UTF-16LE) input device path: "%s" 22 | B <102b2 (b UTF-16LE) sub-path (%hhd,%hhd): "%s" 23 | B <10332 (b UTF-16LE) Could not get file info: %r 24 | B <10372 (b UTF-16LE) Couldn't open "%s": %r 25 | B <104b2 (b UTF-16LE) Could not create variable: %r 26 | B 10572 (b UTF-16LE) nbootorder: %d 27 | B <105b2 (b UTF-16LE) BootOrder: 28 | B >105b2 (b UTF-16LE) file DP: %s 29 | B 105f2 (b UTF-16LE) device path: "%s" 30 | B <10632 (b UTF-16LE) CSV data: "%s" 31 | B >10632 (b UTF-16LE) filename: "%s" 32 | B <10672 (b UTF-16LE) label: "%s" 33 | B >10672 (b UTF-16LE) arguments: "%s" 34 | B <106f2 (b UTF-16LE) Could not read file "%s": %r 35 | B <10732 (b UTF-16LE) File looks like: 36 | B 107b2 (b UTF-16LE) t get info for "%s": %r 37 | B <107f2 (b UTF-16LE) Could not read \EFI\%s\: %r 38 | B <10872 (b UTF-16LE) Couldn't open \EFI\%s\%s: %r 39 | B <108b2 (b UTF-16LE) Could not process \EFI\%s\%s: %r 40 | B <108f2 (b UTF-16LE) Couldn't find file system: %r 41 | B <10932 (b UTF-16LE) Couldn't open file system: %r 42 | B <10972 (b UTF-16LE) Couldn't open EFI: %r 43 | B <109b2 (b UTF-16LE) Couldn't set file position: %r 44 | B <109f2 (b UTF-16LE) Could not read \EFI\: %r 45 | B <10a72 (b UTF-16LE) %d Couldn't open %s: %r 46 | B <10ab2 (b UTF-16LE) LoadImage failed: %r 47 | B >10ab2 (b UTF-16LE) Device path: "%s" 48 | B <10af2 (b UTF-16LE) StartImage failed: %r 49 | B 10bf2 (b UTF-16LE) find loaded image: %r 50 | B <10cb2 (b UTF-16LE) Error: could not find boot optio 51 | B >10cb2+ (b UTF-16LE) ns: %r 52 | B <10ef2 (b UTF-16LE) TPM logging failed: %r 53 | B 10fb2 (b UTF-16LE) w%a:%d %a() 54 | B <11072 (b UTF-16LE) Failed to read the keystroke: %r 55 | B 110b2+ (b UTF-16LE) OK 56 | B <11472 (b UTF-16LE) %s: (0x%x) %s 57 | B <114b2 (b UTF-16LE) SSL Error: %a:%d %a(): %r 58 | B 117b2 (b UTF-16LE) x:%x:%x:%x:%x:%x:%x 59 | B <11fb2 (b UTF-16LE) %02d/%02d/%02d %02d:%02d%c 60 | B 12132 (b UTF-16LE) %*a%X: %-.48a *%a* 61 | B <12172 (b UTF-16LE) Press Enter to continue : 62 | B 125f2 (a UTF-8) %lu:%s:%s:%d:%s 63 | B 12632 (a UTF-8) assertion failed: *(unsigned int 64 | B >12632+ (a UTF-8) *)lock == 1 65 | B <12832 (a UTF-8) assertion failed: *sbuffer != NU 66 | B >12832+ (a UTF-8) LL || bu 67 | B 12872+ (a UTF-8) ffer != NULL 68 | B >12872 (a UTF-8) assertion failed: *currlen <= *m 69 | B >12872+ (a 
UTF-8) axlen 70 | B <128b2 (a UTF-8) assertion failed: *sbuffer != NU 71 | B >128b2+ (a UTF-8) LL 72 | B >128b2 (a UTF-8) %s:%d: OpenSSL internal error: % 73 | B 128f2+ (a UTF-8) s 74 | B >128f2 (a UTF-8) error:%08lX:%s:%s:%s 75 | B <1d7f2 (a UTF-8) assertion failed: ctx->digest->m 76 | B >1d7f2+ (a UTF-8) d_size <= EVP_MAX_MD_SIZE 77 | B <1d832 (a UTF-8) assertion failed: l <= sizeof(c- 78 | B 1d872+ (a UTF-8) >iv) 79 | B >1d872 (a UTF-8) assertion failed: j <= sizeof(c- 80 | B >1d872+ (a UTF-8) >iv) 81 | B 1d8b2 (a UTF-8) assertion failed: EVP_CIPHER_key 82 | B >1d8b2+ (a UTF-8) _length(cipher) <= (int) 83 | B 1d8f2+ (a UTF-8) sizeof(md_tmp) 84 | B >1d8f2 (a UTF-8) assertion failed: EVP_CIPHER_iv_ 85 | B >1d8f2+ (a UTF-8) length(cipher) < 86 | B 1d932+ (a UTF-8) = 16 87 | B >1d932 (a UTF-8) assertion failed: keylen <= size 88 | B >1d932+ (a UTF-8) of key 89 | B 1d972 (a UTF-8) assertion failed: j <= (int)size 90 | B >1d972+ (a UTF-8) of(ctx-> 91 | B 1d9b2+ (a UTF-8) key) 92 | B <1d9f2 (a UTF-8) assertion failed: chunk >= 0 93 | B <1deb2 (a UTF-8) assertion failed: i != 0 94 | B <1e0f2 (a UTF-8) :BAD OBJECT 95 | B >1e0f2 (a UTF-8) :BAD BOOLEAN 96 | B >1e0f2 (a UTF-8) [HEX DUMP]: 97 | B <1e132 (a UTF-8) :BAD INTEGER 98 | B >1e132 (a UTF-8) :BAD ENUMERATED 99 | B 1e7f2 (a UTF-8) assertion failed: vv == NULL 100 | B <1ecf2 (a UTF-8) assertion failed: ctx->cipher->b 101 | B >1ecf2+ (a UTF-8) lock_size == 1 || ctx->cipher->b 102 | B >1ecf2+ (a UTF-8) lock_size == 8 | 103 | B 1ed32+ (a UTF-8) | ctx->cipher->block_size == 16 104 | B >1ed32 (a UTF-8) assertion failed: EVP_CIPHER_CTX 105 | B 1ed72+ (a UTF-8) _iv_length(ctx) <= (int)sizeof(c 106 | B >1ed72+ (a UTF-8) tx->iv) 107 | B 1edb2 (a UTF-8) assertion failed: bl <= (int)siz 108 | B >1edb2+ (a UTF-8) eof(ctx->buf) 109 | B <1edf2 (a UTF-8) assertion failed: b <= sizeof ct 110 | B >1edf2+ (a UTF-8) x->buf 111 | B >1edf2 (a UTF-8) assertion failed: b <= sizeof ct 112 | B >1edf2+ (a UTF-8) x->final 113 | B 1f232 (a UTF-8) assertion failed: num == 1 && ct 114 | B >1f232+ (a UTF-8) x->num_untrusted == num 115 | B <1f272 (a UTF-8) assertion failed: num > i && i > 116 | B >1f272+ (a UTF-8) 0 && ss == 0 117 | B <1f2b2 (a UTF-8) assertion failed: ctx->num_untru 118 | B >1f2b2+ (a UTF-8) sted <= num 119 | B >1f2b2 (a UTF-8) assertion failed: num == ctx->nu 120 | B >1f2b2+ (a UTF-8) m_untrus 121 | B 1f2f2+ (a UTF-8) ted 122 | B <1f5f2 (a UTF-8) assertion failed: pp == NULL || 123 | B >1f5f2+ (a UTF-8) *pp != NULL 124 | B <1f772 (a UTF-8) %*sPolicy: 125 | B >1f772 (a UTF-8) %*sCPS: %s 126 | B >1f772 (a UTF-8) %*sUser Notice: 127 | B <1f7b2 (a UTF-8) %*sUnknown Qualifier: 128 | B >1f7b2 (a UTF-8) %*sOrganization: %s 129 | B >1f7b2 (a UTF-8) %*sNumber%s: 130 | B <1f7f2 (a UTF-8) %*sExplicit Text: %s 131 | B <1fa32 (a UTF-8) %*sFull Name: 132 | B <1fa72 (a UTF-8) %*sRelative Name: 133 | B 1faf2 (a UTF-8) %*sCRL Issuer: 134 | B <1fd72 (a UTF-8) IP Address: 135 | B <1fdb2 (a UTF-8) %*sPath Length Constraint: 136 | B >1fdb2 (a UTF-8) %*sPolicy Language: 137 | B <1fdf2 (a UTF-8) %*sPolicy Text: %s 138 | B <1ff32 (a UTF-8) Not Before: 139 | B >1ff32 (a UTF-8) Not After: 140 | B <200f2 (a UTF-8) %*sVersion: %ld (0x%lX) 141 | B <20132 (a UTF-8) %*sZone: %s, User: 142 | B <201f2 (a UTF-8) %s %2d %02d:%02d:%02d%.*s %d%s 143 | B <202b2 (a UTF-8) '()+,-./:=? 
144 | B <206f2 (a UTF-8) %s %2d %02d:%02d:%02d %d%s 145 | B <20a32 (a UTF-8) :EXTERNAL TYPE %s 146 | B <20a72 (a UTF-8) ERROR: selector [%d] invalid 147 | B <21fb2 (a UTF-8) :':+:1:K:Q:[:c:g:m:y: 148 | B <229b2 (a UTF-8) assertion failed: bits > prime_m 149 | B >229b2+ (a UTF-8) ultiplier_bits 150 | B <22a32 (a UTF-8) %s: (%d bit) 151 | B >22a32 (a UTF-8) private-key: 152 | B >22a32 (a UTF-8) public-key: 153 | B >22a32 (a UTF-8) generator: 154 | B >22a32 (a UTF-8) subgroup order: 155 | B <22a72 (a UTF-8) subgroup factor: 156 | B <22ab2 (a UTF-8) recommended-private-length: %d b 157 | B >22ab2+ (a UTF-8) its 158 | B 22bb2 (a UTF-8) Content-Type: application/ocsp-r 159 | B >22bb2+ (a UTF-8) equest 160 | B >22bb2 (a UTF-8) Content-Length: %d 161 | B 22bf2 (a UTF-8) %*scrlUrl: 162 | B >22bf2 (a UTF-8) %*scrlNum: 163 | B >22bf2 (a UTF-8) %*scrlTime: 164 | B <22c32 (a UTF-8) %*sIssuer: 165 | B >22c32 (a UTF-8) Private-Key: (%d bit) 166 | B <22c72 (a UTF-8) publicExponent: 167 | B >22c72 (a UTF-8) Public-Key: (%d bit) 168 | B >22c72 (a UTF-8) privateExponent: 169 | B <22cb2 (a UTF-8) exponent1: 170 | B >22cb2 (a UTF-8) exponent2: 171 | B >22cb2 (a UTF-8) coefficient: 172 | B <22cf2 (a UTF-8) Hash Algorithm: 173 | B >22cf2 (a UTF-8) Mask Algorithm: 174 | B <22d32 (a UTF-8) Salt Length: 0x 175 | B <22d72 (a UTF-8) Trailer Field: 0x 176 | B <22e32 (a UTF-8) Certificate: 177 | B <22e72 (a UTF-8) %8sVersion: %ld (0x%lx) 178 | B >22e72 (a UTF-8) %8sVersion: Unknown (%ld) 179 | B >22e72 (a UTF-8) Serial Number: 180 | B <22eb2 (a UTF-8) Issuer:%c 181 | B 22ef2 (a UTF-8) Not Before: 182 | B <22f32 (a UTF-8) Not After : 183 | B >22f32 (a UTF-8) Subject:%c 184 | B >22f32 (a UTF-8) Subject Public Key Info: 185 | B <22f72 (a UTF-8) %12sPublic Key Algorithm: 186 | B <22fb2 (a UTF-8) %8sIssuer Unique ID: 187 | B >22fb2 (a UTF-8) %8sSubject Unique ID: 188 | B <22ff2 (a UTF-8) Subject OCSP hash: 189 | B >22ff2 (a UTF-8) Public key OCSP hash: 190 | B <23032 (a UTF-8) Signature Algorithm: 191 | B >23032 (a UTF-8) %*sTrusted Uses: 192 | B <23072 (a UTF-8) %*sRejected Uses: 193 | B <230b2 (a UTF-8) %*sAlias: %s 194 | B >230b2 (a UTF-8) %*sKey Id: 195 | B <231f2 (a UTF-8) othername: 196 | B >231f2 (a UTF-8) X400Name: 197 | B <23232 (a UTF-8) EdiPartyName: 198 | B >23232 (a UTF-8) IP Address:%d.%d.%d.%d 199 | B <23272 (a UTF-8) IP Address: 200 | B >23272 (a UTF-8) Registered ID: 201 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 
23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
203 | 204 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | //! `stringsext` searches for multi-byte encoded strings in binary data.\ 2 | 3 | //! `stringsext` is a Unicode enhancement of the GNU strings tool with 4 | //! additional functionalities: stringsext recognizes Cyrillic, CJKV characters 5 | //! and other scripts in all supported multi-byte-encodings, while GNU strings 6 | //! fails in finding any of these scripts in UTF-16 and many other encodings.\ 7 | 8 | //! The role of the main module is to launch the processing of the input stream in 9 | //! batches with threads. It also receives, merges, sorts and prints the results. 10 | 11 | //! # Operating principle 12 | 13 | //! 1. The iterator `input::Slicer` concatenates the input-files and cuts 14 | //! the input stream into slices called `main::slice`. 15 | //! 16 | //! 2. In `main::run()` these slices are fed in parallel to threads, where each has 17 | //! its own `Mission` configuration. 18 | //! 19 | //! 3. Each thread runs a search in `main::slice` == `scanner::input_buffer`. The 20 | //! search is performed by `scanner::FindingCollection::scan()`, which cuts 21 | //! the `scanner::input_buffer` into smaller chunks of size 22 | //! 2*`output_line_char_nb_max` bytes hereafter called `input_window`. 23 | //! 24 | //! 4. The `Decoder` runs through the `input_window`, searches for valid strings and 25 | //! decodes them into UTF-8-chunks. 26 | //! 27 | //! 5. Each UTF-8-chunk is then fed into the filter `helper::SplitStr` to be 28 | //! analyzed for parts that satisfy certain filter conditions. 29 | //! 30 | //! 6. Doing so, the `helper::SplitStr` cuts the UTF-8-chunk into even smaller 31 | //! `SplitStr`-chunks not longer than `output_line_char_nb_max` and sends 32 | //! them back to the `scanner::FindingCollection::scan()` loop. 33 | //! 34 | //! 7. There the `SplitStr`-chunk is packed into a `finding::Finding` object and 35 | //! then successively added to a `finding::FindingCollection`. 36 | //! 37 | //! 8. After finishing its run through the `input_window`, the search continues with 38 | //! the next `input_window`. Goto 5. 39 | //! 40 | //! 9. When all `input_window` s are processed, `scanner::FindingCollection::scan()` 41 | //! returns the `finding::FindingCollection` to `main::run()` and exits. 42 | //! 43 | //! 10. `main::run()` waits for all threads to return their 44 | //! `finding::FindingCollection` s. Then, all `Finding` s are merged, 45 | //! sorted and finally printed out by `finding::print()` (see the sketch below). 46 | //! 47 | //! 11. While the printing is still running, the next `main::slice` == 48 | //! `scanner::input_buffer` is sent to all threads for the next search. 49 | //! Goto 3. 50 | //! 51 | //! 12. `main::run()` exits when all `main::slice` s are processed. 
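// Illustrative sketch of step 10 above (not part of the crate): `itertools::kmerge()`
// interleaves several already-sorted iterators into one sorted stream by comparing
// items via `PartialOrd`, which is why `Finding` implements `PartialOrd` in
// `finding.rs`. Plain `u64` values stand in for `Finding::position` here.
#[allow(dead_code)]
fn kmerge_sketch() {
    use itertools::kmerge;
    // Two "scanner threads" deliver their findings, each vector already sorted:
    let per_thread = vec![vec![0x500_u64, 0x540, 0x580], vec![0x500, 0x640]];
    // `kmerge` yields one stream, sorted over all inputs:
    let merged: Vec<u64> = kmerge(per_thread).collect();
    assert_eq!(merged, vec![0x500, 0x500, 0x540, 0x580, 0x640]);
}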
52 | 53 | extern crate encoding_rs; 54 | 55 | mod finding; 56 | mod finding_collection; 57 | mod help; 58 | mod helper; 59 | mod input; 60 | mod mission; 61 | mod options; 62 | mod scanner; 63 | 64 | use crate::finding::OUTPUT_LINE_METADATA_LEN; 65 | use crate::finding_collection::FindingCollection; 66 | use crate::help::help; 67 | use crate::input::Slicer; 68 | use crate::mission::MISSIONS; 69 | use crate::options::ARGS; 70 | use crate::scanner::ScannerStates; 71 | use itertools::kmerge; 72 | use scoped_threadpool::Pool; 73 | use std::fs::File; 74 | use std::io; 75 | use std::io::LineWriter; 76 | use std::io::Write; 77 | use std::path::Path; 78 | use std::pin::Pin; 79 | use std::process; 80 | use std::str; 81 | use std::sync::mpsc; 82 | use std::thread; 83 | use std::thread::JoinHandle; 84 | 85 | /// Uses the version-number defined in `../Cargo.toml`. 86 | const VERSION: Option<&'static str> = option_env!("CARGO_PKG_VERSION"); 87 | /// (c) Jens Getreu 88 | const AUTHOR: &str = "(c) Jens Getreu, 2016-2024"; 89 | 90 | /// Processes the input stream in batches with threads. Then receives, merges, sorts and 91 | /// prints the results. 92 | 93 | fn run() -> Result<(), anyhow::Error> { 94 | let merger: JoinHandle<_>; 95 | // Scope for threads 96 | { 97 | let n_threads = MISSIONS.len(); 98 | let (tx, rx) = mpsc::sync_channel(n_threads); 99 | // 100 | // Receiver thread: 101 | 102 | // Receive `FindingCollection`s from scanner threads. 103 | merger = thread::spawn(move || { 104 | // Set up output channel. 105 | let mut output = match ARGS.output { 106 | Some(ref fname) => { 107 | let f = File::create(Path::new(fname))?; 108 | // There is at least one `Mission` in `MISSIONS`. 109 | let output_line_len = 110 | 2 * MISSIONS[0].output_line_char_nb_max + OUTPUT_LINE_METADATA_LEN; 111 | let f = LineWriter::with_capacity(output_line_len, f); 112 | Box::new(f) as Box<dyn Write> 113 | } 114 | None => Box::new(io::stdout()) as Box<dyn Write>, 115 | }; 116 | output.write_all("\u{feff}".as_bytes())?; 117 | 118 | 'batch_receiver: loop { 119 | // collect 120 | let mut results: Vec<Pin<Box<FindingCollection>>> = Vec::with_capacity(n_threads); 121 | for _ in 0..n_threads { 122 | results.push(match rx.recv() { 123 | // It would be safe to unpin here, as only read operations on data follow: 124 | // Ok(fc) => unsafe { *Pin::into_inner_unchecked(fc) }, 125 | // Instead, we implement `IntoIterator` for the pinned `Pin<Box<FindingCollection>>` type, 126 | // allowing us to `kmerge` a vector of type `Vec<Pin<Box<FindingCollection>>>`. 127 | // In this way no unsafe is needed. 128 | Ok(fc) => fc, 129 | _ => break 'batch_receiver, 130 | }); 131 | } 132 | // merge 133 | for finding in kmerge(&results) { 134 | finding.print(&mut output)?; 135 | } 136 | } 137 | //println!("Merger terminated."); 138 | output.write_all(&[b'\n'])?; 139 | output.flush()?; 140 | Ok(()) 141 | }); 142 | 143 | // 144 | // Sender threads: 145 | 146 | // Setting up the data slice producer. 147 | let input = Slicer::new(); 148 | 149 | // We set up the processor. 150 | let mut sss = ScannerStates::new(&MISSIONS); 151 | let mut pool = Pool::new(MISSIONS.len() as u32); 152 | 153 | for (slice, input_file_id, is_last_input_buffer) in input { 154 | pool.scoped(|scope| { 155 | for ss in sss.v.iter_mut() { 156 | let tx = tx.clone(); 157 | scope.execute(move || { 158 | let fc = 159 | FindingCollection::from(ss, input_file_id, slice, is_last_input_buffer); 160 | // Send the result to the receiver thread. 161 | tx.send(fc).expect( 162 | "Error: Can not send result through output channel. \ 163 | Write permissions? Is there enough space? 
", 164 | ); 165 | }); 166 | } 167 | }); 168 | } 169 | } // `tx` drops here, which breaks the `batch_receiver`-loop. 170 | 171 | // If everything goes well, we get `()` here. 172 | merger.join().unwrap() 173 | 174 | // All threads terminated. 175 | } 176 | 177 | /// Application entry point. 178 | fn main() { 179 | help(); 180 | 181 | if let Err(e) = run() { 182 | eprintln!("Error: `{:?}`.", e); 183 | process::exit(1); 184 | } 185 | } 186 | 187 | #[cfg(test)] 188 | mod tests { 189 | use crate::finding::Precision; 190 | use crate::finding_collection::FindingCollection; 191 | use crate::mission::Missions; 192 | use crate::options::{Args, Radix}; 193 | use crate::scanner::ScannerState; 194 | use itertools::Itertools; 195 | use lazy_static::lazy_static; 196 | use std::path::PathBuf; 197 | 198 | lazy_static! { 199 | pub static ref ARGS: Args = Args { 200 | inputs: vec![PathBuf::from("myfile.txt")], 201 | debug_option: false, 202 | encoding: vec!["ascii".to_string(), "utf-8".to_string()], 203 | list_encodings: false, 204 | version: false, 205 | chars_min: Some("5".to_string()), 206 | same_unicode_block: true, 207 | grep_char: None, 208 | radix: Some(Radix::X), 209 | output: None, 210 | output_line_len: Some("30".to_string()), 211 | no_metadata: false, 212 | counter_offset: Some("5000".to_string()), 213 | ascii_filter: None, 214 | unicode_block_filter: None, 215 | }; 216 | } 217 | 218 | lazy_static! { 219 | pub static ref MISSIONS: Missions = Missions::new( 220 | ARGS.counter_offset.as_ref(), 221 | &ARGS.encoding, 222 | ARGS.chars_min.as_ref(), 223 | ARGS.same_unicode_block, 224 | ARGS.ascii_filter.as_ref(), 225 | ARGS.unicode_block_filter.as_ref(), 226 | ARGS.grep_char.as_ref(), 227 | ARGS.output_line_len.as_ref(), 228 | ) 229 | .unwrap(); 230 | } 231 | 232 | /// Tests the concurrent scanning with 2 threads, while one thread merges and prints. 233 | #[test] 234 | fn test_merger() { 235 | use std::pin::Pin; 236 | 237 | let inp = "abcdefgÜhijklmn€opÜqrstuvwÜxyz".as_bytes(); 238 | 239 | let missions = &MISSIONS; 240 | //println!("{:#?}", *MISSIONS); 241 | 242 | let mut ss0 = ScannerState::new(&missions.v[0]); 243 | let mut ss1 = ScannerState::new(&missions.v[1]); 244 | 245 | let mut resv: Vec>> = Vec::new(); 246 | let fc = FindingCollection::from(&mut ss0, Some(0), inp, true); 247 | resv.push(fc); 248 | let fc = FindingCollection::from(&mut ss1, Some(0), inp, true); 249 | resv.push(fc); 250 | 251 | //println!("{:#?}", resv); 252 | 253 | assert_eq!(resv.len(), 2); 254 | assert_eq!(resv[0].v.len(), 3); 255 | assert_eq!(resv[0].v[0].s, "abcdefg"); 256 | assert_eq!(resv[0].v[1].s, "hijklmn"); 257 | assert_eq!(resv[0].v[2].s, "qrstuvw"); 258 | assert_eq!(resv[1].v.len(), 2); 259 | assert_eq!(resv[1].v[0].s, "abcdefgÜhijklmn"); 260 | assert_eq!(resv[1].v[1].s, "opÜqrstuvwÜxyz"); 261 | 262 | // Merge the results. 263 | 264 | let mut iter = resv.iter().kmerge(); 265 | // for res in iter { 266 | // println!("Result {:#?}", res); 267 | // }; 268 | 269 | // After merging and sorting the order is deterministic. 270 | // See implementation of `PartialOrd` for `Finding` for more 271 | // details. 
272 | 273 | let f = iter.next().unwrap(); 274 | assert_eq!(f.s, "abcdefg"); 275 | assert_eq!(f.position, 5000); 276 | assert_eq!(f.position_precision, Precision::Exact); 277 | assert_eq!(f.mission.mission_id, 0); 278 | 279 | let f = iter.next().unwrap(); 280 | assert_eq!(f.s, "hijklmn"); 281 | assert_eq!(f.position, 5000); 282 | assert_eq!(f.position_precision, Precision::After); 283 | assert_eq!(f.mission.mission_id, 0); 284 | 285 | let f = iter.next().unwrap(); 286 | assert_eq!(f.s, "qrstuvw"); 287 | assert_eq!(f.position, 5000); 288 | assert_eq!(f.position_precision, Precision::After); 289 | assert_eq!(f.mission.mission_id, 0); 290 | 291 | let f = iter.next().unwrap(); 292 | assert_eq!(f.s, "abcdefgÜhijklmn"); 293 | assert_eq!(f.position, 5000); 294 | assert_eq!(f.position_precision, Precision::Exact); 295 | assert_eq!(f.mission.mission_id, 1); 296 | 297 | let f = iter.next().unwrap(); 298 | assert_eq!(f.s, "opÜqrstuvwÜxyz"); 299 | assert_eq!(f.position, 5000); 300 | assert_eq!(f.position_precision, Precision::After); 301 | assert_eq!(f.mission.mission_id, 1); 302 | 303 | let f = iter.next(); 304 | assert_eq!(f, None); 305 | } 306 | } 307 | -------------------------------------------------------------------------------- /tests/functional/input1: -------------------------------------------------------------------------------- 1 | Who Moved My Cheese? 2 | An A-Mazing Way To Deal With Change In Your Work 3 | And In Your Life 4 | Who Moved My Cheese? is a simple parable that reveals profound truths 5 | about change. It is an amusing and enlightening story of four characters who 6 | live in a 'Maze' and look for 'Cheese' to nourish them and make them happy. 7 | Two are mice named Sniff and Scurry. And two are little people' - beings the 8 | size of mice who look and act a lot like people. Their names are Hem and 9 | Haw. 'Cheese' is a metaphor for what you want to have in life - whether it's a 10 | good job, a loving relationship, money, a possession, good health, or spiritual 11 | peace of mind. And 'The Maze' is where you look for what you want - the 12 | organization you work in, or the family or community you live in. 13 | In the story, the characters are faced with unexpected change. Eventually, 14 | one of them deals with it successfully, and writes what he has learned from 15 | his experience on the maze walls. 16 | When you come to see 'The Handwriting on the Wall', you can discover for 17 | yourself how to deal with change, so that you can enjoy less stress and more 18 | success (however you define it) in your work and in your life. 19 | Written for all ages, this story takes less than an hour to read, but its unique 20 | insights can last for a lifetime. 21 | 22 | Who Moved My Cheese? 23 | Contents 24 | Parts of All of Us 25 | A Gathering: Chicago 26 | Who Moved My Cheese?: The Story 27 | Four Characters 28 | Finding Cheese 29 | No Cheese! 30 | The Mice: Sniff & Scurry 31 | The Little people: Hem & Haw 32 | Meanwhile, Back In the Maze 33 | Getting Beyond Fear 34 | Enjoying The Adventure 35 | Moving With The Cheese 36 | The Handwriting On The Wall 37 | Tasting New Cheese 38 | Enjoying Change! 39 | A Discussion: Later That Same Day 40 | New Cheese ! 
41 | 42 | Parts of All of Us 43 | The Simple and The Complex 44 | The four imaginary characters 45 | depicted in this story — 46 | the mice: "Sniff" and "Scurry;' and 47 | the Little people: "Hem" and "Haw" — 48 | are intended to represent the simple and 49 | the complex parts of ourselves, regardless of 50 | our age, gender, race or nationality. 51 | Sometimes we may act like 52 | Sniff 53 | Who sniffs out change early, or 54 | Scurry 55 | Who scurries into action, or 56 | Hem 57 | Who denies and resists change as he fears 58 | it will lead to something worse, or 59 | Haw 60 | Who learns to adapt in time when he sees 61 | changing can lead to something better! 62 | Whatever parts of us we choose to use, 63 | we all share something in common: 64 | a need to find our way in the Maze 65 | and succeed in changing times. 66 | 67 | A Gathering 68 | Chicago 69 | One sunny Sunday in Chicago, several former classmates, who were good 70 | friends in school, gathered for lunch, having attended their high school 71 | reunion the night before. They wanted to hear more about what was 72 | happening in each other's lives. After a good deal of kidding, and a good 73 | meal, they settled into an interesting conversation. 74 | Angela, who had been one of the most popular people in the class, said, "Life 75 | sure turned out differently than I thought it would when we were in school. A 76 | lot has changed." 77 | "It certainly has," Nathan echoed. They knew he had gone into his family's 78 | business, which had operated pretty much the same and had been a part of 79 | the local community for as long as they could remember. So, they were 80 | surprised when he seemed concerned. He asked, "But, have you noticed 81 | how we don't want to change when things change?" 82 | Carlos said, "I guess we resist changing, because we're afraid of change." 83 | "Carlos, you were Captain of the football team" Jessica said. "I never thought 84 | I'd hear you say anything about being afraid!" 85 | They all laughed as they realized that although they had gone off in different 86 | directions—from working at home to managing companies—they were 87 | experiencing similar feelings. 88 | Everyone was trying to cope with the unexpected changes that were 89 | happening to them in recent years. And most admitted that they did not know 90 | a good way to handle them. 91 | Then Michael said, "I used to be afraid of change. When a big change came 92 | along in our business, we didn't know what to do. So we didn't adjust and we 93 | almost lost it. 94 | "That is," he continued, "until I heard a funny little story that changed 95 | everything." 96 | "How so?" Nathan asked. 97 | "Well, the story changed the way I looked at change—from losing something 98 | to gaining some-thing—and it showed me how to do it. After that, things 99 | quickly improved—at work and in my life. 100 | "At first I was annoyed with the obvious simplicity of the story because it 101 | sounded like something we might have been told in school. 102 | "Then I realized I was really annoyed with myself for not seeing the obvious 103 | and doing what works when things change. 104 | 105 | "When I realized the four characters in the story represented the various parts 106 | of myself, I decided who I wanted to act like and I changed. 107 | "Later, I passed the story on to some people in our company and they passed 108 | it on to others, and soon our business did much better, because most of us 109 | adapted to change better. 
And like me, many people said it helped them in 110 | their personal lives. 111 | "However there were a few people who said they got nothing out of it. They 112 | either knew the lessons and were already living them, or, more commonly, 113 | they thought they already knew everything and didn't want to learn. They 114 | couldn't see why so many others were benefiting from it. 115 | "When one of our senior executives, who was having difficulty adapting, said 116 | the story was a waste of time, other people kidded him saying they knew 117 | which character he was in the story—meaning the one who learned nothing 118 | new and did not change.'" 119 | "What's the story?" Angela asked. 120 | "It's called. Who Moved My Cheese?" 121 | The group laughed. "I think I like it already," Carlos said. "Would you tell us 122 | the story? Maybe we can get something from it." 123 | "Sure," Michael replied. "I'd be happy to—it doesn't take long." And so he 124 | began: 125 | 126 | Who Moved My Cheese? The Story 127 | UNCE, long ago in a land far away, there lived four little characters who ran 128 | through a Maze looking for cheese to nourish them and make them happy. 129 | Two were mice, named "Sniff" and "Scurry" and two were Little people— 130 | beings who were as small as mice but who looked and acted a lot like people 131 | today. Their names were "Hem" and "Haw." 132 | Due to their small size, it would be easy not to notice what the four of them 133 | were doing. But if you looked closely enough, you could discover the most 134 | amazing things! 135 | Every day the mice and the Little people spent time in the Maze looking for 136 | their own special cheese. 137 | The mice. Sniff and Scurry, possessing simple brains and good instincts, 138 | searched for the hard nibbling cheese they liked, as mice often do. 139 | The two Little people, Hem and Haw, used their complex brains, filled with 140 | many beliefs and emotions, to search for a very different kind of Cheese— 141 | with a capital C—which they believed would make them feel happy and 142 | successful. 143 | As different as the mice and Little people were, they shared something in 144 | common: every morning, they each put on their jogging suits and running 145 | shoes, left their little homes, and raced out into the Maze looking for their 146 | favourite cheese. 147 | The Maze was a labyrinth of corridors and chambers, some containing 148 | delicious cheese. But there were also dark corners and blind alleys leading 149 | nowhere. It was an easy place for anyone to get lost. 150 | However, for those who found their way, the Maze held secrets that let them 151 | enjoy a better life. The mice, Sniff and Scurry, used the simple trial-and-error 152 | method of finding cheese. They ran down one corridor, and if it proved empty, 153 | they turned and ran down another. They remembered the corridors that held 154 | no cheese and quickly went into new areas. 155 | Sniff would smell out the general direction of the cheese, using his great nose, 156 | and Scurry would race ahead. They got lost, as you might expect, went off in 157 | the wrong direction and often bumped into walls. 158 | But after a while, they found their way. 159 | Like the mice, the two Little people, Hem and Haw, also used their ability to 160 | think and learn from their past experiences. However, they relied on their 161 | complex brains to develop more sophisticated methods of finding Cheese. 
162 | Sometimes they did well, but at other times their powerful human beliefs and 163 | 164 | emotions took over and clouded the way they looked at things. It made life in 165 | the Maze more complicated and challenging. 166 | Nonetheless, Sniff, Scurry, Hem and Haw all discovered, in their own way, 167 | what they were looking for. They each found their own kind of cheese one 168 | day at the end of one of the corridors in cheese Station C. 169 | Every morning after that, the mice and the Little people dressed in their 170 | running gear and headed over to Cheese Station C. It wasn't long before they 171 | each established their own routine. 172 | Sniff and Scurry continued to wake early every day and race through the 173 | Maze, always following the same route. 174 | When they arrived at their destination, the mice took off their running shoes, 175 | tied them together and hung them around their necks—so they could get to 176 | them quickly whenever they needed them again. 177 | Then they enjoyed the cheese. 178 | In the beginning Hem and Haw also raced toward Cheese Station C every 179 | morning to enjoy the tasty new morsels that awaited them. But after a while, a 180 | different routine set in for the Little people. 181 | Hem and Haw awoke each day a little later, dressed a little slower, and 182 | walked to Cheese Station C. After all, they knew where the Cheese was now 183 | and how to get there. 184 | They had no idea where the Cheese came from, or who put it there. They just 185 | assumed it would be there. 186 | As soon as Hem and Haw arrived at Cheese Station C each morning, they 187 | settled in and made themselves at home. They hung up their jogging suits, 188 | put away their running shoes and put on their slippers. They were becoming 189 | very comfortable now that they had found the Cheese. 190 | "This is great" Hem said. "There's enough Cheese here to last us forever." 191 | The Little people felt happy and successful, and thought they were now 192 | secure. 193 | It wasn't long before Hem and Haw regarded the Cheese they found at 194 | Cheese Station C as their cheese. It was such a large store of Cheese that 195 | they eventually moved their homes to be closer to it, and built a social life 196 | around it. 197 | To make themselves feel more at home, Hem and Haw decorated the walls 198 | with sayings and even drew pictures of Cheese around them which made 199 | them smile. One read: 200 | 201 | Sometimes Hem and Haw would take their friends by to see their pile of 202 | Cheese at Cheese Station C, and point to it with pride, saying, "Pretty 203 | nice Cheese, hub?" Sometimes they shared it with their friends and 204 | sometimes they didn't. 205 | "We deserve this Cheese,"' Hem said. "We certainly had to work long and 206 | hard enough to find it." He picked up a nice fresh piece and ate it. Afterward, 207 | Hem fell asleep, as he often did. 208 | Every night the Little people would waddle home, full of Cheese, and every 209 | morning they would confidently return for more. This went on for quite some 210 | time. 211 | After a while Hem's and Haw's confidence grew into the arrogance of 212 | success. Soon they became so comfortable they didn't even notice what was 213 | happening. 214 | As time went on. Sniff and Scurry continued their routine. They arrived early 215 | each morning and sniffed and scratched and scurried around Cheese Station 216 | C, inspecting the area to see if there had been any changes from the day 217 | before. 
Then they would sit down to nibble on the cheese. 218 | One morning they arrived at Cheese Station C and discovered there was no 219 | cheese. 220 | They weren't surprised. Since Sniff and Scurry had noticed the supply of 221 | cheese had been getting smaller every day, they were prepared for the 222 | inevitable and knew instinctively what to do. 223 | They looked at each other, removed the running shoes they had tied together 224 | and hung conveniently around their necks, put them on their feet and laced 225 | them up. 226 | The mice did not over analyze things. To the mice, the problem and the 227 | 228 | 229 | Services 230 | Keynote Presentations, Change Leader Certification and Learning Programs 231 | Products 232 | The 'New Cheese 'Experience 233 | An interactive program used successfully by many organizations around the 234 | world to help individuals and organizations change and win. 235 | Who Moved My Cheese? The Movie: 236 | A 13 minute animated movie on videocassette tells the Story of Who Moved 237 | My Cheese? through the adventures of Sniff, Scurry, Hem and Haw, as a way 238 | to introduce change in your organization in a fun and non-threatening way. 239 | Aft A-Mawng Change Profile: 240 | A self-scoring tool. Find out who you are, what personalities are at work 241 | around you, and how you can work together to change and win! 242 | NEW! Who Moved My Cheese? Personal Planner Inserts / Binder: 243 | This daily planner will help you keep track of your most important 'Cheese' 244 | things to do, notes and contact information using Cheese language in a fun 245 | way. 246 | Fun, Practical Reminders 247 | Posters, Day-to-Day Desk Calendar, Coffee Mugs, Post-It Notes, Cheese 248 | Squeezes, Maze Pens, Logo Shirts, Handwriting on the Wall Cards, and 249 | more! 250 | To learn more, visit: 251 | www.WhoMovedMyCheese.com 252 | 253 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | version = 3 4 | 5 | [[package]] 6 | name = "ansi_term" 7 | version = "0.11.0" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b" 10 | dependencies = [ 11 | "winapi", 12 | ] 13 | 14 | [[package]] 15 | name = "anstream" 16 | version = "0.6.15" 17 | source = "registry+https://github.com/rust-lang/crates.io-index" 18 | checksum = "64e15c1ab1f89faffbf04a634d5e1962e9074f2741eef6d97f3c4e322426d526" 19 | dependencies = [ 20 | "anstyle", 21 | "anstyle-parse", 22 | "anstyle-query", 23 | "anstyle-wincon", 24 | "colorchoice", 25 | "is_terminal_polyfill", 26 | "utf8parse", 27 | ] 28 | 29 | [[package]] 30 | name = "anstyle" 31 | version = "1.0.8" 32 | source = "registry+https://github.com/rust-lang/crates.io-index" 33 | checksum = "1bec1de6f59aedf83baf9ff929c98f2ad654b97c9510f4e70cf6f661d49fd5b1" 34 | 35 | [[package]] 36 | name = "anstyle-parse" 37 | version = "0.2.5" 38 | source = "registry+https://github.com/rust-lang/crates.io-index" 39 | checksum = "eb47de1e80c2b463c735db5b217a0ddc39d612e7ac9e2e96a5aed1f57616c1cb" 40 | dependencies = [ 41 | "utf8parse", 42 | ] 43 | 44 | [[package]] 45 | name = "anstyle-query" 46 | version = "1.1.1" 47 | source = "registry+https://github.com/rust-lang/crates.io-index" 48 | checksum = "6d36fc52c7f6c869915e99412912f22093507da8d9e942ceaf66fe4b7c14422a" 49 | dependencies = [ 50 | "windows-sys", 51 | ] 52 | 53 | [[package]] 54 | name = "anstyle-wincon" 55 | version = "3.0.4" 56 | source = "registry+https://github.com/rust-lang/crates.io-index" 57 | checksum = "5bf74e1b6e971609db8ca7a9ce79fd5768ab6ae46441c572e46cf596f59e57f8" 58 | dependencies = [ 59 | "anstyle", 60 | "windows-sys", 61 | ] 62 | 63 | [[package]] 64 | name = "anyhow" 65 | version = "1.0.89" 66 | source = "registry+https://github.com/rust-lang/crates.io-index" 67 | checksum = "86fdf8605db99b54d3cd748a44c6d04df638eb5dafb219b135d0149bd0db01f6" 68 | 69 | [[package]] 70 | name = "atty" 71 | version = "0.2.14" 72 | source = "registry+https://github.com/rust-lang/crates.io-index" 73 | checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" 74 | dependencies = [ 75 | "hermit-abi", 76 | "libc", 77 | "winapi", 78 | ] 79 | 80 | [[package]] 81 | name = "bitflags" 82 | version = "1.3.2" 83 | source = "registry+https://github.com/rust-lang/crates.io-index" 84 | checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" 85 | 86 | [[package]] 87 | name = "cfg-if" 88 | version = "1.0.0" 89 | source = "registry+https://github.com/rust-lang/crates.io-index" 90 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 91 | 92 | [[package]] 93 | name = "clap" 94 | version = "2.33.3" 95 | source = "registry+https://github.com/rust-lang/crates.io-index" 96 | checksum = "37e58ac78573c40708d45522f0d80fa2f01cc4f9b4e2bf749807255454312002" 97 | dependencies = [ 98 | "ansi_term", 99 | "atty", 100 | "bitflags", 101 | "strsim 0.8.0", 102 | "textwrap", 103 | "unicode-width", 104 | "vec_map", 105 | ] 106 | 107 | [[package]] 108 | name = "clap" 109 | version = "4.5.19" 110 | source = "registry+https://github.com/rust-lang/crates.io-index" 111 | checksum = "7be5744db7978a28d9df86a214130d106a89ce49644cbc4e3f0c22c3fba30615" 112 | dependencies = [ 113 | "clap_builder", 114 | ] 115 | 116 | [[package]] 117 | name = "clap_builder" 118 | version = "4.5.19" 119 | source = "registry+https://github.com/rust-lang/crates.io-index" 120 | checksum = 
"a5fbc17d3ef8278f55b282b2a2e75ae6f6c7d4bb70ed3d0382375104bfafdb4b" 121 | dependencies = [ 122 | "anstream", 123 | "anstyle", 124 | "clap_lex", 125 | "strsim 0.11.1", 126 | ] 127 | 128 | [[package]] 129 | name = "clap_lex" 130 | version = "0.7.2" 131 | source = "registry+https://github.com/rust-lang/crates.io-index" 132 | checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97" 133 | 134 | [[package]] 135 | name = "colorchoice" 136 | version = "1.0.2" 137 | source = "registry+https://github.com/rust-lang/crates.io-index" 138 | checksum = "d3fd119d74b830634cea2a0f58bbd0d54540518a14397557951e79340abc28c0" 139 | 140 | [[package]] 141 | name = "either" 142 | version = "1.6.1" 143 | source = "registry+https://github.com/rust-lang/crates.io-index" 144 | checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" 145 | 146 | [[package]] 147 | name = "encoding_rs" 148 | version = "0.8.34" 149 | source = "registry+https://github.com/rust-lang/crates.io-index" 150 | checksum = "b45de904aa0b010bce2ab45264d0631681847fa7b6f2eaa7dab7619943bc4f59" 151 | dependencies = [ 152 | "cfg-if", 153 | ] 154 | 155 | [[package]] 156 | name = "heck" 157 | version = "0.3.3" 158 | source = "registry+https://github.com/rust-lang/crates.io-index" 159 | checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" 160 | dependencies = [ 161 | "unicode-segmentation", 162 | ] 163 | 164 | [[package]] 165 | name = "hermit-abi" 166 | version = "0.1.19" 167 | source = "registry+https://github.com/rust-lang/crates.io-index" 168 | checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" 169 | dependencies = [ 170 | "libc", 171 | ] 172 | 173 | [[package]] 174 | name = "is_terminal_polyfill" 175 | version = "1.70.1" 176 | source = "registry+https://github.com/rust-lang/crates.io-index" 177 | checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" 178 | 179 | [[package]] 180 | name = "itertools" 181 | version = "0.13.0" 182 | source = "registry+https://github.com/rust-lang/crates.io-index" 183 | checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" 184 | dependencies = [ 185 | "either", 186 | ] 187 | 188 | [[package]] 189 | name = "lazy_static" 190 | version = "1.5.0" 191 | source = "registry+https://github.com/rust-lang/crates.io-index" 192 | checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" 193 | 194 | [[package]] 195 | name = "libc" 196 | version = "0.2.107" 197 | source = "registry+https://github.com/rust-lang/crates.io-index" 198 | checksum = "fbe5e23404da5b4f555ef85ebed98fb4083e55a00c317800bc2a50ede9f3d219" 199 | 200 | [[package]] 201 | name = "pin-project" 202 | version = "1.1.5" 203 | source = "registry+https://github.com/rust-lang/crates.io-index" 204 | checksum = "b6bf43b791c5b9e34c3d182969b4abb522f9343702850a2e57f460d00d09b4b3" 205 | dependencies = [ 206 | "pin-project-internal", 207 | ] 208 | 209 | [[package]] 210 | name = "pin-project-internal" 211 | version = "1.1.5" 212 | source = "registry+https://github.com/rust-lang/crates.io-index" 213 | checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" 214 | dependencies = [ 215 | "proc-macro2", 216 | "quote", 217 | "syn 2.0.79", 218 | ] 219 | 220 | [[package]] 221 | name = "proc-macro-error" 222 | version = "1.0.4" 223 | source = "registry+https://github.com/rust-lang/crates.io-index" 224 | checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" 225 | dependencies = [ 226 | 
"proc-macro-error-attr", 227 | "proc-macro2", 228 | "quote", 229 | "syn 1.0.81", 230 | "version_check", 231 | ] 232 | 233 | [[package]] 234 | name = "proc-macro-error-attr" 235 | version = "1.0.4" 236 | source = "registry+https://github.com/rust-lang/crates.io-index" 237 | checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" 238 | dependencies = [ 239 | "proc-macro2", 240 | "quote", 241 | "version_check", 242 | ] 243 | 244 | [[package]] 245 | name = "proc-macro2" 246 | version = "1.0.86" 247 | source = "registry+https://github.com/rust-lang/crates.io-index" 248 | checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" 249 | dependencies = [ 250 | "unicode-ident", 251 | ] 252 | 253 | [[package]] 254 | name = "quote" 255 | version = "1.0.37" 256 | source = "registry+https://github.com/rust-lang/crates.io-index" 257 | checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" 258 | dependencies = [ 259 | "proc-macro2", 260 | ] 261 | 262 | [[package]] 263 | name = "scoped_threadpool" 264 | version = "0.1.9" 265 | source = "registry+https://github.com/rust-lang/crates.io-index" 266 | checksum = "1d51f5df5af43ab3f1360b429fa5e0152ac5ce8c0bd6485cae490332e96846a8" 267 | 268 | [[package]] 269 | name = "serde" 270 | version = "1.0.210" 271 | source = "registry+https://github.com/rust-lang/crates.io-index" 272 | checksum = "c8e3592472072e6e22e0a54d5904d9febf8508f65fb8552499a1abc7d1078c3a" 273 | dependencies = [ 274 | "serde_derive", 275 | ] 276 | 277 | [[package]] 278 | name = "serde_derive" 279 | version = "1.0.210" 280 | source = "registry+https://github.com/rust-lang/crates.io-index" 281 | checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f" 282 | dependencies = [ 283 | "proc-macro2", 284 | "quote", 285 | "syn 2.0.79", 286 | ] 287 | 288 | [[package]] 289 | name = "stringsext" 290 | version = "2.3.5" 291 | dependencies = [ 292 | "anyhow", 293 | "clap 4.5.19", 294 | "encoding_rs", 295 | "itertools", 296 | "lazy_static", 297 | "pin-project", 298 | "scoped_threadpool", 299 | "serde", 300 | "serde_derive", 301 | "structopt", 302 | ] 303 | 304 | [[package]] 305 | name = "strsim" 306 | version = "0.8.0" 307 | source = "registry+https://github.com/rust-lang/crates.io-index" 308 | checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" 309 | 310 | [[package]] 311 | name = "strsim" 312 | version = "0.11.1" 313 | source = "registry+https://github.com/rust-lang/crates.io-index" 314 | checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" 315 | 316 | [[package]] 317 | name = "structopt" 318 | version = "0.3.26" 319 | source = "registry+https://github.com/rust-lang/crates.io-index" 320 | checksum = "0c6b5c64445ba8094a6ab0c3cd2ad323e07171012d9c98b0b15651daf1787a10" 321 | dependencies = [ 322 | "clap 2.33.3", 323 | "lazy_static", 324 | "structopt-derive", 325 | ] 326 | 327 | [[package]] 328 | name = "structopt-derive" 329 | version = "0.4.18" 330 | source = "registry+https://github.com/rust-lang/crates.io-index" 331 | checksum = "dcb5ae327f9cc13b68763b5749770cb9e048a99bd9dfdfa58d0cf05d5f64afe0" 332 | dependencies = [ 333 | "heck", 334 | "proc-macro-error", 335 | "proc-macro2", 336 | "quote", 337 | "syn 1.0.81", 338 | ] 339 | 340 | [[package]] 341 | name = "syn" 342 | version = "1.0.81" 343 | source = "registry+https://github.com/rust-lang/crates.io-index" 344 | checksum = "f2afee18b8beb5a596ecb4a2dce128c719b4ba399d34126b9e4396e3f9860966" 345 | dependencies = [ 346 | "proc-macro2", 
347 | "quote", 348 | "unicode-xid", 349 | ] 350 | 351 | [[package]] 352 | name = "syn" 353 | version = "2.0.79" 354 | source = "registry+https://github.com/rust-lang/crates.io-index" 355 | checksum = "89132cd0bf050864e1d38dc3bbc07a0eb8e7530af26344d3d2bbbef83499f590" 356 | dependencies = [ 357 | "proc-macro2", 358 | "quote", 359 | "unicode-ident", 360 | ] 361 | 362 | [[package]] 363 | name = "textwrap" 364 | version = "0.11.0" 365 | source = "registry+https://github.com/rust-lang/crates.io-index" 366 | checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" 367 | dependencies = [ 368 | "unicode-width", 369 | ] 370 | 371 | [[package]] 372 | name = "unicode-ident" 373 | version = "1.0.13" 374 | source = "registry+https://github.com/rust-lang/crates.io-index" 375 | checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" 376 | 377 | [[package]] 378 | name = "unicode-segmentation" 379 | version = "1.8.0" 380 | source = "registry+https://github.com/rust-lang/crates.io-index" 381 | checksum = "8895849a949e7845e06bd6dc1aa51731a103c42707010a5b591c0038fb73385b" 382 | 383 | [[package]] 384 | name = "unicode-width" 385 | version = "0.1.9" 386 | source = "registry+https://github.com/rust-lang/crates.io-index" 387 | checksum = "3ed742d4ea2bd1176e236172c8429aaf54486e7ac098db29ffe6529e0ce50973" 388 | 389 | [[package]] 390 | name = "unicode-xid" 391 | version = "0.2.2" 392 | source = "registry+https://github.com/rust-lang/crates.io-index" 393 | checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" 394 | 395 | [[package]] 396 | name = "utf8parse" 397 | version = "0.2.2" 398 | source = "registry+https://github.com/rust-lang/crates.io-index" 399 | checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" 400 | 401 | [[package]] 402 | name = "vec_map" 403 | version = "0.8.2" 404 | source = "registry+https://github.com/rust-lang/crates.io-index" 405 | checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" 406 | 407 | [[package]] 408 | name = "version_check" 409 | version = "0.9.3" 410 | source = "registry+https://github.com/rust-lang/crates.io-index" 411 | checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe" 412 | 413 | [[package]] 414 | name = "winapi" 415 | version = "0.3.9" 416 | source = "registry+https://github.com/rust-lang/crates.io-index" 417 | checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" 418 | dependencies = [ 419 | "winapi-i686-pc-windows-gnu", 420 | "winapi-x86_64-pc-windows-gnu", 421 | ] 422 | 423 | [[package]] 424 | name = "winapi-i686-pc-windows-gnu" 425 | version = "0.4.0" 426 | source = "registry+https://github.com/rust-lang/crates.io-index" 427 | checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" 428 | 429 | [[package]] 430 | name = "winapi-x86_64-pc-windows-gnu" 431 | version = "0.4.0" 432 | source = "registry+https://github.com/rust-lang/crates.io-index" 433 | checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" 434 | 435 | [[package]] 436 | name = "windows-sys" 437 | version = "0.52.0" 438 | source = "registry+https://github.com/rust-lang/crates.io-index" 439 | checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" 440 | dependencies = [ 441 | "windows-targets", 442 | ] 443 | 444 | [[package]] 445 | name = "windows-targets" 446 | version = "0.52.6" 447 | source = "registry+https://github.com/rust-lang/crates.io-index" 448 | checksum = 
"9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" 449 | dependencies = [ 450 | "windows_aarch64_gnullvm", 451 | "windows_aarch64_msvc", 452 | "windows_i686_gnu", 453 | "windows_i686_gnullvm", 454 | "windows_i686_msvc", 455 | "windows_x86_64_gnu", 456 | "windows_x86_64_gnullvm", 457 | "windows_x86_64_msvc", 458 | ] 459 | 460 | [[package]] 461 | name = "windows_aarch64_gnullvm" 462 | version = "0.52.6" 463 | source = "registry+https://github.com/rust-lang/crates.io-index" 464 | checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" 465 | 466 | [[package]] 467 | name = "windows_aarch64_msvc" 468 | version = "0.52.6" 469 | source = "registry+https://github.com/rust-lang/crates.io-index" 470 | checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" 471 | 472 | [[package]] 473 | name = "windows_i686_gnu" 474 | version = "0.52.6" 475 | source = "registry+https://github.com/rust-lang/crates.io-index" 476 | checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" 477 | 478 | [[package]] 479 | name = "windows_i686_gnullvm" 480 | version = "0.52.6" 481 | source = "registry+https://github.com/rust-lang/crates.io-index" 482 | checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" 483 | 484 | [[package]] 485 | name = "windows_i686_msvc" 486 | version = "0.52.6" 487 | source = "registry+https://github.com/rust-lang/crates.io-index" 488 | checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" 489 | 490 | [[package]] 491 | name = "windows_x86_64_gnu" 492 | version = "0.52.6" 493 | source = "registry+https://github.com/rust-lang/crates.io-index" 494 | checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" 495 | 496 | [[package]] 497 | name = "windows_x86_64_gnullvm" 498 | version = "0.52.6" 499 | source = "registry+https://github.com/rust-lang/crates.io-index" 500 | checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" 501 | 502 | [[package]] 503 | name = "windows_x86_64_msvc" 504 | version = "0.52.6" 505 | source = "registry+https://github.com/rust-lang/crates.io-index" 506 | checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" 507 | -------------------------------------------------------------------------------- /doc/source/stringsext--manpage.md: -------------------------------------------------------------------------------- 1 | % STRINGSEXT(1) Version 2.3.5 | Stringsext Documentation 2 | 3 | 65 | 66 | # NAME 67 | 68 | Search for multi-byte encoded strings in binary data. 69 | 70 | # SYNOPSIS 71 | 72 | stringsext [options] [-e ENC...] [--] [FILE...] 73 | stringsext [options] [-e ENC...] [--] [-] 74 | 75 | # DESCRIPTION 76 | 77 | **stringsext** is a Unicode enhancement of the *GNU strings* tool with 78 | additional functionalities: **stringsext** recognizes Cyrillic, CJKV 79 | characters and other scripts in all supported multi-byte-encodings, 80 | while *GNU strings* fails in finding any of these scripts in UTF-16 and 81 | many other encodings. 82 | 83 | **stringsext** is mainly useful for determining the Unicode content in 84 | binary data: It prints all graphic character sequences in *FILE* or 85 | *stdin* that are at least *MIN* bytes long. 86 | 87 | Unlike *GNU strings* **stringsext** can be configured to search for 88 | valid characters not only in ASCII but also in many other input 89 | encodings, e.g.: *utf-8, utf-16be, utf-16le, big5, euc-jp, koi8-r* 90 | and many others. 
**\--list-encodings** shows a list of valid encoding
91 | names based on the WHATWG Encoding Standard. When more than one encoding
92 | is specified, the scan is performed in different threads simultaneously.
93 |
94 | When searching for UTF-16 encoded strings, 96% of all possible two-byte
95 | sequences, interpreted as UTF-16 code units, map directly to Unicode
96 | code points. As a result, the probability of encountering valid Unicode
97 | characters in a random byte stream, interpreted as UTF-16, is also 96%.
98 | To reduce this large number of false positives, **stringsext**
99 | provides a parametrizable Unicode-block-filter. See the **\--encoding**
100 | option for more details.
101 |
102 | **stringsext** reads its input data from (multiple) **FILE**s. When no
103 | **FILE** is given, or when **FILE** is "`-`", it reads from standard input *stdin*.
104 |
105 | When invoked with "`stringsext -e ascii`", **stringsext** can be used
106 | as a *GNU strings* replacement.
107 |
108 |
109 | # OPTIONS
110 |
111 | **-a** *AF*, **\--ascii-filter**=*AF*
112 |
113 | : Apply ASCII-Filter. After the string-findings have been decoded into
114 | UTF-8, the ASCII-filter is one of the 4 filters all string-findings have
115 | to pass before being printed. The ASCII-filter is applied to Unicode
116 | characters in "`U+0000..U+007F`" only. The filter parameter AF decides
117 | which of these codes pass the filter. AF is a 128-bit integer,
118 | where each bit is mapped to one character in the above character range,
119 | e.g. the character "`U+0020`" passes the filter only if bit no. 32
120 | (=0x20) is set. If the filter is configured with bit no. 32 cleared, all
121 | "`U+0020`" characters are rejected.
122 |
123 | The integer AF is written in hexadecimal with the prefix "`0x...`".
124 | For the most common use-cases, predefined filters can be set:
125 | e.g. alias names like "`All-Ctrl`" or "`All-Ctrl+Wsp`" are shorthand
126 | terms for the ASCII-filters "all codes, but no control-codes" or
127 | "all codes, including white-space, but no control-codes".
128 | See the output of "`--list-encodings`" for more details
129 | about filter-names.
130 |
131 | **-c**, **\--no-metadata**
132 |
133 | : Suppress all metadata in the output.
134 | "`stringsext`" presents its string-findings in one or more
135 | output-lines. Each line shows some meta information before
136 | printing the finding itself. See the section "`Output Format`"
137 | for more information about metadata.
138 |
139 | **-d**, **\--debug-options**
140 |
141 | : Show how command-line-options are interpreted. When set, "`stringsext`"
142 | prints an exhaustive synopsis of the filter parameters. Can be used for debugging
143 | to check how the present command-line-arguments are interpreted or for
144 | documentation purposes. Does not run the scanner.
145 |
146 | **-e** *ENC*, **\--encoding**=*ENC*
147 |
148 | : Set (multiple) input search encodings.
149 |
150 | *ENC*==*[ENCNAME],*\[*MIN*\],\[*AF*\],\[*UBF*\],
151 | \[*GREP*\]
152 |
153 | *ENCNAME*
154 |
155 | : Search for strings encoded as ENCNAME. Encoding names
156 | *ENCNAME* are denoted following the WHATWG standard.
157 | "`--list-encodings`" prints a list of available encodings.
158 |
159 | *MIN*, *AF*, *UBF*, *GREP*
160 |
161 | : Once the input is decoded to UTF-8, all characters have to pass 4
162 | additional filters before being printed: MIN (see "`--chars-min`"),
163 | AF (see "`--ascii-filter`"), UBF (see "`--unicode-block-filter`") and
164 | GREP (see "`--grep-char`").
165 |
166 | The values given here override - for this ENC only - the default
167 | values given by "`--chars-min`", "`--ascii-filter`",
168 | "`--unicode-block-filter`" and "`--grep-char`".
169 |
170 | "`--list-encodings`" prints a list of predefined filter-names.
171 |
172 | **-g** *ASCII_CODE*, **\--grep-char**=*ASCII_CODE*
173 |
174 | : Print only findings having at least one character with ASCII_CODE.
175 | "`--grep-char`" is one of the 4 filters decoded output lines must pass
176 | before being printed. "`--grep-char`" checks for the presence of
177 | ASCII_CODE in the first output-line of a string-finding. The ASCII-code
178 | can be given as a decimal or hexadecimal number. The latter starts with
179 | "`0x...`". Useful values are "`47`" (`/`) or "`92`" (`\`) for path
180 | search.
181 |
182 | **-h, \--help**
183 |
184 | : Print a synopsis of available options and default values.
185 |
186 | **-l, \--list-encodings**
187 |
188 | : List available encodings as WHATWG-Encoding-Standard-names,
189 | predefined ASCII-filter and Unicode-Block-Filter alias names.
190 |
191 | **-n** *MIN*, **\--chars-min**=*MIN*
192 |
193 | : Print only strings at least *MIN* characters long. The string length is
194 | measured in Unicode-characters (codepoints). **\--help** shows
195 | the default value.
196 |
197 | **-p** *FILE*, **\--output**=*FILE*
198 |
199 | : Print to *FILE* instead of *stdout*.
200 |
201 | **-q** *NUM*, **\--output-line-len**=*NUM*
202 |
203 | : Set the printed output-line-length in UTF-8 characters (string-findings
204 | only, metadata excluded). The line-length is limited by an internal
205 | buffer size value (see "`OUTPUT_BUF_LEN`" in the source code). A value
206 | "`NUM`" bigger than "`OUTPUT_BUF_LEN/2`" is set to "`OUTPUT_BUF_LEN/2`".
207 | The longer the line-length is, the fewer strings will be wrapped to the
208 | next line. The downside of long output lines is that the scanner loses
209 | precision in locating the findings.
210 |
211 | **-r**, **\--same-unicode-block**
212 |
213 | : Require all characters in a finding to originate from the same Unicode
214 | block. This option helps to reduce false positives, especially when
215 | scanning for UTF-16. When set, "`stringsext`" prints only Unicode-block
216 | homogeneous strings. For example: "`-u All -n 10 -r`" finds a sequence of at
217 | least 10 Cyrillic characters in a row or at least 10 Greek characters
218 | in a row, whereas it ignores strings of randomly mixed Cyrillic and Greek
219 | characters. Technically, this option guarantees that all multibyte
220 | characters of a finding - decoded into UTF-8 - start with the same leading
221 | byte. This might become the default behaviour in some future version of
222 | **stringsext**.
223 |
224 | **-s** *NUM*, **\--counter-offset**=*NUM*
225 |
226 | : Start offset NUM for the input-stream-byte-counter, given as a decimal or
227 | hexadecimal integer. This is useful when large input data is stored
228 | split into separate files and when these files are so big that they should
229 | be analysed in separate **stringsext** runs.
230 |
231 | Note: in general, it is better to treat all input files in one run by
232 | listing them as command-line parameters. **stringsext** then concatenates
233 | the split input-files into one input-stream before analysing it. This way
234 | it can even recognize strings that are split at the boundary between
235 | two input files.
236 |
237 | **-t** *RADIX*, **\--radix**=*RADIX*
238 |
239 | : Print the position of the decoded string.
The position is indicated as an
240 | input-stream byte offset. The single-character argument specifies the
241 | RADIX of the offset: **o** for octal, **x** for hexadecimal, or **d** for
242 | decimal.
243 |
244 | **-u** *UBF*, **\--unicode-block-filter**=*UBF*
245 |
246 | : Unicode-block-filter UBF applied after decoding to UTF-8.
247 |
248 | The decoder first searches for validly encoded character sequences in the
249 | input stream. Then, the sequence of valid characters is decoded into a
250 | chunk of UTF-8 characters, which has to pass 4 filters before being
251 | printed: "`--chars-min`", "`--ascii-filter`", "`--unicode-block-filter`" and
252 | "`--grep-char`".
253 |
254 | The Unicode-block-filter applies to all decoded UTF-8 characters "`>
255 | U+007f`" and can be parametrized with the "`--unicode-block-filter`" option,
256 | which takes a 64-bit integer given in hexadecimal, prefixed with "`0x...`".
257 |
258 | Every bit "`0..=63`" maps to one leading-byte's code position in
259 | "`0xC0..0xFF`", e.g. if bit 0 is set -> all characters with leading byte
260 | "`0xC0`" pass the filter; if bit 1 is set -> all characters with leading
261 | byte "`0xC1`" pass the filter. Otherwise, the character is rejected. For
262 | example, to print only Syriac, set UBF to "`0x1000_0000`" (bit number 28
263 | set) and AF to "`0x0`". This instructs the filter to let only UTF-8
264 | characters whose leading byte is "`0xDC`" pass. Table 3 of the project
265 | documentation shows the UTF-8 leading bytes and
266 | their codes.
267 |
268 | Alternatively, predefined alias names for the most common Unicode-blocks
269 | can be used: e.g. "`Latin`", "`Cyrillic`", "`Greek`" and many others. See the
270 | output of "`--list-encodings`" for more predefined filter names.
271 |
272 |
273 | **-V, \--version**
274 |
275 | : Print version info and exit.
276 |
277 | # EXIT STATUS
278 |
279 | **0**
280 |
281 | : Success.
282 |
283 | **other values**
284 |
285 | : Failure.
286 |
287 |
288 | # OUTPUT FORMAT
289 |
290 | The way **stringsext** prints its output can be configured with the following
291 | options: "`--no-metadata`", "`--radix`" and "`--output-line-len`". The first,
292 | "`--no-metadata`", controls whether metadata is printed, "`--radix`"
293 | determines if and how the byte-counter is shown, and the latter,
294 | "`--output-line-len`", determines at what byte position string-findings are wrapped to the
295 | next line.
296 |
297 | **stringsext**'s output syntax is best illustrated by example. Consider
298 | the following screen-shot:
299 |
300 | ```
301 | stringsext -t x -q 30 -e utf8,10 -e ascii,50 test.txt test-small.txt (1)
302 |                                                                      (2)
303 | A 0    (a UTF-8) Who Moved My Cheese?           (3)
304 | A <1e  (a UTF-8) An A-Mazing Way To Deal With C (4)
305 | A >1e+ (a UTF-8) hange In                       (5)
306 | A <1e  (b ascii) An A-Mazing Way To Deal With C (6)
307 | A >1e+ (b ascii) hange In                       (7)
308 | A 3c+  (a UTF-8) Your Work                      (8)
309 | A >3c  (a UTF-8) And In Your Life               (9)
310 | A 3c+  (b ascii) Your Work                      (10)
311 | ```
312 |
313 | (3): The letter "`A`" in the first column indicates that the input originates
314 | from the first input file "`test.txt`". "`B`" denotes the second input file, etc.
315 |
316 | (3): "`0`" indicates that the string-finding "`Who Moved My Cheese?`" was found
317 | at position "`0x0`".
318 |
319 | (4): "`<1e`" means that the string-finding "`An A-Mazing Way To Deal With C`"
320 | was found somewhere in "`0x1..=0x1e`".
In addition, the implemented algorithm
321 | guarantees that the string-finding is never more than 60 bytes (2 * `-q 30`) away
322 | from the indicated position, here: "`0x1e`".
323 |
324 | (5): The string-finding "`hange In`" continues the previous string, hence "`+`",
325 | and is situated "`>1e`", meaning somewhere in the range "`0x1f..=3b`".
326 | Here again, it is guaranteed that the string-finding is always less
327 | than 60 bytes (2 * `-q 30`) away from "`1e`".
328 |
329 | (3): "`a`" in "`(a UTF-8)`" indicates that the string-finding "`Who Moved My
330 | Cheese?`" was found by the first scanner "`-e utf8,10`".
331 |
332 | (6): "`b`" refers to the second scanner, here "`-e ascii,50`".
333 |
334 |
335 |
336 | # EXAMPLES
337 |
338 | List available encodings and predefined filter names:
339 |
340 |     stringsext -l
341 |
342 | Search for UTF-8 and UTF-16 Big-Endian encoded strings:
343 |
344 |     stringsext -t x -e utf-8 -e utf-16be -- someimage.raw
345 |
346 | The same, but read from "`stdin`":
347 |
348 |     cat someimage.raw | stringsext -t x -e utf-8 -e utf-16be -- -
349 |
350 | Scan a non-file device:
351 |
352 |     stringsext -t x -e utf-8 -e utf-16be -- /dev/sda1
353 |
354 | Reduce the number of false positives when scanning for UTF-16LE or UTF-16BE:
355 |
356 |     stringsext -t x --same-unicode-block -e UTF-16le -- someimage.raw
357 |
358 | Search for Cyrillic only:
359 |
360 |     stringsext -t x -e UTF-16le,,None,Cyrillic -- someimage.raw
361 |
362 | Search for UTF-16LE encoded Arabic and the digits 0 to 9:
363 |
364 |     stringsext -t x -e UTF-16le,,0x3f000000000000,Arabic -- someimage.raw
365 |
366 | Search for UTF-8 encoded Syriac and all ASCII, control-codes excluded:
367 |
368 |     stringsext -t x -e UTF-8,,All-Ctrl,0x10000000 -- someimage.raw
369 |
370 | Combine Little-Endian and Big-Endian scanning:
371 |
372 |     stringsext -t x -e UTF-16be -e UTF-16le -- someimage.raw
373 |
374 | Show the filter default values used in the above example for debugging:
375 |
376 |     stringsext -d -t x -e UTF-16be -e UTF-16le -- someimage.raw
377 |
378 | Search for path-names and URLs in some disk-partition:
379 |
380 |     sudo stringsext -t x -e utf-8 -n 15 -g 47 -- /dev/disk/by-uuid/91C8-2721
381 |
382 | Equivalent to the above:
383 |
384 |     sudo stringsext -t x -e utf-8,15,,,47 -- /dev/disk/by-uuid/91C8-2721
385 |
386 |
387 | # OPERATING PRINCIPLE
388 |
389 | A *valid* string is a sequence of valid characters according to the encoding
390 | chosen with **\--encoding**. A valid string may contain
391 | *control* characters and *graphic* (visible and human readable)
392 | characters. **stringsext** is a tool to extract graphic characters out of
393 | binary data streams.
394 |
395 | Scanners are parametrized with the **\--encoding ENC** option. Multiple scanners
396 | may operate in parallel. Their search field is divided into input chunks of "2 *
397 | `--output-line-len`" bytes (see the source code documentation for details).
398 |
399 | Before being printed, valid strings must pass four different **filters**, whose
400 | criteria are defined with the parameters *MIN*, *AF*, *UBF* and *GREP*
401 | (see above).
402 |
403 |
404 | # LIMITATIONS
405 |
406 | The ASCII character GREP, searched for with the "`--grep-char`" option, must appear
407 | within the first "`--output-line-len`" bytes of a finding to be reliably found in long strings.
408 | Increase "`--output-line-len`" if you search for very long strings.
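
For example, when grepping for path separators in long strings, a larger
output-line-length keeps the grep character inside the checked window. A sketch
with illustrative values, combining the documented options "`-q`", "`-n`" and "`-g`":

    stringsext -t x -q 256 -n 15 -g 47 -e utf-8 -- someimage.raw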
409 | 410 | ## Limitations related to the encoding_rs library 411 | 412 | **stringsext** version 2 uses the external library **encoding_rs** to decode 413 | the incoming stream. Compared to the previous library **rust-encoding** 414 | used in **stringsext** version 1, the current library has some shortcomings 415 | mainly due to the restrictive API policy of the **encoding_rs** project. 416 | 417 | 1. **stringsext** could be faster, if **encoding_rs** were extensible 418 | (**rust-encoding** was): [feature request: ASCII-filter · Issue #46 · 419 | hsivonen/encoding_rs](https://github.com/hsivonen/encoding_rs/issues/46) 420 | 421 | 2. **stringsext**'s location counter could be more precise if the encoder 422 | state were observable: [Enhancement: get read access to the decoder's 423 | inner state · Issue #48 · 424 | hsivonen/encoding_rs](https://github.com/hsivonen/encoding_rs/issues/48) 425 | 426 | 3. **stringsext**'s encoding list could be more up to date, if **encoding_rs**' 427 | list were `public`: [Make encoding lists public by getreu · Pull Request 428 | #47 · 429 | hsivonen/encoding_rs](https://github.com/hsivonen/encoding_rs/pull/47) 430 | 431 | While being desirable, the **stringsext** project does not have the required 432 | resources to maintain a fork of **encoding_rs**. 433 | 434 | 435 | # RESOURCES 436 | 437 | **Project website:** 438 | 439 | # COPYING 440 | 441 | Copyright (C) 2016-2020 Jens Getreu 442 | 443 | Licensed under either of 444 | 445 | - [Apache Licence](), Version 2.0 446 | or 447 | ) 448 | - [MIT licence]() 449 | 450 | at your option. 451 | 452 | ## Contribution 453 | 454 | Unless you explicitly state otherwise, any contribution intentionally 455 | submitted for inclusion in the work by you, as defined in the Apache-2.0 456 | licence, shall be dual licensed as above, without any additional terms 457 | or conditions. Licensed under the Apache Licence, Version 2.0 (the 458 | \"Licence\"); you may not use this file except in compliance with the 459 | Licence. 460 | 461 | 462 | # AUTHORS 463 | 464 | Jens Getreu 465 | -------------------------------------------------------------------------------- /src/scanner.rs: -------------------------------------------------------------------------------- 1 | //! Find encoded strings in some input chunk, apply a filter (defined by a 2 | //! `Mission`-object) and store the filtered strings as UTF-8 in `Finding`-objects. 3 | 4 | extern crate encoding_rs; 5 | 6 | use crate::input::ByteCounter; 7 | use crate::mission::Mission; 8 | use crate::mission::MISSIONS; 9 | use encoding_rs::Decoder; 10 | use std::ops::Deref; 11 | 12 | /// A vector of `ScannerState` s. 13 | pub struct ScannerStates { 14 | /// Vector of ScannerState 15 | pub v: Vec, 16 | } 17 | 18 | impl ScannerStates { 19 | /// Constructor. 20 | pub fn new(missions: &'static MISSIONS) -> Self { 21 | let mut v = Vec::with_capacity(missions.len()); 22 | for i in 0..missions.len() { 23 | v.push(ScannerState::new(&missions[i])) 24 | } 25 | Self { v } 26 | } 27 | } 28 | 29 | /// Access `ScannerState` without `.v`. 30 | impl Deref for ScannerStates { 31 | type Target = Vec; 32 | 33 | fn deref(&self) -> &Self::Target { 34 | &self.v 35 | } 36 | } 37 | 38 | /// Some object that holds the state of the `scanner::FindingCollection::scan()` function allowing 39 | /// to process the input stream in batches. 
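///
/// A minimal usage sketch, mirroring the unit tests at the end of this file
/// (`MISSION_ALL_UTF8` is a test-only `Mission`; the input bytes and file id
/// are illustrative, not a complete example):
///
/// ```ignore
/// // One `ScannerState` per `Mission`; feed the input stream chunk by chunk.
/// let mission: &'static Mission = &MISSION_ALL_UTF8;
/// let mut ss = ScannerState::new(mission);
/// // `true` marks the last chunk of the input stream.
/// let fc = FindingCollection::from(&mut ss, Some(0), b"some input chunk", true);
/// for finding in &fc.v {
///     println!("{}\t{}", finding.position, finding.s);
/// }
/// ```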
40 | pub struct ScannerState { 41 | /// It contains all (static) information needed to parametrize the decoding and the 42 | /// filtering performed by `scanner::FindingCollection::scan()` 43 | pub mission: &'static Mission, 44 | 45 | /// The decoder may hold in its internal state, among other 46 | /// things, some bytes of output, when a multibyte encoder was cut at the end 47 | /// of a buffer. 48 | pub decoder: Decoder, 49 | 50 | /// For short strings (`< chars_min_nb`) at the very end of the buffer, we 51 | /// can not decide immediately, if they have to be printed or not, because we 52 | /// can not `peek()` into what is coming in the next chunk. Maybe the 53 | /// beginning of the next chunk completes this short string from the previous 54 | /// run, and both together are long enough (`>= chars_min_nb`) to be printed? 55 | pub last_scan_run_leftover: String, 56 | 57 | /// The last printed string touched the right boundary of the buffer, so it 58 | /// might cut and to be continued with the first string in the next run. 59 | /// `last_run_str_was_printed_and_is_maybe_cut_str` remembers this fact and 60 | /// advises the filter to check if the first string of the next run touches 61 | /// the left boundary of the buffer. If yes, this string will be printed, 62 | /// whatever length it has. 63 | pub last_run_str_was_printed_and_is_maybe_cut_str: bool, 64 | 65 | /// This an absolute byte counter counting bytes of the input stream. The 66 | /// value will be update after a `FindingCollection::scan()` run to point to the first not 67 | /// scanned byte in the input stream. 68 | pub consumed_bytes: ByteCounter, 69 | } 70 | 71 | impl ScannerState { 72 | /// Constructor. 73 | pub fn new(mission: &'static Mission) -> Self { 74 | Self { 75 | mission, 76 | decoder: mission.encoding.new_decoder_without_bom_handling(), 77 | // 78 | // We keep only short substrings for the next run, because about all 79 | // longer ones we can decide immediately. 80 | // `mission.chars_min_nb` is enough space, we never need more. 81 | // We multiply `mission.chars_min_nb` by 4, because it is 82 | // counted Unicode-codepoints and a codepoint can have 83 | // maximum 4 bytes in UTF-8. 84 | last_scan_run_leftover: String::with_capacity(mission.output_line_char_nb_max), 85 | last_run_str_was_printed_and_is_maybe_cut_str: false, 86 | consumed_bytes: mission.counter_offset, 87 | } 88 | } 89 | } 90 | 91 | #[cfg(test)] 92 | pub mod tests { 93 | use super::*; 94 | use crate::finding::Precision; 95 | use crate::finding_collection::FindingCollection; 96 | use crate::mission::Mission; 97 | use crate::mission::{Utf8Filter, AF_ALL, AF_CTRL, AF_WHITESPACE, UBF_LATIN, UBF_NONE}; 98 | use crate::mission::{UTF8_FILTER_ALL_VALID, UTF8_FILTER_LATIN}; 99 | use encoding_rs::Encoding; 100 | use lazy_static::lazy_static; 101 | 102 | // To see println!() output in test run, launch 103 | // cargo test -- --nocapture 104 | 105 | lazy_static! { 106 | pub static ref MISSION_ALL_UTF8: Mission = Mission { 107 | mission_id: 0, 108 | counter_offset: 10_000, 109 | print_encoding_as_ascii: false, 110 | encoding: Encoding::for_label(("utf-8").as_bytes()).unwrap(), 111 | chars_min_nb: 3, 112 | require_same_unicode_block: false, 113 | // this is a pass all filter 114 | filter: UTF8_FILTER_ALL_VALID, 115 | output_line_char_nb_max: 10, 116 | }; 117 | } 118 | lazy_static! 
{ 119 | pub static ref MISSION_LATIN_UTF8: Mission = Mission { 120 | mission_id: 0, 121 | counter_offset: 10_000, 122 | print_encoding_as_ascii: false, 123 | encoding: Encoding::for_label(("utf-8").as_bytes()).unwrap(), 124 | chars_min_nb: 3, 125 | require_same_unicode_block: false, 126 | // this is a pass all filter 127 | filter: UTF8_FILTER_LATIN, 128 | output_line_char_nb_max: 10, 129 | }; 130 | } 131 | 132 | lazy_static! { 133 | pub static ref MISSION_LATIN_UTF8_GREP42: Mission = Mission { 134 | mission_id: 0, 135 | counter_offset: 10_000, 136 | print_encoding_as_ascii: false, 137 | encoding: Encoding::for_label(("utf-8").as_bytes()).unwrap(), 138 | chars_min_nb: 3, 139 | require_same_unicode_block: false, 140 | // this is a pass all filter 141 | filter: Utf8Filter { 142 | af: AF_ALL & !AF_CTRL | AF_WHITESPACE, 143 | ubf: UBF_LATIN, 144 | grep_char: Some(42), 145 | }, 146 | output_line_char_nb_max: 10, 147 | }; 148 | } 149 | 150 | lazy_static! { 151 | pub static ref MISSION_ALL_X_USER_DEFINED: Mission = Mission { 152 | mission_id: 0, 153 | counter_offset: 10_000, 154 | print_encoding_as_ascii: false, 155 | encoding: Encoding::for_label(("x-user-defined").as_bytes()).unwrap(), 156 | chars_min_nb: 3, 157 | require_same_unicode_block: false, 158 | filter: UTF8_FILTER_ALL_VALID, 159 | output_line_char_nb_max: 10, 160 | }; 161 | } 162 | lazy_static! { 163 | pub static ref MISSION_ASCII: Mission = Mission { 164 | mission_id: 0, 165 | counter_offset: 10_000, 166 | print_encoding_as_ascii: false, 167 | encoding: Encoding::for_label(("x-user-defined").as_bytes()).unwrap(), 168 | chars_min_nb: 3, 169 | require_same_unicode_block: false, 170 | // this is a pass all filter 171 | filter: Utf8Filter { 172 | af: AF_ALL & !AF_CTRL | AF_WHITESPACE, 173 | ubf: UBF_NONE, 174 | grep_char: None, 175 | }, 176 | output_line_char_nb_max: 10, 177 | }; 178 | } 179 | lazy_static! { 180 | pub static ref MISSION_REAL_DATA_SCAN: Mission = Mission { 181 | mission_id: 0, 182 | counter_offset: 10_000, 183 | print_encoding_as_ascii: false, 184 | encoding: Encoding::for_label(("utf-8").as_bytes()).unwrap(), 185 | chars_min_nb: 4, 186 | require_same_unicode_block: false, 187 | // this is a pass all filter 188 | filter: UTF8_FILTER_LATIN, 189 | output_line_char_nb_max: 60, 190 | }; 191 | } 192 | #[test] 193 | fn test_scan_input_buffer_chunks() { 194 | // This test uses INP_BUF_LEN=0x20 and 195 | // OUTPUT_BUF_LEN=0x40. 196 | // For other parameter see `ALL` above. 197 | let m: &'static Mission = &MISSION_ALL_UTF8; 198 | 199 | let mut ss = ScannerState::new(m); 200 | 201 | let input = b"a234567890b234567890c234"; 202 | let fc = FindingCollection::from(&mut ss, Some(0), input, true); 203 | 204 | assert_eq!(fc.v[0].position, 10000); 205 | assert_eq!(fc.v[0].position_precision, Precision::Exact); 206 | assert_eq!(fc.v[0].s, "a234567890"); 207 | 208 | assert_eq!(fc.v[1].position, 10000); 209 | assert_eq!(fc.v[1].position_precision, Precision::After); 210 | assert_eq!(fc.v[1].s, "b234567890"); 211 | 212 | assert_eq!(fc.v[2].position, 10020); 213 | assert_eq!(fc.v[2].position_precision, Precision::Exact); 214 | assert_eq!(fc.v[2].s, "c234"); 215 | assert!(!ss.last_run_str_was_printed_and_is_maybe_cut_str); 216 | 217 | assert_eq!(fc.first_byte_position, 10000); 218 | // This should never be true, since `OUTPUT_BUF_LEN` is 2* `INP_BUF_LEN`. 
219 | assert!(!fc.str_buf_overflow); 220 | assert_eq!(ss.consumed_bytes, 10000 + 24); 221 | } 222 | 223 | #[test] 224 | fn test_scan_store_in_scanner_state() { 225 | // This test uses INP_BUF_LEN=0x20 and 226 | // OUTPUT_BUF_LEN=0x40. 227 | // For other parameter see `ALL` above. 228 | let m: &'static Mission = &MISSION_ALL_UTF8; 229 | 230 | let mut ss = ScannerState::new(m); 231 | 232 | let input = b"a234567890b234567890c2"; 233 | // True because this is the only and last input. 234 | let fc = FindingCollection::from(&mut ss, Some(0), input, true); 235 | 236 | assert_eq!(fc.v.len(), 3); 237 | assert_eq!(fc.first_byte_position, 10000); 238 | // This should never be true, since `OUTPUT_BUF_LEN` is 2* `INP_BUF_LEN`. 239 | assert!(!fc.str_buf_overflow); 240 | 241 | assert_eq!(fc.v[0].position, 10000); 242 | assert_eq!(fc.v[0].position_precision, Precision::Exact); 243 | assert_eq!(fc.v[0].s, "a234567890"); 244 | 245 | assert_eq!(fc.v[1].position, 10000); 246 | assert_eq!(fc.v[1].position_precision, Precision::After); 247 | assert_eq!(fc.v[1].s, "b234567890"); 248 | 249 | assert_eq!(fc.v[2].position, 10020); 250 | assert_eq!(fc.v[2].position_precision, Precision::Exact); 251 | assert_eq!(fc.v[2].s, "c2"); 252 | 253 | assert!(!ss.last_run_str_was_printed_and_is_maybe_cut_str); 254 | assert_eq!(ss.consumed_bytes, 10000 + 22); 255 | } 256 | 257 | #[test] 258 | fn test_split_str_iterator_and_store_in_scanner_state() { 259 | // This test uses INP_BUF_LEN=0x20 and 260 | // OUTPUT_BUF_LEN=0x40. 261 | // For other parameter see `ALL` above. 262 | // We test UTF-8 as input encoding. 263 | let m: &'static Mission = &MISSION_ALL_UTF8; 264 | 265 | let mut ss = ScannerState::new(m); 266 | 267 | let input = b"You\xC0\x82\xC0co"; 268 | // `false` because this is not the last input. 269 | let fc = FindingCollection::from(&mut ss, Some(0), input, false); 270 | 271 | assert_eq!(fc.v[0].position, 10000); 272 | assert_eq!(fc.v[0].position_precision, Precision::Exact); 273 | assert_eq!(fc.v[0].s, "You"); 274 | 275 | // "co" is not printed, because we do not know if 276 | // it can be completed by the next run. 277 | // It will be forwarded to the next run. 278 | assert_eq!(fc.v.len(), 1); 279 | assert_eq!(ss.last_scan_run_leftover, "co"); 280 | 281 | assert_eq!(fc.first_byte_position, 10000); 282 | assert!(!fc.str_buf_overflow); 283 | assert_eq!(ss.consumed_bytes, 10000 + 8); 284 | 285 | let input = b"me\xC0\x82\xC0home."; 286 | // True, because last input. 287 | let fc = FindingCollection::from(&mut ss, Some(0), input, true); 288 | 289 | assert_eq!(fc.v.len(), 2); 290 | assert_eq!(fc.v[0].position, 10008); 291 | assert_eq!(fc.v[0].position_precision, Precision::Before); 292 | // Note the "co"! 293 | assert_eq!(fc.v[0].s, "come"); 294 | 295 | assert_eq!(fc.v[1].position, 10013); 296 | assert_eq!(fc.v[1].position_precision, Precision::Exact); 297 | assert_eq!(fc.v[1].s, "home."); 298 | 299 | assert_eq!(ss.last_scan_run_leftover, ""); 300 | 301 | assert_eq!(fc.first_byte_position, 10008); 302 | assert!(!fc.str_buf_overflow); 303 | assert_eq!(ss.consumed_bytes, 10008 + 10); 304 | } 305 | 306 | #[test] 307 | fn test_grep_in_scan() { 308 | // This test uses INP_BUF_LEN=0x20 and 309 | // OUTPUT_BUF_LEN=0x40. 310 | // For other parameter see `ALL` above. 311 | // We test UTF-8 as input encoding. 312 | let m: &'static Mission = &MISSION_LATIN_UTF8_GREP42; 313 | 314 | let mut ss = ScannerState::new(m); 315 | 316 | let input = b"You\xC0\x82\xC0co"; 317 | // `false` because this is not the last input. 
318 | let fc = FindingCollection::from(&mut ss, Some(0), input, false); 319 | 320 | assert_eq!(fc.v.len(), 0); 321 | 322 | // "co" is not printed, because we do not know if 323 | // it can be completed by the next run. 324 | // It will be forwarded to the next run. 325 | assert_eq!(ss.last_scan_run_leftover, "co"); 326 | 327 | assert_eq!(fc.first_byte_position, 10000); 328 | assert!(!fc.str_buf_overflow); 329 | assert_eq!(ss.consumed_bytes, 10000 + 8); 330 | 331 | let input = b"me*\xC0\x82\xC0ho*me.\x82"; 332 | // True, because last input. 333 | let fc = FindingCollection::from(&mut ss, Some(0), input, true); 334 | 335 | assert_eq!(fc.v.len(), 2); 336 | assert_eq!(fc.v[0].position, 10008); 337 | assert_eq!(fc.v[0].position_precision, Precision::Before); 338 | // Note the "co"! 339 | assert_eq!(fc.v[0].s, "come*"); 340 | 341 | assert_eq!(fc.v[1].position, 10014); 342 | assert_eq!(fc.v[1].position_precision, Precision::Exact); 343 | assert_eq!(fc.v[1].s, "ho*me."); 344 | 345 | assert_eq!(ss.last_scan_run_leftover, ""); 346 | 347 | assert_eq!(fc.first_byte_position, 10008); 348 | assert!(!fc.str_buf_overflow); 349 | assert_eq!(ss.consumed_bytes, 10008 + 13); 350 | } 351 | 352 | #[test] 353 | /// What happens when a multi-byte UTF-8 is split at the 354 | /// end of the input buffer between two scan runs? 355 | fn test_scan_buffer_split_multibyte() { 356 | // We test UTF-8 as input encoding. 357 | let m: &'static Mission = &MISSION_ALL_UTF8; 358 | 359 | let mut ss = ScannerState::new(m); 360 | 361 | // One letter more, and we get "OutputFull" because 362 | // the scanner can not be sure to have enough space. 363 | // The last bytes are the beginning of a multi-byte 364 | // character, that is cut between two runs. 365 | let input = b"word\xe2\x82"; 366 | 367 | // This `FindingCollection` is empty. 368 | let _fc = FindingCollection::from(&mut ss, Some(0), input, false); 369 | 370 | //println!("{:#?}",fc); 371 | 372 | //second run 373 | // The first byte is the remaining € sign from the 374 | // last run. 375 | let input = b"\xacoh\xC0no no"; 376 | 377 | let fc = FindingCollection::from(&mut ss, Some(0), input, false); 378 | 379 | //println!("{:#?}",fc); 380 | 381 | assert_eq!(fc.v[0].position, 10006); 382 | assert_eq!(fc.v[0].position_precision, Precision::Before); 383 | assert_eq!(fc.v[0].s, "word€oh"); 384 | 385 | assert_eq!(fc.first_byte_position, 10006); 386 | assert!(!fc.str_buf_overflow); 387 | assert_eq!(ss.consumed_bytes, 10006 + 9); 388 | 389 | // Third run. 390 | // There are no remaining bytes stored in the decoder. The first byte is the beginning 391 | // of the € sign. 392 | let input = b"\xe2\x82\xacStream end."; 393 | 394 | let fc = FindingCollection::from(&mut ss, Some(0), input, true); 395 | 396 | //println!("{:#?}", fc); 397 | 398 | assert_eq!(fc.len(), 2); 399 | 400 | assert_eq!(fc.v[0].position, 10015); 401 | assert_eq!(fc.v[0].position_precision, Precision::Before); 402 | assert_eq!(fc.v[0].s, "no no€Stre"); 403 | // Here the line is full. 404 | 405 | assert_eq!(fc.v[1].position, 10015); 406 | assert_eq!(fc.v[1].position_precision, Precision::After); 407 | assert_eq!(fc.v[1].s, "am end."); 408 | 409 | assert_eq!(fc.first_byte_position, 10015); 410 | assert!(!fc.str_buf_overflow); 411 | assert_eq!(ss.consumed_bytes, 10015 + 14); 412 | } 413 | 414 | #[test] 415 | fn test_to_short1() { 416 | // As `chars_min_nb` is 3, we expect stings with 417 | // length 1 to be omitted. 418 | 419 | // We test UTF-8 as input encoding. 
420 | let m: &'static Mission = &MISSION_ALL_UTF8; 421 | 422 | let mut ss = ScannerState::new(m); 423 | 424 | let input = b"ii\xC0abc\xC0\xC1de\xC0fgh\xC0ijk"; 425 | 426 | let fc = FindingCollection::from(&mut ss, Some(0), input, false); 427 | 428 | //println!("{:#?}", fc.v); 429 | 430 | assert_eq!(fc.first_byte_position, 10000); 431 | assert!(!fc.str_buf_overflow); 432 | assert_eq!(fc.v.len(), 2); 433 | 434 | assert_eq!(fc.v[0].s, "abc"); 435 | assert_eq!(fc.v[0].position, 10003); 436 | assert_eq!(fc.v[0].position_precision, Precision::Exact); 437 | 438 | // Note that "de" is missing, too short. 439 | assert_eq!(fc.v[1].s, "fgh"); 440 | assert_eq!(fc.v[1].position, 10011); 441 | assert_eq!(fc.v[1].position_precision, Precision::Exact); 442 | 443 | assert_eq!(ss.consumed_bytes, 10000 + 18); 444 | assert!(!ss.last_run_str_was_printed_and_is_maybe_cut_str); 445 | assert_eq!(ss.last_scan_run_leftover, "ijk"); 446 | 447 | // Second run 448 | // Only "def" is long enough. 449 | let input = b"b\xC0\x82c\xC0def"; 450 | 451 | let fc = FindingCollection::from(&mut ss, Some(0), input, true); 452 | 453 | //println!("{:#?}", fc.v); 454 | 455 | assert_eq!(fc.first_byte_position, 10018); 456 | assert!(!fc.str_buf_overflow); 457 | assert_eq!(fc.v.len(), 2); 458 | 459 | assert_eq!(fc.v[0].position, 10018); 460 | assert_eq!(fc.v[0].position_precision, Precision::Before); 461 | assert_eq!(fc.v[0].s, "ijkb"); 462 | 463 | assert_eq!(fc.v[1].position, 10023); 464 | assert_eq!(fc.v[1].position_precision, Precision::Exact); 465 | assert_eq!(fc.v[1].s, "def"); 466 | 467 | assert_eq!(ss.consumed_bytes, 10018 + 8); 468 | assert!(!ss.last_run_str_was_printed_and_is_maybe_cut_str); 469 | assert_eq!(ss.last_scan_run_leftover, ""); 470 | } 471 | 472 | #[test] 473 | fn test_to_short2() { 474 | // As `chars_min_nb` is 3, we expect stings with 475 | // length 1 to be omitted. 476 | 477 | // We test UTF-8 as input encoding. 478 | let m: &'static Mission = &MISSION_LATIN_UTF8; 479 | 480 | let mut ss = ScannerState::new(m); 481 | 482 | let input = "ii€ääà€€de€fgh€ijk".as_bytes(); 483 | 484 | let fc = FindingCollection::from(&mut ss, Some(0), input, false); 485 | 486 | //println!("{:#?}", fc.v); 487 | 488 | assert_eq!(fc.first_byte_position, 10000); 489 | assert!(!fc.str_buf_overflow); 490 | assert_eq!(fc.v.len(), 2); 491 | 492 | assert_eq!(fc.v[0].s, "ääà"); 493 | assert_eq!(fc.v[0].position, 10000); 494 | // This was cut at the edge of `input_window`. 495 | assert_eq!(fc.v[0].position_precision, Precision::Exact); 496 | 497 | // Note that "de" is missing, too short. 498 | assert_eq!(fc.v[1].s, "fgh"); 499 | assert_eq!(fc.v[1].position, 10020); 500 | // This was cut at the edge of `input_window`. 501 | assert_eq!(fc.v[1].position_precision, Precision::Before); 502 | 503 | assert_eq!(ss.consumed_bytes, 10000 + 31); 504 | assert!(!ss.last_run_str_was_printed_and_is_maybe_cut_str); 505 | assert_eq!(ss.last_scan_run_leftover, "ijk"); 506 | 507 | // Second run 508 | // Only "def" is long enough. 509 | let input = b"b\xC0\x82c\xC0def"; 510 | 511 | let fc = FindingCollection::from(&mut ss, Some(0), input, true); 512 | 513 | //println!("{:#?}", fc.v); 514 | 515 | assert_eq!(fc.first_byte_position, 10031); 516 | assert!(!fc.str_buf_overflow); 517 | assert_eq!(fc.v.len(), 2); 518 | 519 | assert_eq!(fc.v[0].position, 10031); 520 | assert_eq!(fc.v[0].position_precision, Precision::Before); 521 | assert_eq!(fc.v[0].s, "ijkb"); 522 | 523 | assert_eq!(fc.v[1].position, 10036); 524 | // This was cut at the edge of `input_window`. 
525 | assert_eq!(fc.v[1].position_precision, Precision::Exact); 526 | assert_eq!(fc.v[1].s, "def"); 527 | 528 | assert_eq!(ss.consumed_bytes, 10031 + 8); 529 | assert!(!ss.last_run_str_was_printed_and_is_maybe_cut_str); 530 | assert_eq!(ss.last_scan_run_leftover, ""); 531 | } 532 | 533 | #[test] 534 | fn test_field_with_zeros() { 535 | // Input-data 536 | 537 | // 00000000 7f 45 4c 46 02 01 01 00 00 00 00 00 00 00 00 00 |.ELF............| 538 | // 00000010 03 00 3e 00 01 00 00 00 40 51 07 00 00 00 00 00 |..>.....@Q......| 539 | // 00000020 40 00 00 00 00 00 00 00 c8 c1 46 01 00 00 00 00 |@.........F.....| 540 | // 00000030 00 00 00 00 40 00 38 00 0c 00 40 00 2c 00 2b 00 |....@.8...@.,.+.| 541 | 542 | // First line in the following output is a bug. 543 | // ./stringsext -e utf-8 -t X -q 16 ../debug/stringsext 544 | 545 | // 30 `+` 546 | // 2e0 `/lib64/ld-linux-` 547 | // 2f0+ `x86-64.so.2` 548 | // 353 `B1(M` 549 | 550 | // We test UTF-8 as input encoding. 551 | let m: &'static Mission = &MISSION_REAL_DATA_SCAN; 552 | let mut ss = ScannerState::new(m); 553 | 554 | let input = b"\x00\x00\x00\x00\x40\x00\x38\x00\x0c\x00\x40\x00\x2c\x00\x2b\x00"; 555 | let fc = FindingCollection::from(&mut ss, Some(0), input, false); 556 | // Test that the bug is gone 557 | assert_ne!(fc.v.len(), 1); 558 | //assert_ne!(fc.v[0].s, "+"); 559 | } 560 | } 561 | -------------------------------------------------------------------------------- /src/finding_collection.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::assertions_on_constants)] 2 | extern crate encoding_rs; 3 | 4 | use crate::as_mut_str_unchecked_no_borrow_check; 5 | use crate::as_str_unchecked_no_borrow_check; 6 | use crate::finding::Finding; 7 | use crate::finding::Precision; 8 | use crate::finding::OUTPUT_BUF_LEN; 9 | use crate::helper::starts_with_multibyte_char; 10 | use crate::helper::SplitStr; 11 | use crate::input::ByteCounter; 12 | use crate::input::INPUT_BUF_LEN; 13 | use crate::scanner::ScannerState; 14 | use encoding_rs::DecoderResult; 15 | use std::io::Write; 16 | use std::marker::PhantomPinned; 17 | use std::ops::Deref; 18 | use std::pin::Pin; 19 | use std::slice; 20 | use std::str; 21 | 22 | /// `FindingCollection` is a set of ordered `Finding` s. 23 | /// The box `output_buffer_bytes` and the struct `Finding` are self-referential, 24 | /// because `Finding.s` points into `output_buffer_bytes`. Therefore, special 25 | /// care is taken that, `output_buffer_bytes` is protected from being moved in 26 | /// memory: 27 | /// 1. `output_buffer_bytes` is private. 28 | /// 2. The returned `FindingCollection` is wrapped in a 29 | /// `Pin>>`. 30 | #[derive(Debug)] 31 | pub struct FindingCollection<'a> { 32 | /// `Finding` s in this vector are in chronological order. 33 | pub v: Vec>, 34 | /// All concurrent `ScannerState::scan()` start at the same byte. All 35 | /// `Finding.position` refer to `first_byte_position` as zero. 36 | pub first_byte_position: ByteCounter, 37 | /// A buffer containing the UTF-8 representation of all findings during one 38 | /// `Self::from()` run. First, the `Decoder` fills in some UTF-8 39 | /// string. This string is then filtered. The result of this filtering is 40 | /// a collection of `Finding`-objects stored in a `FindingCollection`. The 41 | /// `Finding`-objects have a `&str`-member called `Finding.s` that is 42 | /// a substring (slice) of `output_buffer_bytes`. 
43 | output_buffer_bytes: Box<[u8]>, 44 | /// If `output_buffer` is too small to receive all findings, this is set 45 | /// `true` indicating that only the last `Finding` s could be stored. At 46 | /// least one `Finding` got lost. This incident is reported to the user. If 47 | /// ever this happens, the `OUTPUT_BUF_LEN` was not chosen big enough. 48 | pub str_buf_overflow: bool, 49 | _marker: PhantomPinned, 50 | } 51 | impl FindingCollection<'_> { 52 | pub fn new(byte_offset: ByteCounter) -> Self { 53 | // This buffer lives on the heap. let mut output_buffer_bytes = 54 | // Box::new([0u8; OUTPUT_BUF_LEN]); 55 | let output_buffer_bytes = Box::new([0u8; OUTPUT_BUF_LEN]); 56 | FindingCollection { 57 | v: Vec::new(), 58 | first_byte_position: byte_offset, 59 | output_buffer_bytes, 60 | str_buf_overflow: false, 61 | _marker: PhantomPinned, 62 | } 63 | } 64 | 65 | /// First, scans for valid encoded strings in `input_buffer, then decodes them ` 66 | /// using `ss.decoder` to UTF-8 and writes the results as UTF-8 in 67 | /// `fc.output_buffer_bytes`. Finally some filter is applied to the found strings 68 | /// retaining only those who satisfy the filter criteria.\ 69 | /// 70 | /// * The input of this function is `input_buffer`. 71 | /// * The output of this function is the returned `FindingCollection`. 72 | /// 73 | /// The input parameter `input_file_id` is forwarded and stored in each `Finding` 74 | /// of the returned `FindingCollection`.\ 75 | /// The function keeps its inner state in 76 | /// `ss.decoder`, `ss.last_scan_run_leftover`, 77 | /// `ss.last_run_str_was_printed_and_is_maybe_cut_str` and `ss.consumed_bytes`.\ 78 | /// `ss.mission` is not directly used in this function, but some part of it, the 79 | /// `ss.mission.filter`, is forwarded to the helper function: 80 | /// `helper::SplitStr::next()`.\ 81 | /// In case this is the last `input_buffer` of the stream, `last` must be set 82 | /// to correctly flush the `ss.decoder`. 83 | 84 | pub fn from<'a>( 85 | ss: &mut ScannerState, 86 | input_file_id: Option, 87 | input_buffer: &[u8], 88 | is_last_input_buffer: bool, 89 | ) -> Pin>> { 90 | let mut fc = FindingCollection::new(ss.consumed_bytes); 91 | // We do not clear `output_buffer_bytes`, we just overwrite. 92 | 93 | // Initialisation 94 | let mut extra_round = false; 95 | let mut decoder_input_start = 0usize; 96 | let mut decoder_input_end; 97 | let mut decoder_output_start = 0usize; 98 | 99 | // Copy `ScannerState` in `last_window...` 100 | // Copy last run leftover bytes at the beginning of `output_buffer`. 101 | let mut last_window_leftover_len = 0usize; 102 | if !ss.last_scan_run_leftover.is_empty() { 103 | // We don't need to copy here, we just rewind temporarily 104 | // `decoder_output_start` to `ss.last_scan_run_leftover`. 105 | fc.output_buffer_bytes 106 | // Make the same space. 107 | [decoder_output_start..decoder_output_start + ss.last_scan_run_leftover.len()] 108 | .copy_from_slice(ss.last_scan_run_leftover.as_bytes()); 109 | // Remember for later use. 110 | last_window_leftover_len = ss.last_scan_run_leftover.len(); 111 | ss.last_scan_run_leftover.clear(); 112 | // Make the decoder write behind the insertion. 113 | decoder_output_start += last_window_leftover_len; 114 | } 115 | let mut last_window_str_was_printed_and_is_maybe_cut_str = 116 | ss.last_run_str_was_printed_and_is_maybe_cut_str; 117 | 118 | // In many encodings (e.g. UTF16), to fill one `output_line` we need more bytes of input. 
119 | // If ever the string gets longer than `output_line_char_nb_max`, `SplitStr` will wrap the line. 120 | let decoder_input_window = 2 * ss.mission.output_line_char_nb_max; 121 | let mut is_last_window = false; 122 | 123 | // iterate over `input_buffer with ``decoder_input_window`-sized slices. 124 | '_input_window_loop: while decoder_input_start < input_buffer.len() { 125 | decoder_input_end = match decoder_input_start + decoder_input_window { 126 | n if n < input_buffer.len() => n, // There are at least one byte more left in `input_buffer`. 127 | _ => { 128 | is_last_window = true; 129 | input_buffer.len() 130 | } 131 | }; 132 | 133 | // Decode one `input_window`, go as far as you can, then loop again. 134 | 'decoder: loop { 135 | let output_buffer_slice: &mut str = as_mut_str_unchecked_no_borrow_check!( 136 | &mut fc.output_buffer_bytes[decoder_output_start..] 137 | ); 138 | let (decoder_result, decoder_read, decoder_written) = 139 | ss.decoder.decode_to_str_without_replacement( 140 | &input_buffer[decoder_input_start..decoder_input_end], 141 | output_buffer_slice, 142 | extra_round, 143 | ); 144 | 145 | // If the assumption is wrong we change later. 146 | let mut position_precision = Precision::Exact; 147 | 148 | // Regardless of whether the intermediate buffer got full 149 | // or the input buffer was exhausted, let's process what's 150 | // in the intermediate buffer. 151 | 152 | // The target encoding is always UTF-8. 153 | if decoder_written > 0 { 154 | // With the following `if`, we check if the previous scan has 155 | // potentially left some remaining bytes in the Decoder's inner 156 | // state. This is a complicated corner case, because the inner 157 | // state of the `encoding_rs` decoder is private and there is 158 | // yet not method to query if the decoder is in a neutral state. 159 | // Read the related Issue [Enhancement: get read access to the 160 | // decoder's inner state · Issue #48 · 161 | // hsivonen/encoding_rs](https://github.com/hsivonen/encoding_rs/issues/48) 162 | // 163 | // As a workaround, we first check if this is the first round 164 | // (`decoder_input_start == 0`). Seeing, that we only know the 165 | // `ByteCounter` precisely at that point and that all other 166 | // round's findings will be tagged `Precision::After` anyway, 167 | // there is no need to investigate further in these cases. 168 | // 169 | // We can reduce the cases of double decoding by checking if the 170 | // first decoded character is a multi-byte UTF-8. If yes, this 171 | // means (in most cases), that no bytes had been stored in the 172 | // decoder's inner state and therefore we can assume that the 173 | // first character was found exactly at `decoder_input_start`. 174 | // If so, we can then tag this string-finding with 175 | // `Precision::exact`. 176 | if decoder_input_start == 0 && starts_with_multibyte_char(output_buffer_slice) { 177 | // The only way to find out from which scan() run the first 178 | // bytes came, is to scan again with a new Decoder and compare 179 | // the results. 180 | let mut empty_decoder = 181 | ss.decoder.encoding().new_decoder_without_bom_handling(); 182 | // A short buffer on the stack will do. 183 | let mut buffer_bytes = [0u8; 8]; 184 | // This is save, because there are only valid 0 in 185 | // `buffer_bytes`. 186 | let buffer: &mut str = 187 | as_mut_str_unchecked_no_borrow_check!(buffer_bytes[..]); 188 | // Alternative code, but slower. 
let tmp_buffer: &mut str = 189 | // std::str::from_utf8_mut(&mut buffer_bytes[..]).unwrap(); 190 | let (_, _, written) = empty_decoder.decode_to_str_without_replacement( 191 | input_buffer, 192 | &mut *buffer, 193 | true, 194 | ); 195 | // When the result of the two decoders is not the same, as the 196 | // bytes originating from the previous run, we know the extra 197 | // bytes come from the previous run. Unfortunately there is no 198 | // way to determine how many the decoder had internally stored. 199 | // I can be one, two, or three. We only know that the multibyte 200 | // sequence started some byte before 0. 201 | 202 | if (written == 0) 203 | || (fc.output_buffer_bytes[0..written] != buffer_bytes[0..written]) 204 | { 205 | position_precision = Precision::Before; 206 | } 207 | } 208 | } 209 | 210 | // Prepare input for `SplitStr` 211 | let mut split_str_start = decoder_output_start; 212 | let split_str_end = decoder_output_start + decoder_written; 213 | // Enlarge window to the left, to cover not treated bytes again. 214 | if last_window_leftover_len > 0 { 215 | // Go some bytes to the left. 216 | split_str_start -= last_window_leftover_len; 217 | // We use it only once. 218 | last_window_leftover_len = 0; 219 | // We lose precision. 220 | position_precision = Precision::Before; 221 | }; 222 | 223 | // This is safe because the decoder guarantees us to return only valid UTF-8. 224 | // We need unsafe code here because the buffer is still borrowed mutably by decoder. 225 | let split_str_buffer = as_str_unchecked_no_borrow_check!( 226 | fc.output_buffer_bytes[split_str_start..split_str_end] 227 | ); 228 | 229 | // Another way of saying (decoder_result == DecoderResult::Malformed) || 230 | // (is_last_window ...): 231 | // This can only be `false`, when `split_str_buffer` touches the right boundary (end) 232 | // of an `input_window`. Normally it `true` because we usually stop at 233 | // `DecoderResult::Malformed`. 234 | let invalid_bytes_after_split_str_buffer = (decoder_result 235 | != DecoderResult::InputEmpty 236 | && decoder_result != DecoderResult::OutputFull) 237 | || (is_last_window && is_last_input_buffer); 238 | 239 | // Use it only once. 240 | let continue_str_if_possible = last_window_str_was_printed_and_is_maybe_cut_str; 241 | last_window_str_was_printed_and_is_maybe_cut_str = false; 242 | 243 | // Now we split `split_str_buffer` into substrings and store them in 244 | // vector `fc.v`. 245 | 246 | '_chunk_loop: for chunk in SplitStr::new( 247 | split_str_buffer, 248 | ss.mission.chars_min_nb, 249 | ss.mission.require_same_unicode_block, 250 | continue_str_if_possible, 251 | invalid_bytes_after_split_str_buffer, 252 | ss.mission.filter, 253 | ss.mission.output_line_char_nb_max, 254 | ) { 255 | if !chunk.s_is_to_be_filtered_again { 256 | // We keep it for printing. 257 | fc.v.push(Finding { 258 | input_file_id, 259 | mission: ss.mission, 260 | position: ss.consumed_bytes + decoder_input_start as ByteCounter, 261 | position_precision, 262 | s: chunk.s, 263 | s_completes_previous_s: chunk.s_completes_previous_s, 264 | }); 265 | 266 | last_window_leftover_len = 0; 267 | 268 | last_window_str_was_printed_and_is_maybe_cut_str = chunk.s_is_maybe_cut; 269 | } else { 270 | // `chunk.s_is_to_be_filtered_again` 271 | 272 | // This chunk will be inserted at the beginning 273 | // of the `output_buffer_bytes` and we do not print it 274 | // now. As we will see it (completed to its full 275 | // length) again, we can decide later what to do with 276 | // it. 
277 | 278 | // As we exactly know where `chunk.s` is located in 279 | // `ss.output_buffer_bytes`, it is enough to remember 280 | // its length. 281 | last_window_leftover_len = chunk.s.len(); 282 | // As the chunk is not printed now, so we set this 283 | // to `false`: 284 | last_window_str_was_printed_and_is_maybe_cut_str = false; 285 | } 286 | 287 | // For all other following `SplitStr` we set this, 288 | // since we do not know their exact position. 289 | position_precision = Precision::After; 290 | } 291 | 292 | decoder_output_start += decoder_written; 293 | 294 | decoder_input_start += decoder_read; 295 | 296 | // Now let's see if we should read again or process the 297 | // rest of the current input buffer. 298 | match decoder_result { 299 | DecoderResult::InputEmpty => { 300 | if is_last_window && is_last_input_buffer && !extra_round { 301 | extra_round = true; 302 | } else { 303 | break 'decoder; 304 | } 305 | } 306 | DecoderResult::OutputFull => { 307 | // This should never happen. If ever it does we clear 308 | // the FindingCollection to make more space and 309 | // forget all previous findings. 310 | fc.clear_and_mark_incomplete(); 311 | eprintln!("Buffer overflow. Output buffer is too small to receive all decoder data.\ 312 | Some findings got lost in input {:x}..{:x} from file {:?} for scanner ({})!", 313 | ss.consumed_bytes, 314 | ss.consumed_bytes + decoder_input_start as ByteCounter, 315 | input_file_id, 316 | char::from(ss.mission.mission_id + 97) 317 | ); 318 | decoder_output_start = 0; 319 | debug_assert!( 320 | true, 321 | "Buffer overflow. Output buffer is too small to receive all decoder data." 322 | ); 323 | } 324 | DecoderResult::Malformed(_, _) => {} 325 | }; 326 | } 327 | } 328 | 329 | // Store possible leftovers in `ScannerState` for next `scanner::scan()`. 330 | let last_window_leftover = as_str_unchecked_no_borrow_check!( 331 | fc.output_buffer_bytes 332 | [decoder_output_start - last_window_leftover_len..decoder_output_start] 333 | ); 334 | // Update inner state for next `scan()` run. 335 | ss.last_scan_run_leftover = String::from(last_window_leftover); 336 | ss.last_run_str_was_printed_and_is_maybe_cut_str = 337 | last_window_str_was_printed_and_is_maybe_cut_str; 338 | ss.consumed_bytes += decoder_input_start as ByteCounter; 339 | 340 | // Now we pin the `FindingCollection`. 341 | Box::pin(fc) 342 | } 343 | 344 | /// Clears the buffer to make more space after buffer overflow. Tag the 345 | /// collection as overflowed. 346 | pub fn clear_and_mark_incomplete(&mut self) { 347 | self.v.clear(); 348 | self.str_buf_overflow = true; 349 | } 350 | 351 | /// This method formats and dumps a `FindingCollection` to the output 352 | /// channel, usually `stdout`. 353 | #[allow(dead_code)] 354 | pub fn print(&self, out: &mut dyn Write) -> Result<(), Box> { 355 | if self.str_buf_overflow { 356 | eprint!("Warning: output buffer overflow! Some findings might got lost."); 357 | eprintln!( 358 | "in input chunk 0x{:x}-0x{:x}.", 359 | self.first_byte_position, 360 | self.first_byte_position + INPUT_BUF_LEN as ByteCounter 361 | ); 362 | } 363 | for finding in &self.v { 364 | finding.print(out)?; 365 | } 366 | Ok(()) 367 | } 368 | } 369 | 370 | /// This allows us to create an iterator from a `FindingCollection`. 
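/// A minimal usage sketch (illustrative only; `fc` is assumed to be the
/// pinned collection returned by `FindingCollection::from()`):
///
/// ```text
/// for finding in &fc {
///     // Each item is a `&Finding` whose `s` borrows from the collection's
///     // `output_buffer_bytes`.
///     println!("0x{:x}\t{}", finding.position, finding.s);
/// }
/// ```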
371 | impl<'a> IntoIterator for &'a Pin>> { 372 | type Item = &'a Finding<'a>; 373 | type IntoIter = FindingCollectionIterator<'a>; 374 | 375 | fn into_iter(self) -> Self::IntoIter { 376 | FindingCollectionIterator { fc: self, index: 0 } 377 | } 378 | } 379 | 380 | /// This allows iterating over `Finding`-objects in a `FindingCollection::v`. 381 | /// The state of this iterator must hold the whole `FindingCollection` and not 382 | /// only `FindingCollection::v`! This is required because `next()` produces a 383 | /// link to `Finding`, whose member `Finding::s` is a `&str`. The content of this 384 | /// `&str` is part of `FindingCollection::output_buffer_bytes`, thus the need for 385 | /// the whole object `FindingCollection`. 386 | 387 | pub struct FindingCollectionIterator<'a> { 388 | fc: &'a FindingCollection<'a>, 389 | index: usize, 390 | } 391 | 392 | /// This allows us to iterate over `FindingCollection`. It is needed 393 | /// by `kmerge()`. 394 | impl<'a> Iterator for FindingCollectionIterator<'a> { 395 | type Item = &'a Finding<'a>; 396 | fn next(&mut self) -> Option<&'a Finding<'a>> { 397 | let result = if self.index < self.fc.v.len() { 398 | Some(&self.fc.v[self.index]) 399 | } else { 400 | None 401 | }; 402 | self.index += 1; 403 | result 404 | } 405 | } 406 | 407 | /// We consider the "content" of a `FindingCollection` 408 | /// to be `FindingCollection::v` which is a `Vec`. 409 | impl<'a> Deref for FindingCollection<'a> { 410 | type Target = Vec>; 411 | 412 | fn deref(&self) -> &Self::Target { 413 | &self.v 414 | } 415 | } 416 | 417 | #[cfg(test)] 418 | mod tests { 419 | use super::*; 420 | use crate::finding::Precision; 421 | use crate::finding_collection::FindingCollection; 422 | use crate::mission::Mission; 423 | use crate::scanner::tests::MISSION_ALL_X_USER_DEFINED; 424 | use crate::scanner::tests::MISSION_ASCII; 425 | use std::str; 426 | 427 | // To see println!() output in test run, launch 428 | // cargo test -- --nocapture 429 | 430 | #[test] 431 | fn test_ascii_emulation() { 432 | let m: &'static Mission = &MISSION_ALL_X_USER_DEFINED; 433 | 434 | let mut ss = ScannerState::new(m); 435 | 436 | let input = b"abcdefg\x58\x59\x80\x82h\x83ijk\x89\x90"; 437 | 438 | let fc = FindingCollection::from(&mut ss, Some(0), input, true); 439 | 440 | //println!("{:#?}", fc.v); 441 | 442 | assert_eq!(fc.first_byte_position, 10_000); 443 | assert!(!fc.str_buf_overflow); 444 | assert_eq!(fc.v.len(), 2); 445 | 446 | assert_eq!(fc.v[0].position, 10_000); 447 | assert_eq!(fc.v[0].position_precision, Precision::Exact); 448 | assert_eq!(fc.v[0].s, "abcdefgXY\u{f780}"); 449 | // Next output line. 450 | 451 | assert_eq!(fc.v[1].position, 10_000); 452 | assert_eq!(fc.v[1].position_precision, Precision::After); 453 | assert_eq!(fc.v[1].s, "\u{f782}h\u{f783}ijk\u{f789}\u{f790}"); 454 | 455 | assert_eq!( 456 | // We only compare the first 35 bytes, the others are 0 anyway. 457 | unsafe { str::from_utf8_unchecked(&fc.output_buffer_bytes[..35]) }, 458 | "abcdefg\u{58}\u{59}\u{f780}\u{f782}h\u{f783}ijk\u{f789}\u{f790}\ 459 | \u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}" 460 | ); 461 | 462 | assert_eq!(ss.consumed_bytes, 10000 + 18); 463 | // false, because we told the `FindingCollection::scan()` this is the last run. 464 | assert!(!ss.last_run_str_was_printed_and_is_maybe_cut_str); 465 | assert_eq!(ss.last_scan_run_leftover, ""); 466 | 467 | // Second run. 
468 | 469 | let m: &'static Mission = &MISSION_ASCII; 470 | 471 | let mut ss = ScannerState::new(m); 472 | 473 | let input = b"abcdefg\x58\x59\x80\x82h\x83ijk\x89\x90"; 474 | 475 | let fc = FindingCollection::from(&mut ss, Some(0), input, false); 476 | 477 | //println!("{:#?}", fc.v); 478 | 479 | assert_eq!(fc.v.len(), 2); 480 | assert_eq!(fc.first_byte_position, 10000); 481 | assert!(!fc.str_buf_overflow); 482 | 483 | assert_eq!(fc.v[0].position, 10_000); 484 | assert_eq!(fc.v[0].position_precision, Precision::Exact); 485 | assert_eq!(fc.v[0].s, "abcdefgXY"); 486 | // Next output line. 487 | 488 | assert_eq!(fc.v[1].position, 10_000); 489 | assert_eq!(fc.v[1].position_precision, Precision::After); 490 | // Note that `h` is gone. 491 | assert_eq!(fc.v[1].s, "ijk"); 492 | 493 | assert_eq!( 494 | // We only compare the first 35 bytes, the others are 0 anyway. 495 | unsafe { str::from_utf8_unchecked(&fc.output_buffer_bytes[..35]) }, 496 | "abcdefg\u{58}\u{59}\u{f780}\u{f782}h\u{f783}ijk\u{f789}\u{f790}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}\u{0}" 497 | ); 498 | 499 | assert_eq!(ss.consumed_bytes, 10000 + 18); 500 | assert!(!ss.last_run_str_was_printed_and_is_maybe_cut_str); 501 | assert_eq!(ss.last_scan_run_leftover, ""); 502 | } 503 | } 504 | -------------------------------------------------------------------------------- /src/mission.rs: -------------------------------------------------------------------------------- 1 | //! Parse and convert command-line-arguments into static `MISSION` structures, 2 | //! that are mainly used to initialize `ScannerState`-objects. 3 | 4 | #![allow(clippy::too_many_arguments)] 5 | #![allow(clippy::type_complexity)] 6 | 7 | extern crate anyhow; 8 | extern crate encoding_rs; 9 | use crate::input::ByteCounter; 10 | use crate::options::ARGS; 11 | use crate::options::ASCII_ENC_LABEL; 12 | use crate::options::CHARS_MIN_DEFAULT; 13 | use crate::options::COUNTER_OFFSET_DEFAULT; 14 | use crate::options::ENCODING_DEFAULT; 15 | use crate::options::OUTPUT_LINE_CHAR_NB_MAX_DEFAULT; 16 | use crate::options::OUTPUT_LINE_CHAR_NB_MIN; 17 | use anyhow::{anyhow, Context, Result}; 18 | use encoding_rs::*; 19 | use lazy_static::lazy_static; 20 | use std::cmp; 21 | use std::cmp::{Eq, Ord}; 22 | use std::fmt; 23 | use std::ops::Deref; 24 | use std::process; 25 | use std::str; 26 | use std::str::FromStr; 27 | 28 | /// A filter for ASCII encoding searches only. No control character pass, but 29 | /// whitespace is allowed. This works like the traditional `stringsext`mode. 30 | /// Unless otherwise specified on the command line, his filter is default for 31 | /// ASCII-encoding searches. 32 | pub const UTF8_FILTER_ASCII_MODE_DEFAULT: Utf8Filter = Utf8Filter { 33 | af: AF_ALL & !AF_CTRL, 34 | ubf: UBF_NONE, 35 | grep_char: None, 36 | }; 37 | 38 | /// A default filter for all non-ASCII encoding searches. 39 | /// For single-byte-characters (`af`-filter), no control character 40 | /// pass, but whitespace is allowed. This works like the traditional 41 | /// `stringsext`mode. 42 | /// For multi-byte-characters we allow only Latin characters 43 | /// with all kind of accents. 44 | /// Unless otherwise specified on the command line, this filter 45 | /// is default for non-ASCII-encoding searches. 46 | pub const UTF8_FILTER_NON_ASCII_MODE_DEFAULT: Utf8Filter = Utf8Filter { 47 | af: AF_ALL & !AF_CTRL, 48 | ubf: UBF_COMMON, 49 | grep_char: None, 50 | }; 51 | 52 | /// A filter that let pass all valid Unicode codepoints. 53 | /// Useful for debugging. 
54 | #[cfg(test)] 55 | pub const UTF8_FILTER_ALL_VALID: Utf8Filter = Utf8Filter { 56 | af: AF_ALL, 57 | ubf: UBF_ALL & !UBF_INVALID, 58 | grep_char: None, 59 | }; 60 | 61 | /// A filter for Latin and accents. 62 | /// Useful for debugging. 63 | #[cfg(test)] 64 | pub const UTF8_FILTER_LATIN: Utf8Filter = Utf8Filter { 65 | af: AF_ALL & !AF_CTRL | AF_WHITESPACE, 66 | ubf: UBF_LATIN | UBF_ACCENTS, 67 | grep_char: None, 68 | }; 69 | /// Unicode-block-filter: 70 | /// No leading bytes are filtered. 71 | #[allow(dead_code)] 72 | pub const UBF_ALL_VALID: u64 = UBF_ALL & !UBF_INVALID; 73 | /// Unicode-block-filter: 74 | /// A filter that let pass all valid Unicode codepoints, except for ASCII where 75 | /// it behaves like the original `strings`. No leading bytes are filtered. 76 | #[allow(dead_code)] 77 | pub const UBF_ALL: u64 = 0xffff_ffff_ffff_ffff; 78 | /// Unicode-block-filter: 79 | /// No leading byte > 0x7F is accepted. 80 | /// Therefor no multi-byte-characters in UTF-8, which means 81 | /// this is an ASCII-filter. 82 | #[allow(dead_code)] 83 | pub const UBF_NONE: u64 = 0x0000_0000_0000_0000; 84 | /// Unicode-block-filter: 85 | /// These leading bytes are alway invalid in UTF-8 86 | #[allow(dead_code)] 87 | pub const UBF_INVALID: u64 = 0xffe0_0000_0000_0003; 88 | /// Unicode-block-filter: 89 | /// Latin: (U+80..U+240). 90 | /// Usually used together with `UBF_ACCENTS`. 91 | #[allow(dead_code)] 92 | pub const UBF_LATIN: u64 = 0x0000_0000_0000_01fc; 93 | /// Unicode-block-filter: 94 | /// Accents: (U+300..U+380). 95 | #[allow(dead_code)] 96 | pub const UBF_ACCENTS: u64 = 0x0000_0000_0000_3000; 97 | /// Unicode-block-filter: 98 | /// Greek: (U+380..U+400). 99 | #[allow(dead_code)] 100 | pub const UBF_GREEK: u64 = 0x0000_0000_0000_C000; 101 | /// Unicode-block-filter: 102 | /// IPA: (U+240..U+300). 103 | #[allow(dead_code)] 104 | pub const UBF_IPA: u64 = 0x0000_0000_0000_0700; 105 | /// Unicode-block-filter: 106 | /// Cyrillic: (U+400..U+540) 107 | #[allow(dead_code)] 108 | pub const UBF_CYRILLIC: u64 = 0x0000_0000_001f_0000; 109 | /// Unicode-block-filter: 110 | /// Armenian: (U+540..U+580) 111 | #[allow(dead_code)] 112 | pub const UBF_ARMENIAN: u64 = 0x0000_0000_0020_0000; 113 | /// Unicode-block-filter: 114 | /// Hebrew: (U+580..U+600) 115 | #[allow(dead_code)] 116 | pub const UBF_HEBREW: u64 = 0x0000_0000_00c0_0000; 117 | /// Unicode-block-filter: 118 | /// Arabic: (U+600..U+700, U+740..U+780) 119 | #[allow(dead_code)] 120 | pub const UBF_ARABIC: u64 = 0x0000_0000_2f00_0000; 121 | /// Unicode-block-filter: 122 | /// Syriac: (U+700..U+740) 123 | #[allow(dead_code)] 124 | pub const UBF_SYRIAC: u64 = 0x0000_0000_1000_0000; 125 | /// Unicode-block-filter: 126 | /// Armenian: (U+0540..), Hebrew: (U+0580..), Arabic: (U+0600..), 127 | /// Syriac: (U+0700..), Arabic: (U+0740..), Thaana: (U+0780..), N'Ko: (U+07C0..U+800) 128 | #[allow(dead_code)] 129 | pub const UBF_AFRICAN: u64 = 0x0000_0000_ffe0_0000; 130 | /// Unicode-block-filter: 131 | /// All 2-byte UFT-8 (U+07C0..U+800) 132 | /// #[allow(dead_code)] 133 | pub const UBF_COMMON: u64 = 0x0000_0000_ffff_fffc; 134 | /// Unicode-block-filter: 135 | /// Kana: (U+3000..U+4000). 136 | #[allow(dead_code)] 137 | pub const UBF_KANA: u64 = 0x0000_0008_0000_0000; 138 | /// Unicode-block-filter: 139 | /// CJK: (U+3000..A000). 140 | #[allow(dead_code)] 141 | pub const UBF_CJK: u64 = 0x0000_03f0_0000_0000; 142 | /// Unicode-block-filter: 143 | /// Hangul: (U+B000..E000). 
144 | #[allow(dead_code)] 145 | pub const UBF_HANGUL: u64 = 0x0000_3800_0000_0000; 146 | /// Unicode-block-filter: 147 | /// Kana: (U+3000..), CJK: (U+4000..), Asian: (U+A000..), Hangul: (U+B000..U+E000). 148 | #[allow(dead_code)] 149 | pub const UBF_ASIAN: u64 = 0x0000_3ffc_0000_0000; 150 | /// Unicode-block-filter: 151 | /// Private use area (U+E00..F00), (U+10_0000..U+14_0000). 152 | #[allow(dead_code)] 153 | pub const UBF_PUA: u64 = 0x0010_4000_0000_0000; 154 | /// Unicode-block-filter: 155 | /// Misc: (U+1000..), Symbol:(U+2000..U+3000), Forms:(U+F000..U+10000). 156 | #[allow(dead_code)] 157 | pub const UBF_MISC: u64 = 0x0000_8006_0000_0000; 158 | /// Unicode-block-filter: 159 | /// Besides PUA, more very uncommon planes: (U+10_000-U+C0_000). 160 | #[allow(dead_code)] 161 | pub const UBF_UNCOMMON: u64 = 0x000f_0000_0000_0000; 162 | 163 | /// Shortcuts for the hexadecimal representation of a unicode block filter. 164 | /// The array is defined as `(key, value)` tuples. 165 | /// For value see chapter *Codepage layout* in 166 | /// [UTF-8 - Wikipedia](https://en.wikipedia.org/wiki/UTF-8) 167 | pub const UNICODE_BLOCK_FILTER_ALIASSE: [([u8; 12], u64, [u8; 25]); 18] = [ 168 | (*b"African ", UBF_AFRICAN, *b"all in U+540..U+800 "), 169 | ( 170 | *b"All-Asian ", 171 | UBF_ALL & !UBF_INVALID & !UBF_ASIAN, 172 | *b"all, except Asian ", 173 | ), 174 | ( 175 | *b"All ", 176 | UBF_ALL & !UBF_INVALID, 177 | *b"all valid multibyte UTF-8", 178 | ), 179 | ( 180 | *b"Arabic ", 181 | UBF_ARABIC | UBF_SYRIAC, 182 | *b"Arabic+Syriac ", 183 | ), 184 | ( 185 | *b"Armenian ", 186 | UBF_ARMENIAN, 187 | *b"Armenian ", 188 | ), 189 | (*b"Asian ", UBF_ASIAN, *b"all in U+3000..U+E000 "), 190 | (*b"Cjk ", UBF_CJK, *b"CJK: U+4000..U+A000 "), 191 | (*b"Common ", UBF_COMMON, *b"all 2-byte-UFT-8 "), 192 | ( 193 | *b"Cyrillic ", 194 | UBF_CYRILLIC, 195 | *b"Cyrillic ", 196 | ), 197 | ( 198 | *b"Default ", 199 | UBF_ALL & !UBF_INVALID, 200 | *b"all valid multibyte UTF-8", 201 | ), 202 | (*b"Greek ", UBF_GREEK, *b"Greek "), 203 | (*b"Hangul ", UBF_HANGUL, *b"Hangul: U+B000..U+E000 "), 204 | (*b"Hebrew ", UBF_HEBREW, *b"Hebrew "), 205 | (*b"Kana ", UBF_KANA, *b"Kana: U+3000..U+4000 "), 206 | ( 207 | *b"Latin ", 208 | UBF_LATIN | UBF_ACCENTS, 209 | *b"Latin + accents ", 210 | ), 211 | (*b"None ", !UBF_ALL, *b"block all multibyte UTF-8"), 212 | (*b"Private ", UBF_PUA, *b"private use areas "), 213 | ( 214 | *b"Uncommon ", 215 | UBF_UNCOMMON | UBF_PUA, 216 | *b"private + all>=U+10_000 ", 217 | ), 218 | ]; 219 | 220 | /// ASCII filter: 221 | /// Let all ASCII pass the filter (0x01..0x100) 222 | /// except Null (0x00) which is "end of string" marker. 223 | /// [Null character - Wikipedia](https://en.wikipedia.org/wiki/Null_character) 224 | #[allow(dead_code)] 225 | pub const AF_ALL: u128 = 0xffff_ffff_ffff_ffff_ffff_ffff_ffff_fffe; 226 | 227 | /// ASCII filter: 228 | /// Nothing passes ASCII pass filter 229 | #[allow(dead_code)] 230 | pub const AF_NONE: u128 = 0x0000_0000_0000_0000_0000_0000_0000_0000; 231 | 232 | /// ASCII filter: 233 | /// Controls: (0x00..0x20, 0x7F) 234 | /// [C0 and C1 control codes - Wikipedia]() 235 | /// Unlike traditional `strings` we exclude "Space" (0x20) here, as 236 | /// it can appear in filenames. Instead, we consider "Space" to be 237 | /// a regular character. 
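/// For illustration (a sketch, not an additional API): with the combined
/// filter value `AF_ALL & !AF_CTRL`, the bytes 0x41 (`A`) and 0x20 (space)
/// pass `Utf8Filter::pass_af_filter()`, whereas 0x0a (line feed) and 0x7f
/// (DEL) are rejected, because their bit positions are cleared by this mask:
///
/// ```text
/// let af = AF_ALL & !AF_CTRL;
/// assert!(1u128 << 0x41 & af != 0); // 'A' passes
/// assert!(1u128 << 0x20 & af != 0); // space passes
/// assert!(1u128 << 0x0a & af == 0); // line feed is filtered out
/// assert!(1u128 << 0x7f & af == 0); // DEL is filtered out
/// ```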
238 | #[allow(dead_code)] 239 | pub const AF_CTRL: u128 = 0x8000_0000_0000_0000_0000_0000_ffff_ffff; 240 | 241 | /// ASCII filter: 242 | /// White-space 243 | /// (0x09..=0x0c, 0x20) 244 | /// [C0 and C1 control codes - Wikipedia]() 245 | /// It do not include "Carriage Return" (0x0d) here. This way strings are 246 | /// divided into shorter chunks and we get more location information. 247 | #[allow(dead_code)] 248 | pub const AF_WHITESPACE: u128 = 0x0000_0000_0000_0000_0000_0001_0000_1e00; 249 | 250 | /// ASCII filter: 251 | /// Set defaults close to those in traditional `strings`. 252 | #[allow(dead_code)] 253 | pub const AF_DEFAULT: u128 = AF_ALL & !AF_CTRL; 254 | 255 | pub const ASCII_FILTER_ALIASSE: [([u8; 12], u128, [u8; 25]); 6] = [ 256 | (*b"All ", AF_ALL, *b"all ASCII = pass all "), 257 | ( 258 | *b"All-Ctrl ", 259 | AF_ALL & !AF_CTRL, 260 | *b"all-control ", 261 | ), 262 | ( 263 | *b"All-Ctrl+Wsp", 264 | AF_ALL & !AF_CTRL | AF_WHITESPACE, 265 | *b"all-control+whitespace ", 266 | ), 267 | (*b"Default ", AF_DEFAULT, *b"all-control "), 268 | (*b"None ", AF_NONE, *b"block all 1-byte UTF-8 "), 269 | ( 270 | *b"Wsp ", 271 | AF_WHITESPACE, 272 | *b"only white-space ", 273 | ), 274 | ]; 275 | 276 | lazy_static! { 277 | pub static ref MISSIONS: Missions = Missions::new( 278 | ARGS.counter_offset.as_ref(), 279 | &ARGS.encoding, 280 | ARGS.chars_min.as_ref(), 281 | ARGS.same_unicode_block, 282 | ARGS.ascii_filter.as_ref(), 283 | ARGS.unicode_block_filter.as_ref(), 284 | ARGS.grep_char.as_ref(), 285 | ARGS.output_line_len.as_ref(), 286 | ) 287 | .unwrap_or_else(|error| { 288 | eprintln!("Error while parsing command-line arguments: {:?}", error); 289 | process::exit(1); 290 | }); 291 | } 292 | 293 | /// When the decoder finds a valid Unicode character, it decodes it into UTF-8. 294 | /// The leading byte of this UTF-8 multi-byte-character must then pass an 295 | /// additional filter before being printed: the so called `Utf8Filter`. It comes 296 | /// with three independant filter criteria: 297 | /// 298 | /// 1. The Ascii-Filter `Utf8Filter::asf`, 299 | /// 2. the Unicode-block-filter `Utf8Filter::ubf`, 300 | /// 3. and the `Utf8::must_hame`-filter. 301 | /// 302 | /// The Ascii-Filter `Utf8Filter::asf` and the Unicode-block-filter 303 | /// `Utf8Filter::ubf` are implemented by the `Utf8Filter::pass_filter()` 304 | /// function. The `Utf8::grep_char`-filter is implemented by the 305 | /// `helper::SplitStr::next()` iterator function. 306 | 307 | #[derive(Eq, PartialEq, Copy, Clone)] 308 | pub struct Utf8Filter { 309 | /// Every bit `0..=127` of the `Utf8Filer::af` filter parameter maps to one 310 | /// ASCII-code-position `0x00..=0x7F` that is checked by `pass_filter()` 311 | /// against the UTF-8 leading byte of the incoming stream. For example if the 312 | /// leading byte's code is 32 and the `Utf8Filter::af` has bit number 32 set, 313 | /// then the character passes the filter. If not, it is rejected. 314 | pub af: u128, 315 | 316 | /// Every bit `0..=63` maps to one leading-byte's code position 317 | /// `0xC0..0xFF`, e.g. bit 0 is set -> all characters with leading byte `0xC0` 318 | /// pass the filter, 319 | /// If bit 1 is set -> all characters with all leading byte `0xC1`, ... 320 | /// pass the filter. Otherwise, the character is rejected. 321 | pub ubf: u64, 322 | 323 | /// If `Some()`, a finding must have at least one leading byte equal to the 324 | /// `grep_char` ASCII code. This is useful when you grep for path-strings: 325 | /// e.g. "0x2f" or "0x5c". 
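/// For example (an illustrative sketch, not a predefined constant), a filter
/// that only reports findings containing at least one `/` (0x2f) could be
/// written as:
///
/// ```text
/// let path_filter = Utf8Filter {
///     af: AF_ALL & !AF_CTRL,
///     ubf: UBF_NONE,
///     grep_char: Some(0x2f), // require a '/' somewhere in the finding
/// };
/// ```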
326 | pub grep_char: Option, 327 | } 328 | 329 | impl Utf8Filter { 330 | /// This function applies the Ascii-Filter `Utf8Filter::asf` to the 331 | /// UTF-8 leading byte `b`. It assumes that `b<=0x7f`! 332 | #[inline] 333 | pub fn pass_af_filter(&self, b: u8) -> bool { 334 | debug_assert!(b & 0x80 == 0x00); 335 | // We treat b values 0-128 here. 336 | 1 << b & self.af != 0 337 | } 338 | /// This function applies the Unicode-Block-Filter `Utf8Filter::ubf` to the 339 | /// UTF-8 leading byte `b`. It assumes that `b>0x7f`! 340 | #[inline] 341 | pub fn pass_ubf_filter(&self, b: u8) -> bool { 342 | debug_assert!(b & 0x80 == 0x80); 343 | // We do not have to check for invalid continuation-bytes here, because we know the 344 | // input is valid UTF-8 and therefor the continuation-byte-codes `0x80..0xBF` can not 345 | // appear here. We treat b values of 192-255 here (128-191 can not occur in leading 346 | // UTF-8 bytes). We first map values 192-255 -> 0-128 with (b & 0x3f) 347 | 1 << (b & 0x3f) & self.ubf != 0 348 | } 349 | } 350 | 351 | impl fmt::Debug for Utf8Filter { 352 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 353 | write!( 354 | f, 355 | "af: 0x{:x}, ubf: 0x{:x}, grep_char: {:?}", 356 | self.af, self.ubf, self.grep_char 357 | ) 358 | } 359 | } 360 | 361 | /// Needed for merging. 362 | impl PartialOrd for Utf8Filter { 363 | fn partial_cmp(&self, other: &Self) -> Option { 364 | Some(self.cmp(other)) 365 | } 366 | } 367 | 368 | /// Needed for merging. 369 | impl Ord for Utf8Filter { 370 | fn cmp(&self, other: &Self) -> cmp::Ordering { 371 | if self.ubf != other.ubf { 372 | self.ubf.cmp(&other.ubf) 373 | } else { 374 | (!self.af).cmp(&!other.af) 375 | } 376 | } 377 | } 378 | 379 | /// `Mission` represents the instruction parameters used mainly in `scanner::scan()`. 380 | /// Each thread gets its own instance and stores it in `ScannerState`. 381 | #[derive(Debug, Clone)] 382 | pub struct Mission { 383 | /// An identifier for this mission. We use its position index in the 384 | /// `Missions.v` vector. 385 | pub mission_id: u8, 386 | 387 | /// Start offset for the input-stream-byte-counter. This is useful in case 388 | /// the input comes split in separate files, that should be analyzed with 389 | /// separate `stringsext` runs. Note: in general it is better to treat all 390 | /// input in one `stringsext` run and provide all split input-files as 391 | /// command-line-parameter for one `stringsext` run. This way `stringsext` 392 | /// can concatenate the split input files and is able to recognize split 393 | /// strings at the cutting edge between two input files. 394 | pub counter_offset: ByteCounter, 395 | /// Every thread gets a constant encoding to search for. 396 | /// 397 | pub encoding: &'static Encoding, 398 | 399 | /// Minimum required string length in Bytes for a finding to be printed. 400 | pub chars_min_nb: u8, 401 | 402 | /// When true imposes an addition condition for findings: 403 | /// Advises the filter to only accept multi-characters in a finding with 404 | /// the same leading byte. This does not affect 1-byte ASCII characters. 405 | pub require_same_unicode_block: bool, 406 | 407 | /// A filter, defining additional criteria for a finding to be printed. 408 | pub filter: Utf8Filter, 409 | 410 | /// Maximum length of output-lines in UTF-8 characters. Findings that do not 411 | /// fit, will be wrapped to two or more lines. The label `+` indicates that 412 | /// this line is the continuation of the previous line. 
413 | pub output_line_char_nb_max: usize, 414 | 415 | /// The `encoding_rs` decoder has no direct support for ASCII. As a 416 | /// workaround, we simulate the missing ASCII-decoder with the 417 | /// `x-user-defined`-decoder and a special filter. With this flag is set, we 418 | /// indicate this case. It is later used to print out the label `ascii` 419 | /// instead of `x-user-defined`. 420 | pub print_encoding_as_ascii: bool, 421 | } 422 | 423 | /// A collection to bundle all `Mission`-objects. 424 | #[derive(Debug)] 425 | pub struct Missions { 426 | /// Vector of `Mission`s. 427 | pub v: Vec, 428 | } 429 | 430 | /// Access `Mission` without `.v`. 431 | impl Deref for Missions { 432 | type Target = Vec; 433 | 434 | fn deref(&self) -> &Self::Target { 435 | &self.v 436 | } 437 | } 438 | 439 | /// Parses a filter expression from some hexadecimal string or 440 | /// number string to an integer value. 441 | /// 442 | /// `$s` is `Option` to be parsed. 443 | /// `$x_from_str_radix` is either `u128::from_str_radix` or u64::from_str_radix`. 444 | /// `$x_from_str` is e.g. `u8::from_str` or usize::from_str`, ... 445 | /// 446 | /// The marco returns a filter integer value in `Option` and 447 | /// returns early when parsing is not successful. 448 | #[macro_export] 449 | macro_rules! parse_integer { 450 | ($s:expr, $x_from_str_radix:expr, $x_from_str:expr) => {{ 451 | match $s { 452 | Some(s) if s.is_empty() => None, 453 | Some(s) if s.trim().len() >= 2 && s.trim()[..2] == *"0x" => Some( 454 | $x_from_str_radix(&s.trim()[2..], 16) 455 | .with_context(|| format!("failed to parse hexadecimal number: `{}`", s))?, 456 | ), 457 | Some(s) => Some( 458 | $x_from_str(s.trim()).with_context(|| format!("failed to parse number: {}", s))?, 459 | ), 460 | None => None, 461 | } 462 | }}; 463 | } 464 | 465 | /// Parses a filter expression from some hexadecimal string or from some 466 | /// filter-alias-name in `$list` to a filter-integer value. 467 | /// 468 | /// `$s` is `Option` to be parsed. 469 | /// `$list` is either `ASCII_FILTER_ALIASSE` or `UNICODE_BLOCK_FILTER_ALIASSE`. 470 | /// `$x_from_str_radix` is either `u128::from_str_radix` or u64::from_str_radix`. 471 | /// 472 | /// The marco returns a filter integer value in `Option` and 473 | /// returns early when parsing is not successful. 474 | #[macro_export] 475 | macro_rules! parse_filter_parameter { 476 | ($s:expr, $x_from_str_radix:expr, $list:expr) => {{ 477 | match $s { 478 | Some(s) if s.trim().len() >= 2 && s.trim()[..2] == *"0x" => Some( 479 | $x_from_str_radix(&s.trim()[2..], 16) 480 | .with_context(|| format!("failed to parse hexadecimal number: `{}`", s))?, 481 | ), 482 | Some(s) if s.is_empty() => None, 483 | Some(s) => { 484 | let s = s.trim(); 485 | let mut oubf = None; 486 | for (ubf_name, ubf, _) in $list.iter() { 487 | if s.len() <= ubf_name.len() && *s.as_bytes() == ubf_name[..s.len()] { 488 | oubf = Some(*ubf); 489 | break; 490 | }; 491 | } 492 | if oubf.is_some() { 493 | oubf 494 | } else { 495 | return Err(anyhow!( 496 | "filter name `{}` is not valid, try `--list-encodings`", 497 | s 498 | )); 499 | } 500 | } 501 | None => None, 502 | } 503 | }}; 504 | } 505 | 506 | impl Missions { 507 | /// As `Mission` does not have its own constructor, the `Missions` 508 | /// constructor creates all `Mission`-objects in one row and stores them in 509 | /// some vector `Missions::v`. We guarantee that at least one (default) 510 | /// `Mission`-object will be created. 
The initialisation data coming from 511 | /// `options::ARGS` is completed with default values, then parsed and syntax 512 | /// checked before creating a `Mission`-object. 513 | 514 | pub fn new( 515 | flag_counter_offset: Option<&String>, 516 | flag_encoding: &[String], 517 | flag_chars_min_nb: Option<&String>, 518 | flag_same_unicode_block: bool, 519 | flag_ascii_filter: Option<&String>, 520 | flag_unicode_block_filter: Option<&String>, 521 | flag_grep_char: Option<&String>, 522 | flag_output_line_len: Option<&String>, 523 | ) -> Result { 524 | let flag_counter_offset = parse_integer!( 525 | flag_counter_offset, 526 | ByteCounter::from_str_radix, 527 | ByteCounter::from_str 528 | ); 529 | 530 | let flag_chars_min_nb = parse_integer!(flag_chars_min_nb, u8::from_str_radix, u8::from_str); 531 | 532 | // Parse from `Option` to `Option` 533 | let flag_ascii_filter = parse_filter_parameter!( 534 | flag_ascii_filter, 535 | u128::from_str_radix, 536 | ASCII_FILTER_ALIASSE 537 | ); 538 | 539 | // Parse from `Option` to `Option` 540 | let flag_unicode_block_filter = parse_filter_parameter!( 541 | flag_unicode_block_filter, 542 | u64::from_str_radix, 543 | UNICODE_BLOCK_FILTER_ALIASSE 544 | ); 545 | 546 | let flag_grep_char = parse_integer!(flag_grep_char, u8::from_str_radix, u8::from_str); 547 | if let Some(m) = flag_grep_char { 548 | if m > 127 { 549 | return Err(anyhow!( 550 | "you can only `--grep-char` for ASCII codes < 128, \ 551 | you tried: `{}`.", 552 | m 553 | )); 554 | } 555 | } 556 | 557 | let flag_output_line_len = 558 | parse_integer!(flag_output_line_len, usize::from_str_radix, usize::from_str); 559 | if let Some(m) = flag_output_line_len { 560 | if m < OUTPUT_LINE_CHAR_NB_MIN { 561 | return Err(anyhow!( 562 | "minimum for `--output-line-len` is `{}`, \ 563 | you tried: `{}`.", 564 | OUTPUT_LINE_CHAR_NB_MIN, 565 | m 566 | )); 567 | } 568 | } 569 | 570 | let mut v = Vec::new(); 571 | let encoding_default: &[String; 1] = &[ENCODING_DEFAULT.to_string()]; 572 | 573 | let enc_iter = if flag_encoding.is_empty() { 574 | encoding_default.iter() 575 | } else { 576 | flag_encoding.iter() 577 | }; 578 | 579 | for (mission_id, enc_opt) in enc_iter.enumerate() { 580 | let (enc_name, chars_min_nb, filter_af, filter_ubf, filter_grep_char) = 581 | Self::parse_enc_opt(enc_opt)?; 582 | 583 | // DEFINE DEFAULTS 584 | 585 | let mut enc_name = match enc_name { 586 | Some(s) => s, 587 | None => ENCODING_DEFAULT, 588 | }; 589 | 590 | let counter_offset = match flag_counter_offset { 591 | Some(n) => n, 592 | None => COUNTER_OFFSET_DEFAULT, 593 | }; 594 | 595 | // If `char_min_nb` is not defined in `enc_opt` 596 | // use the command-line option. 597 | let chars_min_nb = match chars_min_nb { 598 | Some(n) => n, 599 | None => match flag_chars_min_nb { 600 | Some(n) => n, 601 | None => CHARS_MIN_DEFAULT, 602 | }, 603 | }; 604 | 605 | let require_same_unicode_block = flag_same_unicode_block; 606 | 607 | let output_line_char_nb_max = match flag_output_line_len { 608 | Some(n) => n, 609 | None => OUTPUT_LINE_CHAR_NB_MAX_DEFAULT, 610 | }; 611 | 612 | if output_line_char_nb_max < OUTPUT_LINE_CHAR_NB_MIN { 613 | return Err(anyhow!( 614 | "Scanner {}: \ 615 | minimum for `--output-line-len` is `{}`, \ 616 | you tried: `{}`.", 617 | char::from((mission_id + 97) as u8), 618 | OUTPUT_LINE_CHAR_NB_MIN, 619 | output_line_char_nb_max, 620 | )); 621 | } 622 | 623 | // "ascii" encoding is missing in "encoding.rs". 
We emulate it with 624 | // "x-user-defined" and the `UTF8_FILTER_ASCII_MODE_DEFAULT`-filter, 625 | // if not otherwise specified. 626 | 627 | let filter_af = filter_af.unwrap_or_else(|| { 628 | flag_ascii_filter.unwrap_or(if enc_name == ASCII_ENC_LABEL { 629 | UTF8_FILTER_ASCII_MODE_DEFAULT.af 630 | } else { 631 | UTF8_FILTER_NON_ASCII_MODE_DEFAULT.af 632 | }) 633 | }); 634 | 635 | let filter_ubf = filter_ubf.unwrap_or_else(|| { 636 | flag_unicode_block_filter.unwrap_or(if enc_name == ASCII_ENC_LABEL { 637 | UTF8_FILTER_ASCII_MODE_DEFAULT.ubf 638 | } else { 639 | UTF8_FILTER_NON_ASCII_MODE_DEFAULT.ubf 640 | }) 641 | }); 642 | 643 | let filter_grep_char = match filter_grep_char { 644 | Some(f) => Some(f), 645 | None => match flag_grep_char { 646 | Some(f) => Some(f), 647 | None => { 648 | if enc_name == ASCII_ENC_LABEL { 649 | UTF8_FILTER_ASCII_MODE_DEFAULT.grep_char 650 | } else { 651 | UTF8_FILTER_NON_ASCII_MODE_DEFAULT.grep_char 652 | } 653 | } 654 | }, 655 | }; 656 | 657 | if let Some(m) = filter_grep_char { 658 | if m > 127 { 659 | return Err(anyhow!( 660 | "Scanner {}: \ 661 | you can only grep for ASCII codes < 128, \ 662 | you tried: `{}`.", 663 | char::from((mission_id + 97) as u8), 664 | m 665 | )); 666 | } 667 | } 668 | 669 | let filter = Utf8Filter { 670 | af: filter_af, 671 | ubf: filter_ubf, 672 | grep_char: filter_grep_char, 673 | }; 674 | 675 | let mut print_encoding_as_ascii = false; 676 | if enc_name == ASCII_ENC_LABEL { 677 | print_encoding_as_ascii = true; 678 | enc_name = "x-user-defined" 679 | }; 680 | 681 | let encoding = &Encoding::for_label((enc_name).as_bytes()).with_context(|| { 682 | format!( 683 | "Scanner {}: \ 684 | invalid input encoding name `{}`, try flag `--list-encodings`.", 685 | char::from((mission_id + 97) as u8), 686 | enc_name 687 | ) 688 | })?; 689 | 690 | v.push(Mission { 691 | counter_offset, 692 | encoding, 693 | chars_min_nb, 694 | require_same_unicode_block, 695 | filter, 696 | output_line_char_nb_max, 697 | mission_id: mission_id as u8, 698 | print_encoding_as_ascii, 699 | }); 700 | } 701 | 702 | Ok(Missions { v }) 703 | } 704 | 705 | /// Return the number of `Mission`s stored. 706 | #[allow(dead_code)] 707 | pub fn len(&self) -> usize { 708 | self.v.len() 709 | } 710 | 711 | /// Helper function to parse `enc_opt`. 
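/// An `enc_opt` is a comma-separated list of up to five fields: encoding
/// name, minimum character number, ASCII filter, Unicode-block filter and
/// grep char. Empty fields yield `None`; the caller `Missions::new()` then
/// falls back to the corresponding command-line flag or built-in default.
/// For example (the same input is exercised in the unit tests below):
///
/// ```text
/// parse_enc_opt("utf-8,10,0x89AB,0xCDEF,0x2f")
/// // -> Ok((Some("utf-8"), Some(10), Some(0x89AB), Some(0xCDEF), Some(0x2f)))
/// ```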
712 | #[inline] 713 | fn parse_enc_opt( 714 | enc_opt: &str, 715 | ) -> Result< 716 | ( 717 | Option<&str>, 718 | Option, 719 | Option, 720 | Option, 721 | Option, 722 | ), 723 | anyhow::Error, 724 | > { 725 | // Parse ',' separated strings 726 | let mut i = enc_opt.split_terminator(','); 727 | 728 | let enc_name = match i.next() { 729 | Some("") => None, 730 | Some(s) => Some(s.trim()), 731 | None => None, 732 | }; 733 | 734 | let chars_min_nb = parse_integer!(i.next(), u8::from_str_radix, u8::from_str); 735 | 736 | let filter_af = 737 | parse_filter_parameter!(i.next(), u128::from_str_radix, ASCII_FILTER_ALIASSE); 738 | 739 | let filter_ubf = 740 | parse_filter_parameter!(i.next(), u64::from_str_radix, UNICODE_BLOCK_FILTER_ALIASSE); 741 | 742 | let grep_char = parse_integer!(i.next(), u8::from_str_radix, u8::from_str); 743 | 744 | if i.next().is_some() { 745 | return Err(anyhow!("Too many items in `{}`.", enc_opt)); 746 | } 747 | 748 | Ok((enc_name, chars_min_nb, filter_af, filter_ubf, grep_char)) 749 | } 750 | } 751 | 752 | #[cfg(test)] 753 | mod tests { 754 | use super::*; 755 | use crate::mission::Utf8Filter; 756 | 757 | #[test] 758 | fn test_pass_filter() { 759 | // We filter Latin 1 760 | let utf8f = Utf8Filter { 761 | af: AF_ALL, 762 | ubf: UBF_LATIN, 763 | grep_char: None, 764 | }; 765 | 766 | // Check lower bits 767 | assert!(utf8f.pass_af_filter("A".as_bytes()[0])); 768 | assert!(!utf8f.pass_ubf_filter("€".as_bytes()[0])); 769 | // Check upper bits 770 | // first byte of © in UTF-8 is 0xC2. 0xC2 & 0x80 = bit 0x42 771 | assert!(utf8f.pass_ubf_filter("©".as_bytes()[0])); 772 | // first byte of © in UTF-8 is 0xE2. 0xE2 & 0x80 = bit 0x62 773 | assert!(!utf8f.pass_ubf_filter("€".as_bytes()[0])); 774 | } 775 | 776 | #[test] 777 | fn test_enc_opt_parser() { 778 | assert_eq!( 779 | super::Missions::parse_enc_opt("ascii").unwrap(), 780 | (Some("ascii"), None, None, None, None) 781 | ); 782 | 783 | assert_eq!( 784 | super::Missions::parse_enc_opt("utf-8,10,0x89AB,0xCDEF,0x2f").unwrap(), 785 | ( 786 | Some("utf-8"), 787 | Some(10), 788 | Some(0x89AB), 789 | Some(0xCDEF), 790 | Some(0x2f) 791 | ) 792 | ); 793 | 794 | assert_eq!( 795 | super::Missions::parse_enc_opt("utf-8,10,0x89AB,0xCDEF,211").unwrap(), 796 | ( 797 | Some("utf-8"), 798 | Some(10), 799 | Some(0x89AB), 800 | Some(0xCDEF), 801 | Some(211) 802 | ) 803 | ); 804 | 805 | assert_eq!( 806 | super::Missions::parse_enc_opt(",,,,,").unwrap(), 807 | (None, None, None, None, None) 808 | ); 809 | 810 | assert_eq!( 811 | super::Missions::parse_enc_opt("ascii,10,0x89AB").unwrap(), 812 | (Some("ascii"), Some(10), Some(0x89AB), None, None) 813 | ); 814 | 815 | assert!(super::Missions::parse_enc_opt("ascii, 10n").is_err()); 816 | 817 | assert!(super::Missions::parse_enc_opt("ascii,10,0x89,0x?B").is_err()); 818 | 819 | assert!(super::Missions::parse_enc_opt("ascii,10,0x?9,0xAB").is_err()); 820 | 821 | assert!(super::Missions::parse_enc_opt("ascii,1000000000000000000000,0x1,0x2").is_err()); 822 | 823 | assert!(super::Missions::parse_enc_opt("ascii,10,0x1,0x2,0x3,0x4").is_err()); 824 | 825 | assert!(super::Missions::parse_enc_opt("ascii,10,123").is_err()); 826 | 827 | assert!(super::Missions::parse_enc_opt("ascii,10,,123").is_err()); 828 | 829 | assert_eq!( 830 | super::Missions::parse_enc_opt("ascii,10,Default").unwrap(), 831 | (Some("ascii"), Some(10), Some(AF_DEFAULT), None, None) 832 | ); 833 | 834 | assert_eq!( 835 | super::Missions::parse_enc_opt("ascii,10,,Latin").unwrap(), 836 | ( 837 | Some("ascii"), 838 | Some(10), 839 | None, 840 | 
Some(UBF_LATIN | UBF_ACCENTS), 841 | None 842 | ) 843 | ); 844 | 845 | assert!(super::Missions::parse_enc_opt("ascii,10,my-no-encoding").is_err()); 846 | 847 | assert!(super::Missions::parse_enc_opt("ascii,10,,my-no-encoding").is_err()); 848 | 849 | assert_eq!( 850 | super::Missions::parse_enc_opt("ascii,10,0x89AB").unwrap(), 851 | (Some("ascii"), Some(10), Some(0x89AB), None, None) 852 | ); 853 | } 854 | } 855 | --------------------------------------------------------------------------------
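A note on the bit arithmetic exercised by `test_pass_filter` above (an illustrative sketch only; the constants are the ones defined in `src/mission.rs`):

    // '©' (U+00A9) encodes as 0xC2 0xA9 in UTF-8. Its leading byte 0xC2 maps
    // to bit (0xC2 & 0x3f) = 2 of the Unicode-block-filter, and
    // UBF_LATIN = 0x1fc has bit 2 set, so '©' passes `pass_ubf_filter()`.
    assert!(1u64 << (0xc2 & 0x3f) & UBF_LATIN != 0);

    // '€' (U+20AC) encodes as 0xE2 0x82 0xAC. Its leading byte 0xE2 maps to
    // bit (0xE2 & 0x3f) = 0x22, which UBF_LATIN does not set, so '€' is
    // rejected.
    assert!(1u64 << (0xe2 & 0x3f) & UBF_LATIN == 0);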