├── .cargo └── audit.toml ├── .envrc ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md └── workflows │ ├── ci.yml │ └── release.yml ├── .gitignore ├── .typos.toml ├── .vscode └── launch.json ├── CHANGELOG.md ├── Cargo.lock ├── Cargo.toml ├── LICENSE.md ├── README.md ├── ci ├── macos-install-packages └── ubuntu-install-packages ├── doc ├── config.default.jsonc ├── demodir.png ├── notes.md ├── rga-fzf.gif └── update-readme.sh ├── exampledir ├── decompress │ ├── test.log │ ├── test.log.bz2 │ ├── test.log.gz │ ├── test.log.xz │ ├── test.log.zst │ └── testlogbutwithoutextension ├── demo │ ├── greeting.mkv │ ├── hello.odt │ ├── hello.sqlite3 │ └── somearchive.zip ├── droste.zip ├── encoding │ ├── utf16le.txt │ ├── utf8.txt │ └── zip.tar.gz ├── exif.png ├── formatting.epub ├── mail_nested.eml ├── mail_pdf_attach.eml ├── screenshot.png ├── short.pdf ├── sqlitedb ├── tar │ ├── exampledir.tar.gz │ ├── test.tar │ ├── test.tar.bz2 │ └── test.tar.zip ├── test.djvu ├── test.zip ├── test │ ├── github_email.eml │ ├── hello.gz │ ├── hello.sqlite3 │ ├── hello.tar │ ├── mail_with_attachment.mbox │ ├── only-seek-zip.zip │ ├── short.pdf │ ├── short.pdf.gz │ ├── test.mbx │ └── twoblankpages.pdf ├── wasteland.docx ├── wasteland.epub ├── wasteland.fb2 ├── wasteland.mkv ├── wasteland.mobi ├── wasteland.odt └── wasteland.pdf ├── flake.lock ├── flake.nix ├── rust-toolchain.toml ├── rustfmt.toml └── src ├── adapted_iter.rs ├── adapters.rs ├── adapters ├── custom.rs ├── decompress.rs ├── ffmpeg.rs ├── mbox.rs ├── postproc.rs ├── sqlite.rs ├── tar.rs ├── writing.rs └── zip.rs ├── bin ├── rga-fzf-open.rs ├── rga-fzf.rs ├── rga-preproc.rs └── rga.rs ├── caching_writer.rs ├── config.rs ├── expand.rs ├── lib.rs ├── matching.rs ├── preproc.rs ├── preproc_cache.rs ├── recurse.rs └── test_utils.rs /.cargo/audit.toml: -------------------------------------------------------------------------------- 1 | [yanked] 2 | enabled = false # doesn't work in Nix sandbox 3 | update_index = false # crates.io index managed by Nix 4 | -------------------------------------------------------------------------------- /.envrc: -------------------------------------------------------------------------------- 1 | use flake 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | 12 | 13 | **To Reproduce** 14 | 15 | Attach example file: 16 | 17 | Run command: 18 | 19 | **Output** 20 | 21 | **Screenshots** 22 | If applicable, add screenshots to help explain your problem. 23 | 24 | **Operating System and Version** 25 | 26 | 27 | **Output of `rga --version`** 28 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 
15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | # Based on https://github.com/actions-rs/meta/blob/master/recipes/quickstart.md 2 | # 3 | # While our "example" application has platform-specific code, 4 | # for simplicity we are compiling and testing everything in a nix-on-Linux environment only. 5 | 6 | on: [push, pull_request] 7 | 8 | name: ci 9 | 10 | jobs: 11 | nix-flake-check: 12 | name: nix flake check 13 | runs-on: ubuntu-latest 14 | steps: 15 | - name: Checkout sources 16 | uses: actions/checkout@v4 17 | 18 | - name: Install nix 19 | uses: cachix/install-nix-action@v21 20 | 21 | - name: Ensure the build succeeds 22 | run: nix build 23 | 24 | - name: Run `nix flake check` to run formatters, linters, and tests 25 | run: nix flake check --print-build-logs 26 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | # adapted from https://github.com/BurntSushi/ripgrep/blob/master/.github/workflows/release.yml 2 | # The way this works is a little weird. But basically, the create-release job 3 | # runs purely to initialize the GitHub release itself. Once done, the upload 4 | # URL of the release is saved as an artifact. 5 | # 6 | # The build-release job runs only once create-release is finished. It gets 7 | # the release upload URL by downloading the corresponding artifact (which was 8 | # uploaded by create-release). It then builds the release executables for each 9 | # supported platform and attaches them as release assets to the previously 10 | # created release. 11 | # 12 | # The key here is that we create the release only once. 13 | 14 | name: release 15 | on: 16 | push: 17 | # Enable when testing release infrastructure on a branch. 18 | # branches: 19 | # - ag/release 20 | tags: 21 | - "v[0-9]+.[0-9]+.[0-9]+*" 22 | jobs: 23 | create-release: 24 | permissions: write-all 25 | name: create-release 26 | runs-on: ubuntu-latest 27 | # env: 28 | # Set to force version number, e.g., when no tag exists. 29 | # RG_VERSION: TEST-0.0.0 30 | steps: 31 | - name: Create artifacts directory 32 | run: mkdir artifacts 33 | 34 | - name: Get the release version from the tag 35 | if: env.RG_VERSION == '' 36 | run: | 37 | # Apparently, this is the right way to get a tag name. Really? 
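# (Illustrative: pushing a tag like v0.10.9 sets GITHUB_REF=refs/tags/v0.10.9;
# the ${GITHUB_REF#refs/tags/} parameter expansion below strips the
# refs/tags/ prefix, leaving RG_VERSION=v0.10.9.)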
38 | # 39 | # See: https://github.community/t5/GitHub-Actions/How-to-get-just-the-tag-name/m-p/32167/highlight/true#M1027 40 | echo "RG_VERSION=${GITHUB_REF#refs/tags/}" >> $GITHUB_ENV 41 | echo "version is: ${{ env.RG_VERSION }}" 42 | 43 | - name: Create GitHub release 44 | id: release 45 | uses: actions/create-release@v1 46 | env: 47 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 48 | with: 49 | tag_name: ${{ env.RG_VERSION }} 50 | release_name: ${{ env.RG_VERSION }} 51 | 52 | - name: Save release upload URL to artifact 53 | run: echo "${{ steps.release.outputs.upload_url }}" > artifacts/release-upload-url 54 | 55 | - name: Save version number to artifact 56 | run: echo "${{ env.RG_VERSION }}" > artifacts/release-version 57 | 58 | - name: Upload artifacts 59 | uses: actions/upload-artifact@v4 60 | with: 61 | name: artifacts 62 | path: artifacts 63 | 64 | build-release: 65 | name: build-release 66 | needs: ["create-release"] 67 | runs-on: ${{ matrix.os }} 68 | env: 69 | # For some builds, we use cross to test on 32-bit and big-endian 70 | # systems. 71 | CARGO: cargo 72 | # When CARGO is set to CROSS, this is set to `--target matrix.target`. 73 | TARGET_FLAGS: 74 | # When CARGO is set to CROSS, TARGET_DIR includes matrix.target. 75 | TARGET_DIR: ./target 76 | # Emit backtraces on panics. 77 | RUST_BACKTRACE: 1 78 | strategy: 79 | matrix: 80 | build: [linux, linux-arm, macos, win-msvc] 81 | include: 82 | - build: linux 83 | os: ubuntu-22.04 84 | rust: nightly 85 | target: x86_64-unknown-linux-musl 86 | - build: linux-arm 87 | os: ubuntu-22.04 88 | rust: nightly 89 | target: arm-unknown-linux-gnueabihf 90 | - build: macos 91 | os: macos-latest 92 | rust: nightly 93 | target: x86_64-apple-darwin 94 | - build: win-msvc 95 | os: windows-2019 96 | rust: nightly 97 | target: x86_64-pc-windows-msvc 98 | #- build: win-gnu 99 | # os: windows-2019 100 | # rust: nightly-x86_64-gnu 101 | # target: x86_64-pc-windows-gnu 102 | 103 | steps: 104 | - name: Checkout repository 105 | uses: actions/checkout@v4 106 | with: 107 | fetch-depth: 1 108 | 109 | - name: Install packages (Ubuntu) 110 | if: matrix.os == 'ubuntu-22.04' 111 | run: | 112 | ci/ubuntu-install-packages 113 | 114 | - name: Install packages (macOS) 115 | if: matrix.os == 'macos-latest' 116 | run: | 117 | ci/macos-install-packages 118 | 119 | - name: Install Rust 120 | uses: actions-rs/toolchain@v1 121 | with: 122 | toolchain: ${{ matrix.rust }} 123 | profile: minimal 124 | override: true 125 | target: ${{ matrix.target }} 126 | 127 | - name: Use Cross 128 | shell: bash 129 | run: | 130 | cargo install cross 131 | echo "CARGO=cross" >> $GITHUB_ENV 132 | echo "TARGET_FLAGS=--target ${{ matrix.target }}" >> $GITHUB_ENV 133 | echo "TARGET_DIR=./target/${{ matrix.target }}" >> $GITHUB_ENV 134 | 135 | - name: Show command used for Cargo 136 | run: | 137 | echo "cargo command is: ${{ env.CARGO }}" 138 | echo "target flag is: ${{ env.TARGET_FLAGS }}" 139 | echo "target dir is: ${{ env.TARGET_DIR }}" 140 | 141 | - name: Get release download URL 142 | uses: actions/download-artifact@v4 143 | with: 144 | name: artifacts 145 | path: artifacts 146 | 147 | - name: Set release upload URL and release version 148 | shell: bash 149 | run: | 150 | echo "RELEASE_UPLOAD_URL=$(cat artifacts/release-upload-url)" >> $GITHUB_ENV 151 | echo "release upload url: $RELEASE_UPLOAD_URL" 152 | echo "RELEASE_VERSION=$(cat artifacts/release-version)" >> $GITHUB_ENV 153 | echo "release version: $RELEASE_VERSION" 154 | 155 | - name: Build release binary 156 | run: ${{ env.CARGO }} 
build --verbose --release ${{ env.TARGET_FLAGS }} 157 | 158 | - name: Strip release binary (linux and macos) 159 | if: matrix.build == 'linux' || matrix.build == 'macos' 160 | run: | 161 | strip "target/${{ matrix.target }}/release/rga" \ 162 | "target/${{ matrix.target }}/release/rga-preproc" \ 163 | "target/${{ matrix.target }}/release/rga-fzf" \ 164 | "target/${{ matrix.target }}/release/rga-fzf-open" 165 | 166 | - name: Strip release binary (arm) 167 | if: matrix.build == 'linux-arm' 168 | run: | 169 | docker run --rm -v \ 170 | "$PWD/target:/target:Z" \ 171 | rustembedded/cross:arm-unknown-linux-gnueabihf \ 172 | arm-linux-gnueabihf-strip \ 173 | /target/arm-unknown-linux-gnueabihf/release/rga \ 174 | /target/arm-unknown-linux-gnueabihf/release/rga-preproc \ 175 | /target/arm-unknown-linux-gnueabihf/release/rga-fzf \ 176 | /target/arm-unknown-linux-gnueabihf/release/rga-fzf-open 177 | 178 | - name: Build archive 179 | shell: bash 180 | run: | 181 | staging="ripgrep_all-${{ env.RELEASE_VERSION }}-${{ matrix.target }}" 182 | mkdir -p "$staging"/doc 183 | 184 | cp {README.md,LICENSE.md} "$staging/" 185 | cp CHANGELOG.md "$staging/doc/" 186 | 187 | if [ "${{ matrix.os }}" = "windows-2019" ]; then 188 | cp "target/${{ matrix.target }}/release/rga.exe" "$staging/" 189 | cp "target/${{ matrix.target }}/release/rga-preproc.exe" "$staging/" 190 | cp "target/${{ matrix.target }}/release/rga-fzf.exe" "$staging/" 191 | cp "target/${{ matrix.target }}/release/rga-fzf-open.exe" "$staging/" 192 | 7z a "$staging.zip" "$staging" 193 | echo "ASSET=$staging.zip" >> $GITHUB_ENV 194 | else 195 | cp "target/${{ matrix.target }}/release/rga" "$staging/" 196 | cp "target/${{ matrix.target }}/release/rga-preproc" "$staging/" 197 | cp "target/${{ matrix.target }}/release/rga-fzf" "$staging/" 198 | cp "target/${{ matrix.target }}/release/rga-fzf-open" "$staging/" 199 | tar czf "$staging.tar.gz" "$staging" 200 | echo "ASSET=$staging.tar.gz" >> $GITHUB_ENV 201 | fi 202 | 203 | - name: Upload release archive 204 | uses: actions/upload-release-asset@v1.0.1 205 | env: 206 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 207 | with: 208 | upload_url: ${{ env.RELEASE_UPLOAD_URL }} 209 | asset_path: ${{ env.ASSET }} 210 | asset_name: ${{ env.ASSET }} 211 | asset_content_type: application/octet-stream 212 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /result 2 | /target 3 | /exampledir.2 4 | /.idea 5 | /.pre-commit-config.yaml 6 | /.vscode/settings.json 7 | **/*.rs.bk 8 | -------------------------------------------------------------------------------- /.typos.toml: -------------------------------------------------------------------------------- 1 | [default.extend-words] 2 | als = "als" 3 | 4 | [files] 5 | extend-exclude = ["exampledir/*"] 6 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 
4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "type": "lldb", 9 | "request": "attach", 10 | "name": "Attach", 11 | "program": "${workspaceFolder}/target/release/rga-preproc" 12 | }, 13 | { 14 | "type": "lldb", 15 | "request": "launch", 16 | "name": "Debug unit tests in library 'rga'", 17 | "cargo": { 18 | "args": ["test", "--no-run", "--lib", "--package=rga"], 19 | "filter": { 20 | "name": "rga", 21 | "kind": "lib" 22 | } 23 | }, 24 | "args": [], 25 | "cwd": "${workspaceFolder}" 26 | }, 27 | { 28 | "type": "lldb", 29 | "request": "launch", 30 | "name": "Debug executable 'rga'", 31 | "cargo": { 32 | "args": ["build", "--bin=rga"], 33 | "filter": { 34 | "name": "rga", 35 | "kind": "bin" 36 | } 37 | }, 38 | "args": [], 39 | "cwd": "${workspaceFolder}" 40 | }, 41 | { 42 | "type": "lldb", 43 | "request": "launch", 44 | "name": "Debug unit tests in executable 'rga'", 45 | "cargo": { 46 | "args": ["test", "--no-run", "--bin=rga", "--package=ripgrep-all"], 47 | "filter": { 48 | "name": "rga", 49 | "kind": "bin" 50 | } 51 | }, 52 | "args": [], 53 | "cwd": "${workspaceFolder}" 54 | }, 55 | { 56 | "type": "lldb", 57 | "request": "launch", 58 | "name": "Debug executable 'rga-preproc'", 59 | "cargo": { 60 | "args": ["build", "--bin=rga-preproc"], 61 | "filter": { 62 | "name": "rga-preproc", 63 | "kind": "bin" 64 | } 65 | }, 66 | "args": ["exampledir/tar/test.tar.bz2"], 67 | "cwd": "${workspaceFolder}" 68 | }, 69 | { 70 | "type": "lldb", 71 | "request": "launch", 72 | "name": "Debug unit tests in executable 'rga-preproc'", 73 | "cargo": { 74 | "args": ["test", "--no-run", "--bin=rga-preproc", "--package=rga"], 75 | "filter": { 76 | "name": "rga-preproc", 77 | "kind": "bin" 78 | } 79 | }, 80 | "args": [], 81 | "cwd": "${workspaceFolder}" 82 | } 83 | ] 84 | } 85 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # 0.10.5 (2024-01-16) 2 | 3 | - return the same exit status as rg 4 | 5 | # 0.10.4 (2024-01-16) 6 | 7 | - add `--rga-no-prefix-filenames` flag (https://github.com/phiresky/ripgrep-all/issues/154) 8 | 9 | # 0.10.3 (2024-01-15) 10 | 11 | This was originally supposed to be version 1.0.0, but I don't feel confident enough in the stability to call it that. 12 | 13 | Highlights: 14 | 15 | - rga is now configurable via a config file (~/.config/ripgrep-all/config.jsonc) that is generated on first use, including schema. 16 | - Custom subprocess-spawning adapters can be defined via config file. 
See https://github.com/phiresky/ripgrep-all/wiki
17 | - External adapters can be shared with the community at https://github.com/phiresky/ripgrep-all/discussions
18 |
19 | Others:
20 |
21 | - mbox adapter (@FliegendeWurst https://github.com/phiresky/ripgrep-all/pull/104)
22 | - auto-generate parts of the readme
23 | - add loads of debug logs and performance timings when `--debug` is used
24 | - better error messages via `anyhow`
25 | - add cross-platform rga-fzf binary
26 | - change whole code base to be async
27 | - change adapter interface from `(&Read, &Write) -> ()` to `AsyncRead -> AsyncRead` to allow chaining of adapters
28 |
29 | # 0.9.6 (2020-05-19)
30 |
31 | - Fix windows builds
32 | - Case-insensitive file extension matching
33 | - Move to GitHub Actions instead of Travis
34 | - Fix searching for words that are hyphenated in PDFs (#44)
35 | - Always load the rga-preproc binary from the location where rga is
36 |
37 | # 0.9.5 (2020-04-08)
38 |
39 | - Allow search in pdf files without extension (https://github.com/phiresky/ripgrep-all/issues/39)
40 | - Prefer shipped binaries to system-installed ones (https://github.com/phiresky/ripgrep-all/issues/32)
41 | - Upgrade dependencies
42 |
43 | # 0.9.3 (2019-09-19)
44 |
45 | - Fix compilation on new Rust by updating rusqlite ([#25](https://github.com/phiresky/ripgrep-all/pull/25))
46 |
47 | # 0.9.2 (2019-06-17)
48 |
49 | - Fix file ending regex ([#13](https://github.com/phiresky/ripgrep-all/issues/13))
50 | - Fix decoding of UTF16 with BOM ([#5](https://github.com/phiresky/ripgrep-all/issues/5))
51 | - Shorten the output on failure to two lines (https://github.com/phiresky/ripgrep-all/issues/7); you can use `--no-messages` to completely suppress errors.
52 | - Better installation instructions in readme for each OS
53 | - Add windows binaries! Including all dependencies!
54 |
55 | # 0.9.1 (2019-06-16)
56 |
57 | - Add enabled adapters to cache key if caching for archive
58 | - Prevent empty trailing page output in pdf reader
59 |
60 | # 0.9.0 (2019-06-16)
61 |
62 | - Split the decompress and tar adapters so we can also read pure .bz2 files etc.
63 | - Add mime type detection to decompress so we can read e.g. /boot/initramfs.img, which is a bz2 file without a file extension
64 |
65 | # 0.8.9 (2019-06-15)
66 |
67 | - Finally fix linux binary package
68 | - add readme to crates.io
69 |
70 | # 0.8.7 (2019-06-15)
71 |
72 | Minor fixes
73 |
74 | - Correctly wrap help text
75 | - Show own help when no arguments given
76 | - Hopefully package the rga binary correctly
77 |
78 | # 0.8.5
79 |
80 | previous changes not documented
81 |
-------------------------------------------------------------------------------- /Cargo.toml: --------------------------------------------------------------------------------
1 | [package]
2 | authors = ["phiresky "]
3 | description = "rga: ripgrep, but also search in PDFs, E-Books, Office documents, zip, tar.gz, etc."
4 | edition = "2024" 5 | exclude = [ 6 | "exampledir/*", 7 | ] 8 | homepage = "https://github.com/phiresky/ripgrep-all" 9 | license = "AGPL-3.0-or-later" 10 | name = "ripgrep_all" 11 | readme = "README.md" 12 | repository = "https://github.com/phiresky/ripgrep-all" 13 | version = "0.10.9" 14 | 15 | [features] 16 | default = ["perf-literal"] 17 | perf-literal = ["regex/perf-literal"] 18 | 19 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 20 | 21 | [dependencies] 22 | anyhow = {version = "1.0.71", features = ["backtrace"]} 23 | async-compression = { version = "0.4.0", features = ["all", "all-algorithms", "tokio"] } 24 | async-stream = "0.3.5" 25 | async-trait = "0.1.68" 26 | async_zip = {version = "0.0.12", features = ["full"]} 27 | bincode = "1.3.3" 28 | bytes = "1.4.0" 29 | clap = {version = "4.3.0", features = ["wrap_help"]} 30 | crossbeam = "0.8.2" 31 | crossbeam-channel = "0.5.8" 32 | derive_more = "0.99.17" 33 | directories-next = "2.0.0" 34 | dyn-clonable = "0.9.0" 35 | dyn-clone = "1.0.11" 36 | encoding_rs = "0.8.32" 37 | encoding_rs_io = "0.1.7" 38 | env_logger = "0.10.0" 39 | glob = "0.3.1" 40 | json_comments = "0.2.1" 41 | lazy_static = "1.4.0" 42 | log = "0.4.17" 43 | mailparse = "0.14.0" 44 | memchr = "2.5.0" 45 | mime2ext = "0.1.52" 46 | open = "5" 47 | paste = "1.0.12" 48 | path-clean = "1.0.1" 49 | pretty-bytes = "0.2.2" 50 | regex = "1.8.2" 51 | rusqlite = {version = "0.30.0", features = ["vtab", "bundled"]} 52 | schemars = {version = "0.8.12", features = ["preserve_order"]} 53 | serde = {version = "1.0.163", features = ["derive"]} 54 | serde_json = "1.0.96" 55 | size_format = "1.0.2" 56 | structopt = "0.3.26" 57 | tempfile = "3.5.0" 58 | tokio = {version = "1.28.1", features = ["full"]} 59 | tokio-rusqlite = "0.5.0" 60 | tokio-stream = {version = "0.1.14", features = ["io-util", "tokio-util"]} 61 | astral-tokio-tar = "0.5.1" 62 | tokio-util = {version = "0.7.8", features = ["io", "full"]} 63 | tree_magic = {package = "tree_magic_mini", version = "3.0.3"} 64 | 65 | [dev-dependencies] 66 | async-recursion = "1.0.4" 67 | ctor = "0.2.0" 68 | pretty_assertions = "1.3.0" 69 | tempfile = "3.5.0" 70 | tokio-test = "0.4.2" 71 | 72 | [profile.release] 73 | debug = true 74 | lto = "thin" 75 | split-debuginfo = "packed" 76 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # rga: ripgrep, but also search in PDFs, E-Books, Office documents, zip, tar.gz, etc. 2 | 3 | rga is a line-oriented search tool that allows you to look for a regex in a multitude of file types. rga wraps the awesome [ripgrep] and enables it to search in pdf, docx, sqlite, jpg, movie subtitles (mkv, mp4), etc. 4 | 5 | [ripgrep]: https://github.com/BurntSushi/ripgrep 6 | 7 | [![github repo](https://img.shields.io/badge/repo-github.com%2Fphiresky%2Fripgrep--all-informational.svg)](https://github.com/phiresky/ripgrep-all) 8 | [![Crates.io](https://img.shields.io/crates/v/ripgrep-all.svg)](https://crates.io/crates/ripgrep-all) 9 | [![fearless concurrency](https://img.shields.io/badge/concurrency-fearless-success.svg)](https://www.reddit.com/r/rustjerk/top/?sort=top&t=all) 10 | 11 | For more detail, see this introductory blogpost: https://phiresky.github.io/blog/2019/rga--ripgrep-for-zip-targz-docx-odt-epub-jpg/ 12 | 13 | rga will recursively descend into archives and match text in every file type it knows. 
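rga accepts the same pattern and path arguments as plain ripgrep; everything it does not recognize is passed straight through to rg (see the USAGE section below). As an illustrative sketch (the pattern and path here are placeholders):

```
# searches recursively, including inside PDFs, archives, subtitles, ...
rga "fearless concurrency" my-documents/
```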
14 |
15 | Here is an [example directory](https://github.com/phiresky/ripgrep-all/tree/master/exampledir/demo) with different file types:
16 |
17 | ```
18 | demo/
19 | ├── greeting.mkv
20 | ├── hello.odt
21 | ├── hello.sqlite3
22 | └── somearchive.zip
23 |     ├── dir
24 |     │   ├── greeting.docx
25 |     │   └── inner.tar.gz
26 |     │       └── greeting.pdf
27 |     └── greeting.epub
28 | ```
29 |
30 | ![rga output](doc/demodir.png)
31 |
32 | ## Integration with fzf
33 |
34 | ![rga-fzf](doc/rga-fzf.gif)
35 |
36 | See [the wiki](https://github.com/phiresky/ripgrep-all/wiki/fzf-Integration) for instructions on integrating rga with fzf.
37 |
38 | ## INSTALLATION
39 |
40 | Linux x64, macOS and Windows binaries are available [in GitHub Releases][latestrelease].
41 |
42 | [latestrelease]: https://github.com/phiresky/ripgrep-all/releases/latest
43 |
44 | ### Linux
45 |
46 | #### Arch Linux
47 |
48 | `pacman -S ripgrep-all`
49 |
50 | #### Gentoo Linux
51 |
52 | `emerge sys-apps/ripgrep-all`
53 |
54 | #### Nix
55 |
56 | `nix-env -iA nixpkgs.ripgrep-all`
57 |
58 | #### Debian-based
59 |
60 | Download the [rga binary][latestrelease] and install the dependencies like this:
61 |
62 | `apt install ripgrep pandoc poppler-utils ffmpeg`
63 |
64 | If ripgrep is not included in your package sources, get it from [here](https://github.com/BurntSushi/ripgrep/releases).
65 |
66 | rga will search for the binaries it calls in \$PATH and in the directory rga itself is located in.
67 |
68 | ### Windows
69 |
70 | Note that installing via [chocolatey](https://chocolatey.org/packages/ripgrep-all) or [scoop](https://github.com/ScoopInstaller/Main/blob/master/bucket/rga.json) is the only supported download method. If you download the binary from releases manually, you will not get the dependencies (for example pdftotext from poppler).
71 |
72 | If you get an error like `VCRUNTIME140.DLL could not be found`, you need to install [vc_redist.x64.exe](https://support.microsoft.com/en-us/help/2977003/the-latest-supported-visual-c-downloads).
73 |
74 | #### Chocolatey
75 |
76 | ```
77 | choco install ripgrep-all
78 | ```
79 |
80 | #### Scoop
81 |
82 | ```
83 | scoop install rga
84 | ```
85 |
86 | ### Homebrew/Linuxbrew
87 |
88 | `rga` can be installed with [Homebrew](https://formulae.brew.sh/formula/ripgrep-all#default):
89 |
90 | `brew install rga`
91 |
92 | To install the optional dependencies, which are not strictly necessary but very useful:
93 |
94 | `brew install pandoc poppler ffmpeg`
95 |
96 | ### MacPorts
97 |
98 | `rga` can also be installed on macOS via [MacPorts](https://ports.macports.org/port/ripgrep-all/):
99 |
100 | `sudo port install ripgrep-all`
101 |
102 | ### Compile from source
103 |
104 | rga should compile with stable Rust (v1.75.0+, check with `rustc --version`). To build it, run the following (or the equivalent for your OS):
105 |
106 | ```
107 | ~$ apt install build-essential pandoc poppler-utils ffmpeg ripgrep cargo
108 | ~$ cargo install --locked ripgrep_all
109 | ~$ rga --version # this should work now
110 | ```
111 |
112 | ## Available Adapters
113 |
114 | rga works with _adapters_ that adapt various file formats. It comes with a few built-in adapters, which you can list with:
115 |
116 | ```
117 | rga --rga-list-adapters
118 | ```
119 |
120 | You can also add **custom adapters**. See [the wiki](https://github.com/phiresky/ripgrep-all/wiki) for more information.
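For illustration, a custom adapter entry in `config.jsonc` looks roughly like the sketch below. This is a sketch only: the exact field names are defined by the config schema (print it with `rga --rga-print-config-schema`), and the `djvutxt` invocation is a hypothetical example, not a tested adapter:

```jsonc
{
  "custom_adapters": [
    {
      // name used to refer to the adapter, e.g. in --rga-adapters
      "name": "djvu",
      // schema version of this adapter definition (assumed field)
      "version": 1,
      // file extensions this adapter should handle
      "extensions": ["djvu"],
      // binary to spawn; it should write plain text to stdout
      "binary": "djvutxt",
      "args": []
    }
  ]
}
```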
121 |
122 |
123 |
124 | Adapters:
125 |
126 | - **pandoc**
127 | Uses pandoc to convert binary/unreadable text documents to plain markdown-like text
128 | Runs: pandoc --from= --to=plain --wrap=none --markdown-headings=atx
129 | Extensions: .epub, .odt, .docx, .fb2, .ipynb, .html, .htm
130 |
131 | - **poppler**
132 | Uses pdftotext (from poppler-utils) to extract plain text from PDF files
133 | Runs: pdftotext - -
134 | Extensions: .pdf
135 | Mime Types: application/pdf
136 |
137 | - **postprocpagebreaks**
138 | Adds the page number to each line for an input file that specifies page breaks as the ASCII page break character.
139 | Mainly to be used internally by the poppler adapter.
140 | Extensions: .asciipagebreaks
141 |
142 | - **ffmpeg**
143 | Uses ffmpeg to extract video metadata/chapters, subtitles, lyrics, and other metadata
144 | Extensions: .mkv, .mp4, .avi, .mp3, .ogg, .flac, .webm
145 |
146 | - **zip**
147 | Reads a zip file as a stream and recurses down into its contents
148 | Extensions: .zip, .jar
149 | Mime Types: application/zip
150 |
151 | - **decompress**
152 | Reads a compressed file as a stream and runs a different extractor on the contents.
153 | Extensions: .als, .bz2, .gz, .tbz, .tbz2, .tgz, .xz, .zst
154 | Mime Types: application/gzip, application/x-bzip, application/x-xz, application/zstd
155 |
156 | - **tar**
157 | Reads a tar file as a stream and recurses down into its contents
158 | Extensions: .tar
159 |
160 | - **sqlite**
161 | Uses sqlite bindings to convert sqlite databases into a simple plain text format
162 | Extensions: .db, .db3, .sqlite, .sqlite3
163 | Mime Types: application/x-sqlite3
164 |
165 | The following adapters are disabled by default, and can be enabled using '--rga-adapters=+foo,bar':
166 |
167 | - **mail**
168 | Reads mailbox/mail files and runs extractors on the contents and attachments.
169 | Extensions: .mbox, .mbx, .eml
170 | Mime Types: application/mbox, message/rfc822
171 |
172 | ## USAGE:
173 |
174 | > rga \[RGA OPTIONS\] \[RG OPTIONS\] PATTERN \[PATH \...\]
175 |
176 |
177 | ## FLAGS:
178 |
179 | **\--rga-accurate**
180 |
181 | > Use more accurate but slower matching by mime type
182 |
183 | > By default, rga will match files using file extensions. Some programs,
184 | > such as sqlite3, don\'t care about the file extension, so users
185 | > sometimes use any or no extension at all. With this flag, rga will try
186 | > to detect the mime type of input files using the magic bytes (similar
187 | > to the \`file\` utility), and use that to choose the adapter.
188 | > Detection is only done on the first 8KiB of the file, since we can\'t
189 | > always seek on the input (in archives).
190 |
191 | **\--rga-no-cache**
192 |
193 | > Disable caching of results
194 |
195 | > By default, rga caches the extracted text, if it is small enough, to a
196 | > database in \${XDG_CACHE_DIR-\~/.cache}/ripgrep-all on Linux,
197 | > _\~/Library/Caches/ripgrep-all_ on macOS, or
198 | > C:\\Users\\username\\AppData\\Local\\ripgrep-all on Windows. This way,
199 | > repeated searches on the same set of files will be much faster. If you
200 | > pass this flag, all caching will be disabled.
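For example (an illustrative invocation; the pattern and path are placeholders):

```
rga --rga-no-cache "some pattern" ~/Documents
```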
201 |
202 | **-h**, **\--help**
203 |
204 | > Prints help information
205 |
206 | **\--rga-list-adapters**
207 |
208 | > List all known adapters
209 |
210 | **\--rga-print-config-schema**
211 |
212 | > Print the JSON Schema of the configuration file
213 |
214 | **\--rg-help**
215 |
216 | > Show help for ripgrep itself
217 |
218 | **\--rg-version**
219 |
220 | > Show version of ripgrep itself
221 |
222 | **-V**, **\--version**
223 |
224 | > Prints version information
225 |
226 | ## OPTIONS:
227 |
228 | **\--rga-adapters=**\<adapters\>\...
229 |
230 | > Change which adapters to use and in which priority order (descending)
231 |
232 | > \"foo,bar\" means use only adapters foo and bar. \"-bar,baz\" means
233 | > use all default adapters except for bar and baz. \"+bar,baz\" means
234 | > use all default adapters and also bar and baz.
235 |
236 | **\--rga-cache-compression-level=**\<level\>
237 |
238 | > ZSTD compression level to apply to adapter outputs before storing in
239 | > the cache db
240 |
241 | > Ranges from 1 - 22 \[default: 12\]
242 |
243 | **\--rga-config-file=**\<path\>
244 |
245 | **\--rga-max-archive-recursion=**\<depth\>
246 |
247 | > Maximum nestedness of archives to recurse into \[default: 5\]
248 |
249 | **\--rga-cache-max-blob-len=**\<len\>
250 |
251 | > Max compressed size to cache
252 |
253 | > Longest byte length (after compression) to store in the cache. Longer
254 | > adapter outputs will not be cached but will be recomputed every time.
255 |
256 | > Allowed suffixes on command line: k M G \[default: 2000000\]
257 |
258 | **\--rga-cache-path=**\<path\>
259 |
260 | > Path to store the cache db \[default: /home/phire/.cache/ripgrep-all\]
261 |
262 | **-h** shows a concise overview; **\--help** shows more detail and
263 | advanced options.
264 |
265 | All other options not shown here are passed directly to rg, especially
266 | \[PATTERN\] and \[PATH \...\]
267 |
268 |
269 |
270 | ## Config
271 | The config file location leverages the mechanisms defined by
272 | - the [XDG base directory](https://standards.freedesktop.org/basedir-spec/basedir-spec-latest.html) and
273 | the [XDG user directory](https://www.freedesktop.org/wiki/Software/xdg-user-dirs/) specifications on Linux (ex: `~/.config/ripgrep-all/config.jsonc`)
274 | - the [Known Folder](https://msdn.microsoft.com/en-us/library/windows/desktop/dd378457.aspx) API on Windows (ex: `C:\Users\Alice\AppData\Roaming\ripgrep-all\config.jsonc`)
275 | - the [Standard Directories](https://developer.apple.com/library/content/documentation/FileManagement/Conceptual/FileSystemProgrammingGuide/FileSystemOverview/FileSystemOverview.html#//apple_ref/doc/uid/TP40010672-CH2-SW6)
276 | guidelines on macOS (ex: `~/Library/Application Support/ripgrep-all/config.jsonc`)
277 |
278 |
279 | ## Development
280 |
281 | To enable debug logging:
282 |
283 | ```bash
284 | export RUST_LOG=debug
285 | export RUST_BACKTRACE=1
286 | ```
287 |
288 | Also remember to disable caching with `--rga-no-cache` or clear the cache
289 | (`~/Library/Caches/rga` on macOS, `~/.cache/rga` on other Unixes,
290 | or `C:\Users\username\AppData\Local\rga` on Windows)
291 | when debugging the adapters.
292 |
293 | ### Nix and Direnv
294 |
295 | You can use the provided [`flake.nix`](./flake.nix) to set up all build- and
296 | run-time dependencies:
297 |
298 | 1. Enable [Flakes](https://wiki.nixos.org/wiki/Flakes) in your Nix configuration.
299 | 1. Add [`direnv`](https://direnv.net/) to your profile:
300 | `nix profile install nixpkgs#direnv`
301 | 1. `cd` into the directory where you have cloned this repository.
302 | 1. 
Allow use of [`.envrc`](./.envrc): `direnv allow`
303 | 1. After the dependencies have been installed, your shell will now have all of
304 | the necessary development dependencies.
305 |
-------------------------------------------------------------------------------- /ci/macos-install-packages: --------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | brew install poppler
4 |
-------------------------------------------------------------------------------- /ci/ubuntu-install-packages: --------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | sudo apt-get update
4 | sudo apt-get install -y --no-install-recommends \
5 | poppler-utils
6 |
-------------------------------------------------------------------------------- /doc/config.default.jsonc: --------------------------------------------------------------------------------
1 | {
2 | // This file follows the JSON schema defined below.
3 | // If you use an editor that supports JSON schema (e.g. VS Code),
4 | // you should be getting IntelliSense and validation.
5 | "$schema": "./config.v1.schema.json",
6 | // The default config and schema will be regenerated if they are missing
7 | // https://github.com/phiresky/ripgrep-all/blob/master/doc/config.default.jsonc
8 |
9 | // The config options are the same as the command line options,
10 | // but with the --rga- prefix removed and - and . replaced with _.
11 | // e.g. --rga-no-cache becomes `"no_cache": true`.
12 | // The only exception is the `custom_adapters` option, which can only be set in this file.
13 |
14 | "custom_adapters": [
15 | // See https://github.com/phiresky/ripgrep-all/wiki for more information.
16 | // To verify that your custom adapters are picked up correctly, run `rga --rga-list-adapters`.
17 | ]
18 | }
19 |
-------------------------------------------------------------------------------- /doc/demodir.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/doc/demodir.png
-------------------------------------------------------------------------------- /doc/notes.md: --------------------------------------------------------------------------------
1 | ## schema -> ui generation
2 |
3 | https://json-schema.org/implementations.html#web-ui-generation
4 |
5 | - https://github.com/guillotinaweb/ngx-schema-form
6 | - https://github.com/hamzahamidi/ajsf angular igh
7 | - https://github.com/dashjoin/json-schema-form
8 | - https://github.com/json-editor/json-editor
9 | - https://github.com/jsonform/jsonform
10 | - https://github.com/vazco/uniforms
11 |
12 | ## json schema is ridiculous
13 |
14 | "mimetypes": {
15 | "description": "if not null and --rga-accurate is enabled, mime type matching is used instead of file name matching",
16 | "type": [
17 | "array",
18 | "null"
19 | ],
20 | "items": {
21 | "type": "string"
22 | }
23 | },
24 |
25 | what the fuck????
this is the only thing required to see that json schema has horrible design
26 |
-------------------------------------------------------------------------------- /doc/rga-fzf.gif: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/doc/rga-fzf.gif
-------------------------------------------------------------------------------- /doc/update-readme.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | content=$(
4 | cat <<END
6 | $(cargo run --bin rga -- --rga-list-adapters)
7 |
8 | $(help2man -N "cargo run --bin rga --" | pandoc -f man -t markdown --markdown-headings=atx | rg --multiline "## USAGE:(.|\n)*")
9 |
10 | END
11 | )
12 |
13 | rg --passthrough --multiline '.*update-readme.sh(.|\n)*update-readme.sh.*' README.md --replace "$content" | sponge README.md
14 | prettier --write README.md
15 |
-------------------------------------------------------------------------------- /exampledir/decompress/test.log: --------------------------------------------------------------------------------
1 | hello world
2 | this is a test
3 |
-------------------------------------------------------------------------------- /exampledir/decompress/test.log.bz2: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/decompress/test.log.bz2
-------------------------------------------------------------------------------- /exampledir/decompress/test.log.gz: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/decompress/test.log.gz
-------------------------------------------------------------------------------- /exampledir/decompress/test.log.xz: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/decompress/test.log.xz
-------------------------------------------------------------------------------- /exampledir/decompress/test.log.zst: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/decompress/test.log.zst
-------------------------------------------------------------------------------- /exampledir/decompress/testlogbutwithoutextension: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/decompress/testlogbutwithoutextension
-------------------------------------------------------------------------------- /exampledir/demo/greeting.mkv: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/demo/greeting.mkv
-------------------------------------------------------------------------------- /exampledir/demo/hello.odt: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/demo/hello.odt
-------------------------------------------------------------------------------- /exampledir/demo/hello.sqlite3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/demo/hello.sqlite3 -------------------------------------------------------------------------------- /exampledir/demo/somearchive.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/demo/somearchive.zip -------------------------------------------------------------------------------- /exampledir/droste.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/droste.zip -------------------------------------------------------------------------------- /exampledir/encoding/utf16le.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/encoding/utf16le.txt -------------------------------------------------------------------------------- /exampledir/encoding/utf8.txt: -------------------------------------------------------------------------------- 1 | hello wörld! 2 | -------------------------------------------------------------------------------- /exampledir/encoding/zip.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/encoding/zip.tar.gz -------------------------------------------------------------------------------- /exampledir/exif.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/exif.png -------------------------------------------------------------------------------- /exampledir/formatting.epub: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/formatting.epub -------------------------------------------------------------------------------- /exampledir/mail_nested.eml: -------------------------------------------------------------------------------- 1 | To: submit.t4eseGWSvG1JST3r@spam.spamcop.net 2 | From: 2012gdwu <2012gdwu@posteo.de> 3 | Subject: Postbank Spam 4 | Autocrypt: addr=2012gdwu@posteo.de; keydata= 5 | mDMEXXjwiRYJKwYBBAHaRw8BAQdAmjXRazNXXy5tK05Dwl5mSRbdth9JkQq92V/QVyqjdgm0 6 | I0FybmUgS2VsbGVyIDxhcm5lLmtlbGxlckBwb3N0ZW8uZGU+iJYEExYIAD4WIQR2UN3HoAGx 7 | KI0B7Eih+UCxBQvPLgUCXXjwiQIbAwUJCWYBgAULCQgHAgYVCgkICwIEFgIDAQIeAQIXgAAK 8 | CRCh+UCxBQvPLpPfAP4gs6Oky3+UO2LU2XxweeQO+YEWXK0QtM2+ajzrGaF3HAD+LBfmyB9+ 9 | Wom2KP0CwxUzI4d6zmiAMSKOnGGgzd65igm4OARdePCJEgorBgEEAZdVAQUBAQdAncxZ3Rox 10 | wmvm+/qCkCm9+PU2HmWr08M3qdqkf2L4IngDAQgHiH4EGBYIACYWIQR2UN3HoAGxKI0B7Eih 11 | +UCxBQvPLgUCXXjwiQIbDAUJCWYBgAAKCRCh+UCxBQvPLpQkAQCgYOlOftMNi+sfn+XQvfOc 12 | ULQWp+cgOBMcyVCdpJEQCwD9HBuwuHobl8FPm0PbRtlCn/7GY4WK+Hh4+3BKmhRn8wU= 13 | Message-ID: <1530ae05-33a7-fa40-9473-ca625a14385a@posteo.de> 14 | Date: Mon, 20 Jul 2020 07:35:55 +0200 15 | User-Agent: Mozilla/5.0 (X11; Linux x86_64; 
rv:68.0) Gecko/20100101 16 | Thunderbird/68.10.0 17 | MIME-Version: 1.0 18 | Content-Type: multipart/mixed; 19 | boundary="------------6670F92201FB126ED9472803" 20 | Content-Language: de-DE 21 | 22 | This is a multi-part message in MIME format. 23 | --------------6670F92201FB126ED9472803 24 | Content-Type: text/plain; charset=utf-8 25 | Content-Transfer-Encoding: 7bit 26 | 27 | here you go 28 | 29 | 30 | --------------6670F92201FB126ED9472803 31 | Content-Type: message/rfc822; 32 | name="postbank.eml" 33 | Content-Transfer-Encoding: 7bit 34 | Content-Disposition: attachment; 35 | filename="postbank.eml" 36 | 37 | Return-Path: 38 | Delivered-To: arne.keller@posteo.de 39 | Received: from proxy02.posteo.name ([127.0.0.1]) 40 | by dovecot12 (Dovecot) with LMTP id EaKBGxv9FF+9mwEAJesNpQ 41 | for ; Mon, 20 Jul 2020 04:15:27 +0200 42 | Received: from proxy02.posteo.de ([127.0.0.1]) 43 | by proxy02.posteo.name (Dovecot) with LMTP id 31UFGtHsFF+T4gMAGFAyLg 44 | ; Mon, 20 Jul 2020 04:15:27 +0200 45 | Received: from mailin05.posteo.de (unknown [10.0.1.5]) 46 | by proxy02.posteo.de (Postfix) with ESMTPS id 4B950v2JYGz11fk 47 | for ; Mon, 20 Jul 2020 04:15:27 +0200 (CEST) 48 | Received: from mx03.posteo.de (mailin05.posteo.de [127.0.0.1]) 49 | by mailin05.posteo.de (Postfix) with ESMTPS id 4270120F15 50 | for ; Mon, 20 Jul 2020 04:15:27 +0200 (CEST) 51 | X-Virus-Scanned: amavisd-new at posteo.de 52 | X-Spam-Flag: NO 53 | X-Spam-Score: 2.639 54 | X-Spam-Level: ** 55 | X-Spam-Status: No, score=2.639 tagged_above=-1000 required=8 56 | tests=[AV:Heuristics.Phishing.Email.SpoofedDomain=0.1, ALL_TRUSTED=-1, 57 | FROM_LOCAL_NOVOWEL=0.5, HK_RANDOM_ENVFROM=0.626, HK_RANDOM_FROM=0.999, 58 | HTML_FONT_LOW_CONTRAST=0.001, HTML_IMAGE_ONLY_24=1.282, 59 | HTML_MESSAGE=0.001, HTTPS_HTTP_MISMATCH=0.1, POSTEO_GENERICS_IO=0.01, 60 | T_FILL_THIS_FORM_SHORT=0.01, T_REMOTE_IMAGE=0.01] autolearn=disabled 61 | Received: from mout.web.de (mout.web.de [212.227.15.14]) 62 | by mx03.posteo.de (Postfix) with ESMTPS id 4B950t696Mz10nB 63 | for ; Mon, 20 Jul 2020 04:15:26 +0200 (CEST) 64 | Authentication-Results: mx03.posteo.de; dmarc=none (p=none dis=none) header.from=carcarry.de 65 | Received: from [212.227.15.17] ([212.227.15.17]) by mx-ha.web.de (mxweb010 66 | [212.227.15.17]) with ESMTPS (Nemesis) id 1MRloE-1kQNT22I4w-00T9hm for 67 | ; Mon, 20 Jul 2020 04:15:26 +0200 68 | Received: from mout.kundenserver.de ([212.227.17.24]) by mx-ha.web.de 69 | (mxweb010 [212.227.15.17]) with ESMTPS (Nemesis) id 1MINbE-1k0aRm2Hzw-00EOVM 70 | for <2012gdwu@web.de>; Mon, 20 Jul 2020 04:15:26 +0200 71 | Received: from 217.160.251.109 ([217.160.251.109]) by mrelayeu.kundenserver.de 72 | (mreue107 [212.227.15.183]) with ESMTPSA (Nemesis) id 73 | 1MPoPd-1kBHRt0o2F-00MqkS for <2012gdwu@web.de>; Mon, 20 Jul 2020 04:15:26 74 | +0200 75 | From: "=?utf-8?B?UE9TVEJBTs2fS82f?=" 76 | Subject: BsetSign App : Y7P32-HTXU2-FRDG7 77 | To: "2012gdwu" <2012gdwu@web.de> 78 | Content-Type: multipart/alternative; boundary="QHebeB08yNTYquFAhtQnxv=_cOW4Xd528c" 79 | MIME-Version: 1.0 80 | Date: Mon, 20 Jul 2020 02:15:26 +0000 81 | Message-ID: <1M3lHZ-1jyAPt0pTn-000u1I@mrelayeu.kundenserver.de> 82 | X-Provags-ID: V03:K1:68TECBVA88ZKh8HcSl/N+ElwlecL1tc+1AuDDyqm9em66WO295R 83 | IfuHqA9uG7+Vlyr99v+OneGltnr43KfsgRKj9GgOpDj2QelHphKFGPILAvvsQ8vOq6ucC2W 84 | BW3NEOh3JhitB6o4xLEmj+dbivC0ie728/cPMcjj6TwyBzw5nT1or8mBZWoEMSF/zcu+PIr 85 | gGpFY2puzzURN4oKX82/w== 86 | X-Spam-Flag: NO 87 | X-UI-Out-Filterresults: notjunk:1;V03:K0:c01ZANnvlk8=:ouSMGue72FUx2PJOSNnmEW 88 | 
qI8A89gf6q3aAdJBhLX1Bhd70xio64ljpha9X5ArOYg6Q2RH1JYyvfBSMoTo3HMy37H3L8kaq 89 | ReRCdSPOMD8+llZ/rRpPLl+7PofGOv+Hu3UO7gzgm9v0YqwLZIwh9P2w9TIu+GqVJWeDdmxrs 90 | RDPeHY8lsRL+8AFeSGNiWBYMEHDxKofTqS5Zh7mal1Bm4JbgEEIP36V4oL3c6V1olMHQZzEH9 91 | 7D0T8U6LyLyfSbuu5M6QN2FZ+F6IDJNDUG1uwNt9K12ESY6TweMR3xInFabiZ9fMPmrjPaNwW 92 | hlyKg67tDYL2lfk2fpa/LbhLnlfKEDqSvkgK54CZh+xbIQetju66cZUEFQyCIcGdAOWI8+nty 93 | FdbNUzxhNpZTPBrA7H95gRuc0u2GJBfZZsxdp46jpBwG65yqmJ32pkJrATo8CNbBO9A6hpdyL 94 | UNu5bavZBJp9dsyY6Cnm6vMOIjJ8qMy/vNkrtRXNWBrnVHhuQZ3B+osG8XWLiyq7s4hFOwDxY 95 | WLRgjKL6HgIj+2DLParwiuSsX8TVy5+WhxDUou0UJDzD3C1JmYiryTlo4Vu4CIZFXkgAuAsEq 96 | c55M6L2eUmD3xQNaqgMEJFksT2qXWaSb2Qw6HM7mtLBbSUhuWtSv2oeVrNwgx8XWexWYYZYFv 97 | KAZzICpkVhxpYIntoKRiDtQZxBDejPwGmne2iG81rn34pGJwOOYojf9dFghodE5bZEqVh6KbA 98 | f/38x9FIoYewzA2WuyngX/bXTdkLQM49W1vdlF5DQOlgYuM8Ni7NeJG888VhDZxcUn6vIIJs3 99 | xH0jOWrWCUz0gK9uyyagjcfdXr54Zv1E7i936CTlRq5QnDKN2C9jQFH5ymD4G1W5zX6Xj/05O 100 | M7VaU9Y3mvOM/+82zsKc5zJOFOf9MoI5JBhnPjHWeqaJgpYhNoKgGvPo3QfZFwzk/MHH2PgB1 101 | PLGvjSE8u/cpYeGhJdzTXM00J9ai5yGRNFD71zHoHBOFGCpmZVnJJ8SD+qUd4K4BfSD+DJ5Qd 102 | t1wsCpH5bgodnXgMcN6Zj0q3P/ODk3dnah1hsYMyIWDBFZ0cTlp2QkYhAKZh1HM5WcfSc5UwU 103 | SrcK9HHiG7BKOFYA1r6Rx5YYqwGWeGxr9mlH7MLyfCwI8PlWtfeB7Pj4eEI1hLy9GMnHBCJDj 104 | W8o1yDeE54rgWHR7CtIF6w+qF+quA3ZdwVSPOHwQeH7vS4OaJjeEyeeT4YOJdIMI7UknEasAG 105 | LfMS/PKWx7+YcUNaz0xvO70NwZj1FKJuWqDS6ZTciMSvGkEFTWVOqn5nPlHi8hDbBTVn70aPa 106 | BQi3U68hgdDpJIHlVLLvRcaCYYly3L60NQBgJroag4fRiIvDUSXfDatrDYOv+L4xBYdB3GP+s 107 | wqtsPY82YOwXP5KlRMPVEZcuWX5tWiOuaNjePbEkXpE2iQZUqfkDQTYNUGZR+TTBqHOWjO7R3 108 | hORQB0gOwe85gZv80G1EL32EtRjVxJxQfrHGPCGXb8HRXbvGGV3Xu3wZEE8iuJngBUJtWeDBq 109 | q61rYwZxVuml72lfRM6Lo+OGLAsyqvobxujY9BHpokZH4FNlUstjUoPANTGoAhM+MyQb0fSAV 110 | 8HA/r6n0oJh0B8+2AxJvVokbhEbL/RlJIZIYpCeRceeA+jjBaR7EvuglUoLN3CcB9CrdDH/qz 111 | ymHzEjPVnFar3/sqRjeKyIk71z4yotOKCPQcdD1gTbYWehZiIJwAlDFSpfPdFTQLOJMWd3wuD 112 | 0mHLep6tLtCY+hjhCYWlTyKKQ8CWiBWPTql21bPp7XVWCfc+4u8kZi5Y3dg3pvpSwwmcyRisX 113 | +7+8a+pBzN4VOEuX+dzglKDrNd6h2OL0tBMnk1yqAV27dX9cMRrO941IvtiaZO90BjZtV92oP 114 | XkGxvKnGQuynHus/3yblaw== 115 | 116 | This is a multi-part message in MIME format 117 | 118 | --QHebeB08yNTYquFAhtQnxv=_cOW4Xd528c 119 | Content-Type: text/plain; charset="utf-8" 120 | Content-Transfer-Encoding: quoted-printable 121 | Content-Disposition: inline 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | Sehr geehrter Herr / Frau =E2=80=A6, 130 | 131 | Ab dem 20. Jul 2020 aktualisiert die Postbank alle BestSign-Anwendung= 132 | en. 133 | 134 | 135 | 136 | =C3=96ffnen Sie den unten stehenden Aktivierungslink, um am Upgrade t= 137 | eilzunehmen. Verkn=C3=BCpfung 138 | 139 | 140 | 141 | 142 | https://meine.postbank.de/#/login 143 | 144 | 145 | 146 | 147 | Wir empfehlen dringend, dieses Upgrade durchzuf=C3=BChren. 148 | 149 | Reundliche Gr=C3=BC=C3=9Fe, 150 | 151 | =C2=A9 2020 Postbank=E2=80=93 eine Niederlassung der Deutsche Bank AG= 152 | 153 | 154 | Hypnotiseur/zertifizierter Hypnosecoach (DVH) 155 | Burnoutpr=C3=A4ventionscoach 156 | Modeberater f=C3=BCr Ma=C3=9Fhemden/Ma=C3=9Fblusen 157 | Kurs/Seminarleiter Waldbaden/Waldcoach 158 | Am Wiesengrund 5 159 | 24980 Schafflund 160 | Tel.: 04639-98475 161 | Mob.: 015117317305 162 | Home : www.hypnosepraxis-im-norden.de 163 | Home : www.masshemden-im-norden.de 164 | Home : www.waldbaden-zwischen-den-meeren.de 165 | 166 | 167 | --QHebeB08yNTYquFAhtQnxv=_cOW4Xd528c 168 | Content-Type: text/html; charset="utf-8" 169 | Content-Transfer-Encoding: quoted-printable 170 | Content-Disposition: inline 171 | 172 |

3D""


175 |
176 |
 Sehr geehrter Herr / Frau =E2=80=A6,
177 |
 Ab dem 20. Jul 2020 aktualisiert die Postbank alle BestSign= 178 | -Anwendungen.

179 |
 =C3=96ffnen Sie den unten stehenden Aktivierungslink, um am= 180 | Upgrade teilzunehmen. Verkn=C3=BCpfung

181 |

183 |
 Wir empfehlen dringend, dieses Upgrade durchzuf=C3=BChren.<= 184 | /div> 185 |
 Reundliche Gr=C3=BC=C3=9Fe,
186 |
 =C2=A9 2020 Postbank=E2=80= 187 | =93 eine Niederlassung der Deutsche Bank AG

Hypnotiseur/zertifizierter Hypnosecoach (DVH)= 189 |
Burnoutpr=C3=A4= 190 | ventionscoach
Mo= 191 | deberater f=C3=BCr Ma=C3=9Fhemden/Ma=C3=9Fblusen
Kurs/Seminarleiter Waldbaden/Waldcoac= 193 | h
Am Wiesengrund= 194 | 5
24980 Schaffl= 195 | und
Tel.: 04639-= 196 | 98475
Mob.: 0151= 197 | 17317305
Home : = 198 | www.hypnos= 201 | epraxis-im-norden.de
Home : www.masshemden-im-norden.de
Home : www.waldbaden-zwischen-den-meeren.de<= 210 | /div> 211 |
212 | 213 | 214 | --QHebeB08yNTYquFAhtQnxv=_cOW4Xd528c-- 215 | 216 | 217 | --------------6670F92201FB126ED9472803-- 218 | -------------------------------------------------------------------------------- /exampledir/screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/screenshot.png -------------------------------------------------------------------------------- /exampledir/short.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/short.pdf -------------------------------------------------------------------------------- /exampledir/sqlitedb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/sqlitedb -------------------------------------------------------------------------------- /exampledir/tar/exampledir.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/tar/exampledir.tar.gz -------------------------------------------------------------------------------- /exampledir/tar/test.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/tar/test.tar -------------------------------------------------------------------------------- /exampledir/tar/test.tar.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/tar/test.tar.bz2 -------------------------------------------------------------------------------- /exampledir/tar/test.tar.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/tar/test.tar.zip -------------------------------------------------------------------------------- /exampledir/test.djvu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/test.djvu -------------------------------------------------------------------------------- /exampledir/test.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/test.zip -------------------------------------------------------------------------------- /exampledir/test/github_email.eml: -------------------------------------------------------------------------------- 1 | Return-Path: 2 | Date: Mon, 31 Jul 2023 01:34:57 -0700 3 | From: "github-actions[bot]" 4 | To: KeYProject/key 5 | In-Reply-To: 6 | References: 7 | Subject: Re: [KeYProject/key] Fix more UI bugs (PR #3232) 8 | Mime-Version: 1.0 9 | Content-Type: multipart/alternative; 10 | boundary="--==_mimepart_64c77231630fb_12bafdd0c012685"; 11 | charset=UTF-8 12 | Content-Transfer-Encoding: 7bit 13 | Precedence: list 14 | X-GitHub-Sender: github-actions[bot] 
15 | X-GitHub-Recipient: FliegendeWurst 16 | X-GitHub-Reason: author 17 | List-ID: KeYProject/key 18 | List-Archive: https://github.com/KeYProject/key 19 | X-Auto-Response-Suppress: All 20 | destinations: 2012gdwu+github@posteo.de 21 | X-GitHub-Recipient-Address: 2012gdwu+github@posteo.de 22 | 23 | 24 | ----==_mimepart_64c77231630fb_12bafdd0c012685 25 | Content-Type: text/plain; 26 | charset=UTF-8 27 | Content-Transfer-Encoding: 7bit 28 | 29 | Thank you for your contribution. 30 | 31 | The test artifacts are available on [Artiweb](https://keyproject.github.io/artiweb/3232/). 32 | The newest artifact is [here](https://keyproject.github.io/artiweb/3232/833812796/). 33 | 34 | -- 35 | Reply to this email directly or view it on GitHub: 36 | https://github.com/KeYProject/key/pull/3232#issuecomment-1657918122 37 | You are receiving this because you authored the thread. 38 | 39 | Message ID: 40 | ----==_mimepart_64c77231630fb_12bafdd0c012685 41 | Content-Type: text/html; 42 | charset=UTF-8 43 | Content-Transfer-Encoding: 7bit 44 | 45 |

46 |

Thank you for your contribution.

47 |

The test artifacts are available on Artiweb.
48 | The newest artifact is here.

49 | 50 |


Reply to this email directly, view it on GitHub, or unsubscribe.
You are receiving this because you authored the thread.Message ID: <KeYProject/key/pull/3232/c1657918122@github.com>

51 | 69 | ----==_mimepart_64c77231630fb_12bafdd0c012685-- 70 | -------------------------------------------------------------------------------- /exampledir/test/hello.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/test/hello.gz -------------------------------------------------------------------------------- /exampledir/test/hello.sqlite3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/test/hello.sqlite3 -------------------------------------------------------------------------------- /exampledir/test/hello.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/test/hello.tar -------------------------------------------------------------------------------- /exampledir/test/only-seek-zip.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/test/only-seek-zip.zip -------------------------------------------------------------------------------- /exampledir/test/short.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/test/short.pdf -------------------------------------------------------------------------------- /exampledir/test/short.pdf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/test/short.pdf.gz -------------------------------------------------------------------------------- /exampledir/test/test.mbx: -------------------------------------------------------------------------------- 1 | From 2 | Message-ID: <55a23774-4da7-057c-77a7-ec390fed487b@posteo.de> 3 | Date: Mon, 27 Feb 2023 12:05:46 +0100 4 | MIME-Version: 1.0 5 | User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101 6 | Thunderbird/102.8.0 7 | From: Arne Keller <2012gdwu@web.de> 8 | Subject: From encoding test 9 | To: arne.keller@posteo.de 10 | Content-Language: de-DE 11 | X-Enigmail-Draft-Status: N00200 12 | X-Mozilla-Draft-Info: internal/draft; vcard=0; receipt=0; DSN=0; uuencode=0; 13 | attachmentreminder=0; deliveryformat=0 14 | X-Identity-Key: id2 15 | Fcc: imap://2012gdwu@imap.web.de/Gesendet 16 | Content-Type: text/html; charset=UTF-8 17 | Content-Transfer-Encoding: 7bit 18 | 19 | 20 | 21 | 22 | 23 | 24 |

>From

25 |

Another word >From
26 |

27 | 28 | 29 | From 30 | Message-ID: <55a23774-4da7-057c-77a7-ec390fed487b@posteo.de> 31 | Date: Mon, 27 Feb 2023 12:06:56 +0100 32 | MIME-Version: 1.0 33 | User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101 34 | Thunderbird/102.8.0 35 | From: Arne Keller <2012gdwu@web.de> 36 | Subject: From encoding test 37 | To: arne.keller@posteo.de 38 | Content-Language: de-DE 39 | X-Enigmail-Draft-Status: N00200 40 | X-Mozilla-Draft-Info: internal/draft; vcard=0; receipt=0; DSN=0; uuencode=0; 41 | attachmentreminder=0; deliveryformat=1 42 | X-Identity-Key: id2 43 | Fcc: imap://2012gdwu@imap.web.de/Gesendet 44 | Content-Type: text/html; charset=UTF-8 45 | Content-Transfer-Encoding: 7bit 46 | 47 | 48 | 49 | 50 | 51 | 52 |

>From

53 |

Another word >From
54 |

55 | 56 | 57 | From - Mon Feb 27 12:06:57 2023 58 | X-Mozilla-Status: 0001 59 | X-Mozilla-Status2: 00000000 60 | Message-ID: <55a23774-4da7-057c-77a7-ec390fed487b@posteo.de> 61 | Date: Mon, 27 Feb 2023 12:06:56 +0100 62 | MIME-Version: 1.0 63 | User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101 64 | Thunderbird/102.8.0 65 | From: Arne Keller <2012gdwu@web.de> 66 | Subject: From encoding test 67 | To: arne.keller@posteo.de 68 | Content-Language: de-DE 69 | X-Enigmail-Draft-Status: N00200 70 | X-Mozilla-Draft-Info: internal/draft; vcard=0; receipt=0; DSN=0; uuencode=0; 71 | attachmentreminder=0; deliveryformat=1 72 | X-Identity-Key: id2 73 | Fcc: imap://2012gdwu@imap.web.de/Gesendet 74 | Content-Type: text/html; charset=UTF-8 75 | Content-Transfer-Encoding: 7bit 76 | 77 | 78 | 79 | 80 | 81 | 82 |

>From

83 |

Another word >From
84 |

85 | 86 | 87 | -------------------------------------------------------------------------------- /exampledir/test/twoblankpages.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/test/twoblankpages.pdf -------------------------------------------------------------------------------- /exampledir/wasteland.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/wasteland.docx -------------------------------------------------------------------------------- /exampledir/wasteland.epub: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/wasteland.epub -------------------------------------------------------------------------------- /exampledir/wasteland.mkv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/wasteland.mkv -------------------------------------------------------------------------------- /exampledir/wasteland.mobi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/wasteland.mobi -------------------------------------------------------------------------------- /exampledir/wasteland.odt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/wasteland.odt -------------------------------------------------------------------------------- /exampledir/wasteland.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/wasteland.pdf -------------------------------------------------------------------------------- /flake.lock: -------------------------------------------------------------------------------- 1 | { 2 | "nodes": { 3 | "advisory-db": { 4 | "flake": false, 5 | "locked": { 6 | "lastModified": 1740407442, 7 | "narHash": "sha256-EGzWKm5cUDDJbwVzxSB4N/+CIVycwOG60Gh5f1Vp7JM=", 8 | "owner": "rustsec", 9 | "repo": "advisory-db", 10 | "rev": "2e25d9665f10de885c81a9fb9d51a289f625b05f", 11 | "type": "github" 12 | }, 13 | "original": { 14 | "owner": "rustsec", 15 | "repo": "advisory-db", 16 | "type": "github" 17 | } 18 | }, 19 | "crane": { 20 | "locked": { 21 | "lastModified": 1739936662, 22 | "narHash": "sha256-x4syUjNUuRblR07nDPeLDP7DpphaBVbUaSoeZkFbGSk=", 23 | "owner": "ipetkov", 24 | "repo": "crane", 25 | "rev": "19de14aaeb869287647d9461cbd389187d8ecdb7", 26 | "type": "github" 27 | }, 28 | "original": { 29 | "owner": "ipetkov", 30 | "repo": "crane", 31 | "type": "github" 32 | } 33 | }, 34 | "flake-compat": { 35 | "flake": false, 36 | "locked": { 37 | "lastModified": 1696426674, 38 | "narHash": "sha256-kvjfFW7WAETZlt09AgDn1MrtKzP7t90Vf7vypd3OL1U=", 39 | "owner": "edolstra", 40 | "repo": "flake-compat", 41 | "rev": "0f9255e01c2351cc7d116c072cb317785dd33b33", 42 | "type": "github" 43 | }, 44 | "original": { 45 | "owner": "edolstra", 46 | "repo": 
"flake-compat", 47 | "type": "github" 48 | } 49 | }, 50 | "flake-utils": { 51 | "inputs": { 52 | "systems": "systems" 53 | }, 54 | "locked": { 55 | "lastModified": 1731533236, 56 | "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=", 57 | "owner": "numtide", 58 | "repo": "flake-utils", 59 | "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b", 60 | "type": "github" 61 | }, 62 | "original": { 63 | "owner": "numtide", 64 | "repo": "flake-utils", 65 | "type": "github" 66 | } 67 | }, 68 | "gitignore": { 69 | "inputs": { 70 | "nixpkgs": [ 71 | "pre-commit-hooks", 72 | "nixpkgs" 73 | ] 74 | }, 75 | "locked": { 76 | "lastModified": 1709087332, 77 | "narHash": "sha256-HG2cCnktfHsKV0s4XW83gU3F57gaTljL9KNSuG6bnQs=", 78 | "owner": "hercules-ci", 79 | "repo": "gitignore.nix", 80 | "rev": "637db329424fd7e46cf4185293b9cc8c88c95394", 81 | "type": "github" 82 | }, 83 | "original": { 84 | "owner": "hercules-ci", 85 | "repo": "gitignore.nix", 86 | "type": "github" 87 | } 88 | }, 89 | "nixpkgs": { 90 | "locked": { 91 | "lastModified": 1740711547, 92 | "narHash": "sha256-qvixVB2cFGOX/B//KbjKUndrMbIDEGBx7xphitqnvr8=", 93 | "owner": "NixOS", 94 | "repo": "nixpkgs", 95 | "rev": "2ca95eef7e3b33ea8b858ed025e492373aca8106", 96 | "type": "github" 97 | }, 98 | "original": { 99 | "owner": "NixOS", 100 | "repo": "nixpkgs", 101 | "type": "github" 102 | } 103 | }, 104 | "pre-commit-hooks": { 105 | "inputs": { 106 | "flake-compat": "flake-compat", 107 | "gitignore": "gitignore", 108 | "nixpkgs": [ 109 | "nixpkgs" 110 | ] 111 | }, 112 | "locked": { 113 | "lastModified": 1737465171, 114 | "narHash": "sha256-R10v2hoJRLq8jcL4syVFag7nIGE7m13qO48wRIukWNg=", 115 | "owner": "cachix", 116 | "repo": "pre-commit-hooks.nix", 117 | "rev": "9364dc02281ce2d37a1f55b6e51f7c0f65a75f17", 118 | "type": "github" 119 | }, 120 | "original": { 121 | "owner": "cachix", 122 | "repo": "pre-commit-hooks.nix", 123 | "type": "github" 124 | } 125 | }, 126 | "root": { 127 | "inputs": { 128 | "advisory-db": "advisory-db", 129 | "crane": "crane", 130 | "flake-utils": "flake-utils", 131 | "nixpkgs": "nixpkgs", 132 | "pre-commit-hooks": "pre-commit-hooks", 133 | "rust-overlay": "rust-overlay" 134 | } 135 | }, 136 | "rust-overlay": { 137 | "inputs": { 138 | "nixpkgs": [ 139 | "nixpkgs" 140 | ] 141 | }, 142 | "locked": { 143 | "lastModified": 1740709839, 144 | "narHash": "sha256-4dF++MXIXna/AwlZWDKr7bgUmY4xoEwvkF1GewjNrt0=", 145 | "owner": "oxalica", 146 | "repo": "rust-overlay", 147 | "rev": "b4270835bf43c6f80285adac6f66a26d83f0f277", 148 | "type": "github" 149 | }, 150 | "original": { 151 | "owner": "oxalica", 152 | "repo": "rust-overlay", 153 | "type": "github" 154 | } 155 | }, 156 | "systems": { 157 | "locked": { 158 | "lastModified": 1681028828, 159 | "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", 160 | "owner": "nix-systems", 161 | "repo": "default", 162 | "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", 163 | "type": "github" 164 | }, 165 | "original": { 166 | "owner": "nix-systems", 167 | "repo": "default", 168 | "type": "github" 169 | } 170 | } 171 | }, 172 | "root": "root", 173 | "version": 7 174 | } 175 | -------------------------------------------------------------------------------- /flake.nix: -------------------------------------------------------------------------------- 1 | { 2 | description = "ripgrep, but also search in PDFs, E-Books, Office documents, zip, tar.gz, etc."; 3 | 4 | inputs = { 5 | nixpkgs.url = "github:NixOS/nixpkgs"; 6 | 7 | crane = { 8 | url = "github:ipetkov/crane"; 9 | }; 10 | 11 | 
flake-utils.url = "github:numtide/flake-utils"; 12 | 13 | rust-overlay = { 14 | url = "github:oxalica/rust-overlay"; 15 | inputs.nixpkgs.follows = "nixpkgs"; 16 | }; 17 | 18 | advisory-db = { 19 | url = "github:rustsec/advisory-db"; 20 | flake = false; 21 | }; 22 | 23 | pre-commit-hooks = { 24 | url = "github:cachix/pre-commit-hooks.nix"; 25 | inputs.nixpkgs.follows = "nixpkgs"; 26 | }; 27 | }; 28 | 29 | outputs = { 30 | self, 31 | nixpkgs, 32 | crane, 33 | flake-utils, 34 | rust-overlay, 35 | advisory-db, 36 | pre-commit-hooks, 37 | }: 38 | flake-utils.lib.eachDefaultSystem (system: let 39 | pkgs = import nixpkgs { 40 | inherit system; 41 | overlays = [(import rust-overlay)]; 42 | }; 43 | 44 | craneLib = 45 | (crane.mkLib pkgs).overrideToolchain 46 | (p: 47 | (p.rust-bin.fromRustupToolchainFile ./rust-toolchain.toml).override { 48 | extensions = [ 49 | "rust-analyzer" 50 | "rust-src" 51 | "rustfmt" 52 | ]; 53 | }); 54 | 55 | src = pkgs.lib.cleanSourceWith { 56 | src = craneLib.path ./.; 57 | filter = pkgs.lib.cleanSourceFilter; 58 | }; 59 | 60 | nativeBuildInputs = pkgs.lib.optionals pkgs.stdenv.isDarwin [ 61 | # Additional darwin specific inputs can be set here 62 | pkgs.libiconv 63 | ]; 64 | 65 | runtimeInputs = with pkgs; [ffmpeg pandoc poppler_utils ripgrep zip]; 66 | 67 | # Build *just* the cargo dependencies, so we can reuse 68 | # all of that work (e.g. via cachix) when running in CI 69 | cargoArtifacts = 70 | craneLib.buildDepsOnly {inherit src nativeBuildInputs;}; 71 | 72 | # Build the actual crate itself, reusing the dependency 73 | # artifacts from above. 74 | rgaBinary = craneLib.buildPackage { 75 | inherit cargoArtifacts src nativeBuildInputs; 76 | buildInputs = runtimeInputs; # needed for tests 77 | }; 78 | 79 | # Provide a shell script wrapping the Rust binary together with its runtime dependencies. 80 | rga = pkgs.pkgs.writeShellApplication { 81 | name = "rga"; 82 | text = ''rga "$@"''; 83 | runtimeInputs = runtimeInputs ++ [rgaBinary]; 84 | }; 85 | 86 | pre-commit = pre-commit-hooks.lib."${system}".run; 87 | in { 88 | # `nix flake check` 89 | checks = { 90 | # Build the crate as part of `nix flake check` for convenience 91 | inherit rgaBinary; 92 | 93 | # Run clippy (and deny all warnings) on the crate source, 94 | # again, reusing the dependency artifacts from above. 95 | # 96 | # Note that this is done as a separate derivation so that 97 | # we can block the CI if there are issues here, but not 98 | # prevent downstream consumers from building our crate by itself. 99 | rga-clippy = craneLib.cargoClippy { 100 | inherit cargoArtifacts src; 101 | cargoClippyExtraArgs = "--all-targets -- --deny warnings"; 102 | }; 103 | 104 | rga-doc = craneLib.cargoDoc {inherit cargoArtifacts src;}; 105 | 106 | # Audit dependencies 107 | rga-audit = craneLib.cargoAudit {inherit src advisory-db;}; 108 | 109 | # Run tests with cargo-nextest. 
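# (Illustrative note, not part of the upstream flake: every attribute in this
# `checks` set is exposed as a flake check and can also be built on its own,
# e.g. `nix build .#checks.x86_64-linux.rga-nextest --print-build-logs`,
# with `x86_64-linux` replaced by your actual system.)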
110 | rga-nextest = craneLib.cargoNextest { 111 | inherit cargoArtifacts src nativeBuildInputs; 112 | buildInputs = runtimeInputs; # needed for tests 113 | partitions = 1; 114 | partitionType = "count"; 115 | }; 116 | 117 | pre-commit = pre-commit { 118 | src = ./.; 119 | hooks = { 120 | alejandra.enable = true; 121 | rustfmt = { 122 | enable = true; 123 | packageOverrides.cargo = craneLib.cargo; 124 | packageOverrides.rustfmt = craneLib.rustfmt; 125 | }; 126 | typos = { 127 | enable = true; 128 | settings = { 129 | exclude = "exampledir/*"; 130 | }; 131 | }; 132 | }; 133 | }; 134 | }; 135 | 136 | # `nix build` 137 | packages = { 138 | inherit rgaBinary rga; 139 | default = rga; # `nix build` 140 | }; 141 | 142 | # `nix run` 143 | apps.default = flake-utils.lib.mkApp {drv = rga;}; 144 | 145 | # `nix develop` 146 | devShells.default = craneLib.devShell { 147 | inherit nativeBuildInputs; 148 | inherit (self.checks.${system}.pre-commit) shellHook; 149 | inputsFrom = builtins.attrValues self.checks; 150 | buildInputs = self.checks.${system}.pre-commit.enabledPackages; 151 | packages = runtimeInputs; 152 | }; 153 | }); 154 | } 155 | -------------------------------------------------------------------------------- /rust-toolchain.toml: -------------------------------------------------------------------------------- 1 | [toolchain] 2 | channel = "stable" 3 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | edition = "2018" 2 | -------------------------------------------------------------------------------- /src/adapted_iter.rs: -------------------------------------------------------------------------------- 1 | use std::pin::Pin; 2 | 3 | use tokio_stream::Stream; 4 | 5 | use crate::adapters::AdaptInfo; 6 | 7 | pub trait AdaptedFilesIter: Stream<Item = anyhow::Result<AdaptInfo>> + Send {} 8 | impl<T> AdaptedFilesIter for T where T: Stream<Item = anyhow::Result<AdaptInfo>> + Send {} 9 | 10 | pub type AdaptedFilesIterBox = Pin<Box<dyn AdaptedFilesIter>>; 11 | 12 | pub fn one_file(ai: AdaptInfo) -> AdaptedFilesIterBox { 13 | Box::pin(tokio_stream::once(Ok(ai))) 14 | } 15 | -------------------------------------------------------------------------------- /src/adapters.rs: -------------------------------------------------------------------------------- 1 | pub mod custom; 2 | pub mod decompress; 3 | pub mod ffmpeg; 4 | pub mod mbox; 5 | pub mod postproc; 6 | use std::sync::Arc; 7 | pub mod sqlite; 8 | pub mod tar; 9 | pub mod writing; 10 | pub mod zip; 11 | use crate::{adapted_iter::AdaptedFilesIterBox, config::RgaConfig, matching::*}; 12 | use anyhow::{Context, Result, format_err}; 13 | use async_trait::async_trait; 14 | use custom::BUILTIN_SPAWNING_ADAPTERS; 15 | use custom::CustomAdapterConfig; 16 | use log::*; 17 | use tokio::io::AsyncRead; 18 | 19 | use core::fmt::Debug; 20 | use std::borrow::Cow; 21 | use std::collections::HashMap; 22 | use std::iter::Iterator; 23 | use std::path::PathBuf; 24 | use std::pin::Pin; 25 | 26 | use self::postproc::PostprocPageBreaks; 27 | 28 | pub type ReadBox = Pin<Box<dyn AsyncRead + Send>>; 29 | pub struct AdapterMeta { 30 | /// unique short name of this adapter (a-z0-9 only) 31 | pub name: String, 32 | /// version identifier. used to key cache entries, change if your output format changes 33 | pub version: i32, 34 | pub description: String, 35 | /// indicates whether this adapter can descend (=call rga_preproc again). 
if true, the cache key needs to include the list of active adapters 36 | pub recurses: bool, 37 | /// list of matchers (interpreted as a OR b OR ...) 38 | pub fast_matchers: Vec<FastFileMatcher>, 39 | /// list of matchers when we have mime type detection active (interpreted as ORed) 40 | /// warning: this *overrides* the fast matchers 41 | pub slow_matchers: Option<Vec<FileMatcher>>, 42 | /// if true, slow_matchers is merged with fast matchers if accurate is enabled 43 | /// for example, in sqlite you want this disabled since the db extension can mean other things and the mime type matching is very accurate for sqlite. 44 | /// but for tar you want it enabled, since the tar extension is very accurate but the tar mime matcher can have false negatives 45 | pub keep_fast_matchers_if_accurate: bool, 46 | // if true, adapter is only used when user lists it in `--rga-adapters` 47 | pub disabled_by_default: bool, 48 | } 49 | impl AdapterMeta { 50 | // todo: this is pretty ugly 51 | pub fn get_matchers<'a>( 52 | &'a self, 53 | slow: bool, 54 | ) -> Box<dyn Iterator<Item = Cow<'a, FileMatcher>> + 'a> { 55 | match ( 56 | slow, 57 | self.keep_fast_matchers_if_accurate, 58 | &self.slow_matchers, 59 | ) { 60 | (true, false, Some(sm)) => Box::new(sm.iter().map(Cow::Borrowed)), 61 | (true, true, Some(sm)) => Box::new( 62 | sm.iter().map(Cow::Borrowed).chain( 63 | self.fast_matchers 64 | .iter() 65 | .map(|e| Cow::Owned(FileMatcher::Fast(e.clone()))), 66 | ), 67 | ), 68 | // don't have slow matchers or slow matching disabled 69 | (true, _, None) | (false, _, _) => Box::new( 70 | self.fast_matchers 71 | .iter() 72 | .map(|e| Cow::Owned(FileMatcher::Fast(e.clone()))), 73 | ), 74 | } 75 | } 76 | } 77 | 78 | pub trait GetMetadata { 79 | fn metadata(&self) -> &AdapterMeta; 80 | } 81 | 82 | #[async_trait] 83 | pub trait FileAdapter: GetMetadata + Send + Sync { 84 | /// adapt a file. 85 | /// 86 | /// detection_reason is the Matcher that was used to identify this file. Unless --rga-accurate was given, it is always a FastMatcher 87 | async fn adapt( 88 | &self, 89 | a: AdaptInfo, 90 | detection_reason: &FileMatcher, 91 | ) -> Result<AdaptedFilesIterBox>; 92 | } 93 | 94 | pub struct AdaptInfo { 95 | /// file path. May not be an actual file on the file system (e.g. in an archive). Used for matching file extensions. 96 | pub filepath_hint: PathBuf, 97 | /// true if filepath_hint is an actual file on the file system 98 | pub is_real_file: bool, 99 | /// depth at which this file is in archives. 0 for real filesystem 100 | pub archive_recursion_depth: i32, 101 | /// stream to read the file from. 
can be from a file or from some decoder 102 | pub inp: ReadBox, 103 | /// prefix every output line with this string to better indicate the file's location if it is in some archive 104 | pub line_prefix: String, 105 | pub postprocess: bool, 106 | pub config: RgaConfig, 107 | } 108 | 109 | /// (enabledAdapters, disabledAdapters) 110 | type AdaptersTuple = (Vec<Arc<dyn FileAdapter>>, Vec<Arc<dyn FileAdapter>>); 111 | 112 | pub fn get_all_adapters(custom_adapters: Option<Vec<CustomAdapterConfig>>) -> AdaptersTuple { 113 | // order in descending priority 114 | let mut adapters: Vec<Arc<dyn FileAdapter>> = vec![]; 115 | if let Some(custom_adapters) = custom_adapters { 116 | for adapter_config in custom_adapters { 117 | adapters.push(Arc::new(adapter_config.to_adapter())); 118 | } 119 | } 120 | 121 | let internal_adapters: Vec<Arc<dyn FileAdapter>> = vec![ 122 | Arc::new(PostprocPageBreaks::default()), 123 | Arc::new(ffmpeg::FFmpegAdapter::new()), 124 | Arc::new(zip::ZipAdapter::new()), 125 | Arc::new(decompress::DecompressAdapter::new()), 126 | Arc::new(mbox::MboxAdapter::new()), 127 | Arc::new(tar::TarAdapter::new()), 128 | Arc::new(sqlite::SqliteAdapter::new()), 129 | ]; 130 | adapters.extend( 131 | BUILTIN_SPAWNING_ADAPTERS 132 | .iter() 133 | .map(|e| -> Arc<dyn FileAdapter> { Arc::new(e.to_adapter()) }), 134 | ); 135 | adapters.extend(internal_adapters); 136 | 137 | adapters 138 | .into_iter() 139 | .partition(|e| !e.metadata().disabled_by_default) 140 | } 141 | 142 | /** 143 | * filter adapters by given names: 144 | * 145 | * - "" means use default enabled adapter list 146 | * - "a,b" means use adapters a,b 147 | * - "-a,b" means use default list except for a and b 148 | * - "+a,b" means use default list but also a and b (a,b will be prepended to the list so given higher priority) 149 | */ 150 | pub fn get_adapters_filtered<T: AsRef<str>>( 151 | custom_adapters: Option<Vec<CustomAdapterConfig>>, 152 | adapter_names: &[T], 153 | ) -> Result<Vec<Arc<dyn FileAdapter>>> { 154 | let (def_enabled_adapters, def_disabled_adapters) = get_all_adapters(custom_adapters); 155 | let adapters = if !adapter_names.is_empty() { 156 | let adapters_map: HashMap<_, _> = def_enabled_adapters 157 | .iter() 158 | .chain(def_disabled_adapters.iter()) 159 | .map(|e| (e.metadata().name.clone(), e.clone())) 160 | .collect(); 161 | let mut adapters = vec![]; 162 | let mut subtractive = false; 163 | let mut additive = false; 164 | for (i, name) in adapter_names.iter().enumerate() { 165 | let mut name = name.as_ref(); 166 | if i == 0 && (name.starts_with('-')) { 167 | subtractive = true; 168 | name = &name[1..]; 169 | adapters = def_enabled_adapters.clone(); 170 | } else if i == 0 && (name.starts_with('+')) { 171 | name = &name[1..]; 172 | adapters = def_enabled_adapters.clone(); 173 | additive = true; 174 | } 175 | if subtractive { 176 | let inx = adapters 177 | .iter() 178 | .position(|a| a.metadata().name == name) 179 | .ok_or_else(|| format_err!("Could not remove adapter {}: Not in list", name))?; 180 | adapters.remove(inx); 181 | } else { 182 | let adapter = adapters_map 183 | .get(name) 184 | .ok_or_else(|| { 185 | format_err!( 186 | "Unknown adapter: \"{}\". Known adapters: {}", 187 | name, 188 | adapters_map 189 | .keys() 190 | .map(|e| e.as_ref()) 191 | .collect::<Vec<&str>>() 192 | .join(", ") 193 | ) 194 | })? 
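// (Illustrative examples of the name-filter syntax documented above, assuming
// the default adapter set: `--rga-adapters=ffmpeg,zip` enables exactly those
// two adapters; `--rga-adapters=-decompress` takes the default list and
// removes the decompress adapter; `--rga-adapters=+mail` prepends the mail
// adapter, which is disabled by default, to the default list.)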
195 | .clone(); 196 | if additive { 197 | adapters.insert(0, adapter); 198 | } else { 199 | adapters.push(adapter); 200 | } 201 | } 202 | } 203 | adapters 204 | } else { 205 | def_enabled_adapters 206 | }; 207 | debug!( 208 | "Chosen available adapters: {}", 209 | adapters 210 | .iter() 211 | .map(|a| a.metadata().name.clone()) 212 | .collect::<Vec<_>>() 213 | .join(",") 214 | ); 215 | Ok(adapters) 216 | } 217 | -------------------------------------------------------------------------------- /src/adapters/custom.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | use super::{AdaptInfo, AdapterMeta, FileAdapter, GetMetadata}; 3 | use crate::adapted_iter::one_file; 4 | 5 | use crate::{ 6 | adapted_iter::AdaptedFilesIterBox, 7 | expand::expand_str_ez, 8 | matching::{FastFileMatcher, FileMatcher}, 9 | }; 10 | use crate::{join_handle_to_stream, to_io_err}; 11 | use anyhow::Result; 12 | use async_stream::stream; 13 | use bytes::Bytes; 14 | use lazy_static::lazy_static; 15 | use log::debug; 16 | use schemars::JsonSchema; 17 | use serde::{Deserialize, Serialize}; 18 | use std::path::Path; 19 | use std::process::Stdio; 20 | use tokio::io::AsyncReadExt; 21 | use tokio::process::Child; 22 | use tokio::process::Command; 23 | 24 | use tokio_util::io::StreamReader; 25 | // mostly the same as AdapterMeta + SpawningFileAdapter 26 | #[derive(Debug, Deserialize, Serialize, JsonSchema, Default, PartialEq, Clone)] 27 | pub struct CustomAdapterConfig { 28 | /// The unique identifier and name of this adapter. 29 | /// 30 | /// Must only include a-z, 0-9, _. 31 | pub name: String, 32 | 33 | /// The description of this adapter shown in help. 34 | pub description: String, 35 | 36 | /// If true, the adapter will be disabled by default. 37 | pub disabled_by_default: Option<bool>, 38 | 39 | /// Version identifier used to key cache entries. 40 | /// 41 | /// Change this if the configuration or program changes. 42 | pub version: i32, 43 | 44 | /// The file extensions this adapter supports, for example `["epub", "mobi"]`. 45 | pub extensions: Vec<String>, 46 | 47 | /// If not null and `--rga-accurate` is enabled, mimetype matching is used instead of file name matching. 48 | pub mimetypes: Option<Vec<String>>, 49 | 50 | /// If `--rga-accurate`, only match by mime types and ignore extensions completely. 51 | pub match_only_by_mime: Option<bool>, 52 | 53 | /// The name or path of the binary to run. 54 | pub binary: String, 55 | 56 | /// The arguments to run the program with. 57 | /// Placeholders: 58 | /// - `$input_file_extension`: the file extension (without dot). e.g. foo.tar.gz -> gz 59 | /// - `$input_file_stem`: the file name without the last extension. e.g. foo.tar.gz -> foo.tar 60 | /// - `$input_virtual_path`: the full input file path. 61 | /// Note that this path may not actually exist on disk because it is the result of another adapter. 62 | /// 63 | /// stdin of the program will be connected to the input file, and stdout is assumed to be the converted file 64 | pub args: Vec<String>, 65 | 66 | /// The output path hint. 67 | /// The placeholders are the same as for `.args` 68 | /// 69 | /// If not set, defaults to `"${input_virtual_path}.txt"`. 70 | /// 71 | /// Setting this is useful if the output format is not plain text (.txt) but instead some other format that should be passed to another adapter 72 | pub output_path_hint: Option<String>, 73 | } 74 | 75 | fn strs(arr: &[&str]) -> Vec<String> { 76 | arr.iter().map(ToString::to_string).collect() 77 | } 78 | 79 | lazy_static! 
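// (Note added for clarity: the two built-in definitions below, pandoc and
// poppler, double as real examples of the `CustomAdapterConfig` format;
// user-defined adapters with the same fields can be supplied through rga's
// configuration.)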
{ 80 | pub static ref BUILTIN_SPAWNING_ADAPTERS: Vec<CustomAdapterConfig> = vec![ 81 | // from https://github.com/jgm/pandoc/blob/master/src/Text/Pandoc/App/FormatHeuristics.hs 82 | // excluding formats that could cause problems (.db ?= sqlite) or that are already text formats (e.g. xml-based) 83 | //"db" -> Just "docbook" 84 | //"adoc" -> Just "asciidoc" 85 | //"asciidoc" -> Just "asciidoc" 86 | //"context" -> Just "context" 87 | //"ctx" -> Just "context" 88 | //"dokuwiki" -> Just "dokuwiki" 89 | //"htm" -> Just "html" 90 | //"html" -> Just "html" 91 | //"json" -> Just "json" 92 | //"latex" -> Just "latex" 93 | //"lhs" -> Just "markdown+lhs" 94 | //"ltx" -> Just "latex" 95 | //"markdown" -> Just "markdown" 96 | //"md" -> Just "markdown" 97 | //"ms" -> Just "ms" 98 | //"muse" -> Just "muse" 99 | //"native" -> Just "native" 100 | //"opml" -> Just "opml" 101 | //"org" -> Just "org" 102 | //"roff" -> Just "ms" 103 | //"rst" -> Just "rst" 104 | //"s5" -> Just "s5" 105 | //"t2t" -> Just "t2t" 106 | //"tei" -> Just "tei" 107 | //"tei.xml" -> Just "tei" 108 | //"tex" -> Just "latex" 109 | //"texi" -> Just "texinfo" 110 | //"texinfo" -> Just "texinfo" 111 | //"textile" -> Just "textile" 112 | //"text" -> Just "markdown" 113 | //"txt" -> Just "markdown" 114 | //"xhtml" -> Just "html" 115 | //"wiki" -> Just "mediawiki" 116 | CustomAdapterConfig { 117 | name: "pandoc".to_string(), 118 | description: "Uses pandoc to convert binary/unreadable text documents to plain markdown-like text".to_string(), 119 | version: 3, 120 | extensions: strs(&["epub", "odt", "docx", "fb2", "ipynb", "html", "htm"]), 121 | binary: "pandoc".to_string(), 122 | mimetypes: None, 123 | // simpler markdown (with more information loss but plainer text) 124 | //.arg("--to=commonmark-header_attributes-link_attributes-fenced_divs-markdown_in_html_blocks-raw_html-native_divs-native_spans-bracketed_spans") 125 | args: strs(&[ 126 | "--from=$input_file_extension", 127 | "--to=plain", 128 | "--wrap=none", 129 | "--markdown-headings=atx" 130 | ]), 131 | disabled_by_default: None, 132 | match_only_by_mime: None, 133 | output_path_hint: None 134 | }, 135 | CustomAdapterConfig { 136 | name: "poppler".to_owned(), 137 | version: 1, 138 | description: "Uses pdftotext (from poppler-utils) to extract plain text from PDF files" 139 | .to_owned(), 140 | 141 | extensions: strs(&["pdf"]), 142 | mimetypes: Some(strs(&["application/pdf"])), 143 | 144 | binary: "pdftotext".to_string(), 145 | args: strs(&["-", "-"]), 146 | disabled_by_default: None, 147 | match_only_by_mime: None, 148 | output_path_hint: Some("${input_virtual_path}.txt.asciipagebreaks".into()) 149 | } 150 | ]; 151 | } 152 | 153 | /// replace a Command.spawn() error "File not found" with a more readable error 154 | /// to indicate some program is not installed 155 | pub fn map_exe_error(err: std::io::Error, exe_name: &str, help: &str) -> anyhow::Error { 156 | use std::io::ErrorKind::*; 157 | match err.kind() { 158 | NotFound => format_err!("Could not find executable \"{}\". {}", exe_name, help), 159 | _ => anyhow::Error::from(err), 160 | } 161 | } 162 | 163 | fn proc_wait(mut child: Child, context: impl FnOnce() -> String) -> impl AsyncRead { 164 | let s = stream! 
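// (Descriptive note added for clarity: this stream carries no real data. On a
// successful exit it yields a single empty `Bytes` chunk, so chaining it after
// the child's stdout in `pipe_output` below appends nothing to the output; on
// a nonzero exit status it yields an `std::io::Error`, so the consumer of the
// combined reader sees the subprocess failure as a read error.)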
{ 165 | let res = child.wait().await?; 166 | if res.success() { 167 | yield std::io::Result::Ok(Bytes::new()); 168 | } else { 169 | Err(format_err!("{:?}", res)).with_context(context).map_err(to_io_err)?; 170 | } 171 | }; 172 | StreamReader::new(s) 173 | } 174 | 175 | pub fn pipe_output( 176 | _line_prefix: &str, 177 | mut cmd: Command, 178 | inp: ReadBox, 179 | exe_name: &str, 180 | help: &str, 181 | ) -> Result<ReadBox> { 182 | let cmd_log = format!("{:?}", cmd); // todo: perf 183 | let mut cmd = cmd 184 | .stdin(Stdio::piped()) 185 | .stdout(Stdio::piped()) 186 | .spawn() 187 | .map_err(|e| map_exe_error(e, exe_name, help))?; 188 | let mut stdi = cmd.stdin.take().expect("is piped"); 189 | let stdo = cmd.stdout.take().expect("is piped"); 190 | 191 | let join = tokio::spawn(async move { 192 | let mut z = inp; 193 | tokio::io::copy(&mut z, &mut stdi).await?; 194 | std::io::Result::Ok(()) 195 | }); 196 | Ok(Box::pin(stdo.chain( 197 | proc_wait(cmd, move || format!("subprocess: {cmd_log}")).chain(join_handle_to_stream(join)), 198 | ))) 199 | } 200 | 201 | pub struct CustomSpawningFileAdapter { 202 | binary: String, 203 | args: Vec<String>, 204 | meta: AdapterMeta, 205 | output_path_hint: Option<String>, 206 | } 207 | impl GetMetadata for CustomSpawningFileAdapter { 208 | fn metadata(&self) -> &AdapterMeta { 209 | &self.meta 210 | } 211 | } 212 | fn arg_replacer(arg: &str, filepath_hint: &Path) -> Result<String> { 213 | expand_str_ez(arg, |s| match s { 214 | "input_virtual_path" => Ok(filepath_hint.to_string_lossy()), 215 | "input_file_stem" => Ok(filepath_hint 216 | .file_stem() 217 | .unwrap_or_default() 218 | .to_string_lossy()), 219 | "input_file_extension" => Ok(filepath_hint 220 | .extension() 221 | .unwrap_or_default() 222 | .to_string_lossy()), 223 | e => Err(anyhow::format_err!("unknown replacer ${{{e}}}")), 224 | }) 225 | } 226 | impl CustomSpawningFileAdapter { 227 | fn command( 228 | &self, 229 | filepath_hint: &std::path::Path, 230 | mut command: tokio::process::Command, 231 | ) -> Result<tokio::process::Command> { 232 | command.args( 233 | self.args 234 | .iter() 235 | .map(|arg| arg_replacer(arg, filepath_hint)) 236 | .collect::<Result<Vec<String>>>()?, 237 | ); 238 | log::debug!("running command {:?}", command); 239 | Ok(command) 240 | } 241 | } 242 | #[async_trait] 243 | impl FileAdapter for CustomSpawningFileAdapter { 244 | async fn adapt( 245 | &self, 246 | ai: AdaptInfo, 247 | _detection_reason: &FileMatcher, 248 | ) -> Result<AdaptedFilesIterBox> { 249 | let AdaptInfo { 250 | filepath_hint, 251 | inp, 252 | line_prefix, 253 | archive_recursion_depth, 254 | postprocess, 255 | config, 256 | .. 
257 | } = ai; 258 | 259 | let cmd = Command::new(&self.binary); 260 | let cmd = self 261 | .command(&filepath_hint, cmd) 262 | .with_context(|| format!("Could not set cmd arguments for {}", self.binary))?; 263 | debug!("executing {:?}", cmd); 264 | let output = pipe_output(&line_prefix, cmd, inp, &self.binary, "")?; 265 | Ok(one_file(AdaptInfo { 266 | filepath_hint: PathBuf::from(arg_replacer( 267 | self.output_path_hint 268 | .as_deref() 269 | .unwrap_or("${input_virtual_path}.txt"), 270 | &filepath_hint, 271 | )?), 272 | inp: output, 273 | line_prefix, 274 | is_real_file: false, 275 | archive_recursion_depth: archive_recursion_depth + 1, 276 | postprocess, 277 | config, 278 | })) 279 | } 280 | } 281 | impl CustomAdapterConfig { 282 | pub fn to_adapter(&self) -> CustomSpawningFileAdapter { 283 | CustomSpawningFileAdapter { 284 | binary: self.binary.clone(), 285 | args: self.args.clone(), 286 | output_path_hint: self.output_path_hint.clone(), 287 | meta: AdapterMeta { 288 | name: self.name.clone(), 289 | version: self.version, 290 | description: format!( 291 | "{}\nRuns: {} {}", 292 | self.description, 293 | self.binary, 294 | self.args.join(" ") 295 | ), 296 | recurses: true, 297 | fast_matchers: self 298 | .extensions 299 | .iter() 300 | .map(|s| FastFileMatcher::FileExtension(s.to_string())) 301 | .collect(), 302 | slow_matchers: self.mimetypes.as_ref().map(|mimetypes| { 303 | mimetypes 304 | .iter() 305 | .map(|s| FileMatcher::MimeType(s.to_string())) 306 | .collect() 307 | }), 308 | keep_fast_matchers_if_accurate: !self.match_only_by_mime.unwrap_or(false), 309 | disabled_by_default: self.disabled_by_default.unwrap_or(false), 310 | }, 311 | } 312 | } 313 | } 314 | 315 | #[cfg(test)] 316 | mod test { 317 | use super::super::FileAdapter; 318 | use super::*; 319 | use crate::preproc::loop_adapt; 320 | use crate::test_utils::*; 321 | use anyhow::Result; 322 | use pretty_assertions::assert_eq; 323 | use tokio::fs::File; 324 | 325 | #[tokio::test] 326 | async fn poppler() -> Result<()> { 327 | let adapter = poppler_adapter(); 328 | 329 | let filepath = test_data_dir().join("short.pdf"); 330 | 331 | let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?)); 332 | // let r = adapter.adapt(a, &d)?; 333 | let r = loop_adapt(&adapter, d, a).await?; 334 | let o = adapted_to_vec(r).await?; 335 | assert_eq!( 336 | String::from_utf8(o)?, 337 | "PREFIX:Page 1: hello world 338 | PREFIX:Page 1: this is just a test. 
339 | PREFIX:Page 1: 340 | PREFIX:Page 1: 1 341 | PREFIX:Page 1: 342 | PREFIX:Page 1: 343 | " 344 | ); 345 | Ok(()) 346 | } 347 | 348 | use crate::{ 349 | adapters::custom::CustomAdapterConfig, 350 | test_utils::{adapted_to_vec, simple_adapt_info}, 351 | }; 352 | use std::io::Cursor; 353 | 354 | #[tokio::test] 355 | async fn streaming() -> anyhow::Result<()> { 356 | // an adapter that converts input line by line (deadlocks if the parent process tries to write everything and only then read it) 357 | let adapter = CustomAdapterConfig { 358 | name: "simple text replacer".to_string(), 359 | description: "oo".to_string(), 360 | disabled_by_default: None, 361 | version: 1, 362 | extensions: vec!["txt".to_string()], 363 | mimetypes: None, 364 | match_only_by_mime: None, 365 | binary: "sed".to_string(), 366 | args: vec!["s/e/u/g".to_string()], 367 | output_path_hint: None, 368 | }; 369 | 370 | let adapter = adapter.to_adapter(); 371 | let input = r#" 372 | This is the story of a 373 | very strange lorry 374 | with a long dead crew 375 | and a witch with the flu 376 | "#; 377 | let input = format!("{input}{input}{input}{input}"); 378 | let input = format!("{input}{input}{input}{input}"); 379 | let input = format!("{input}{input}{input}{input}"); 380 | let input = format!("{input}{input}{input}{input}"); 381 | let input = format!("{input}{input}{input}{input}"); 382 | let input = format!("{input}{input}{input}{input}"); 383 | let (a, d) = simple_adapt_info( 384 | Path::new("foo.txt"), 385 | Box::pin(Cursor::new(Vec::from(input))), 386 | ); 387 | let output = adapter.adapt(a, &d).await.unwrap(); 388 | 389 | let oup = adapted_to_vec(output).await?; 390 | println!("output: {}", String::from_utf8_lossy(&oup)); 391 | Ok(()) 392 | } 393 | } 394 | -------------------------------------------------------------------------------- /src/adapters/decompress.rs: -------------------------------------------------------------------------------- 1 | use crate::adapted_iter::one_file; 2 | 3 | use super::*; 4 | 5 | use anyhow::Result; 6 | use lazy_static::lazy_static; 7 | use tokio::io::BufReader; 8 | 9 | use std::path::{Path, PathBuf}; 10 | 11 | static EXTENSIONS: &[&str] = &["als", "bz2", "gz", "tbz", "tbz2", "tgz", "xz", "zst"]; 12 | static MIME_TYPES: &[&str] = &[ 13 | "application/gzip", 14 | "application/x-bzip", 15 | "application/x-xz", 16 | "application/zstd", 17 | ]; 18 | lazy_static! { 19 | static ref METADATA: AdapterMeta = AdapterMeta { 20 | name: "decompress".to_owned(), 21 | version: 1, 22 | description: 23 | "Reads compressed file as a stream and runs a different extractor on the contents." 
24 | .to_owned(), 25 | recurses: true, 26 | fast_matchers: EXTENSIONS 27 | .iter() 28 | .map(|s| FastFileMatcher::FileExtension(s.to_string())) 29 | .collect(), 30 | slow_matchers: Some( 31 | MIME_TYPES 32 | .iter() 33 | .map(|s| FileMatcher::MimeType(s.to_string())) 34 | .collect() 35 | ), 36 | disabled_by_default: false, 37 | keep_fast_matchers_if_accurate: true 38 | }; 39 | } 40 | #[derive(Default)] 41 | pub struct DecompressAdapter; 42 | 43 | impl DecompressAdapter { 44 | pub fn new() -> Self { 45 | Self 46 | } 47 | } 48 | impl GetMetadata for DecompressAdapter { 49 | fn metadata(&self) -> &AdapterMeta { 50 | &METADATA 51 | } 52 | } 53 | 54 | fn decompress_any(reason: &FileMatcher, inp: ReadBox) -> Result<ReadBox> { 55 | use FastFileMatcher::*; 56 | use FileMatcher::*; 57 | use async_compression::tokio::bufread; 58 | let gz = |inp: ReadBox| Box::pin(bufread::GzipDecoder::new(BufReader::new(inp))); 59 | let bz2 = |inp: ReadBox| Box::pin(bufread::BzDecoder::new(BufReader::new(inp))); 60 | let xz = |inp: ReadBox| Box::pin(bufread::XzDecoder::new(BufReader::new(inp))); 61 | let zst = |inp: ReadBox| Box::pin(bufread::ZstdDecoder::new(BufReader::new(inp))); 62 | 63 | Ok(match reason { 64 | Fast(FileExtension(ext)) => match ext.as_ref() { 65 | "als" | "gz" | "tgz" => gz(inp), 66 | "bz2" | "tbz" | "tbz2" => bz2(inp), 67 | "zst" => zst(inp), 68 | "xz" => xz(inp), 69 | ext => Err(format_err!("don't know how to decompress {}", ext))?, 70 | }, 71 | MimeType(mime) => match mime.as_ref() { 72 | "application/gzip" => gz(inp), 73 | "application/x-bzip" => bz2(inp), 74 | "application/x-xz" => xz(inp), 75 | "application/zstd" => zst(inp), 76 | mime => Err(format_err!("don't know how to decompress mime {}", mime))?, 77 | }, 78 | }) 79 | } 80 | fn get_inner_filename(filename: &Path) -> PathBuf { 81 | let extension = filename 82 | .extension() 83 | .map(|e| e.to_string_lossy()) 84 | .unwrap_or(Cow::Borrowed("")); 85 | let stem = filename 86 | .file_stem() 87 | .expect("no filename given?") 88 | .to_string_lossy(); 89 | let new_extension = match extension.as_ref() { 90 | "tgz" | "tbz" | "tbz2" => ".tar", 91 | _other => "", 92 | }; 93 | filename.with_file_name(format!("{}{}", stem, new_extension)) 94 | } 95 | 96 | #[async_trait] 97 | impl FileAdapter for DecompressAdapter { 98 | async fn adapt( 99 | &self, 100 | ai: AdaptInfo, 101 | detection_reason: &FileMatcher, 102 | ) -> Result<AdaptedFilesIterBox> { 103 | Ok(one_file(AdaptInfo { 104 | filepath_hint: get_inner_filename(&ai.filepath_hint), 105 | is_real_file: false, 106 | archive_recursion_depth: ai.archive_recursion_depth + 1, 107 | inp: decompress_any(detection_reason, ai.inp)?, 108 | line_prefix: ai.line_prefix, 109 | config: ai.config.clone(), 110 | postprocess: ai.postprocess, 111 | })) 112 | } 113 | } 114 | 115 | #[cfg(test)] 116 | mod tests { 117 | use super::*; 118 | use crate::preproc::loop_adapt; 119 | use crate::test_utils::*; 120 | use pretty_assertions::assert_eq; 121 | use tokio::fs::File; 122 | 123 | #[test] 124 | fn test_inner_filename() { 125 | for (a, b) in &[ 126 | ("hi/test.tgz", "hi/test.tar"), 127 | ("hi/hello.gz", "hi/hello"), 128 | ("a/b/initramfs", "a/b/initramfs"), 129 | ("hi/test.tbz2", "hi/test.tar"), 130 | ("hi/test.tbz", "hi/test.tar"), 131 | ("hi/test.hi.bz2", "hi/test.hi"), 132 | ("hello.tar.gz", "hello.tar"), 133 | ] { 134 | assert_eq!(get_inner_filename(&PathBuf::from(a)), PathBuf::from(*b)); 135 | } 136 | } 137 | 138 | #[tokio::test] 139 | async fn gz() -> Result<()> { 140 | let adapter = DecompressAdapter; 141 | 142 | let filepath = 
test_data_dir().join("hello.gz"); 143 | 144 | let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?)); 145 | let r = adapter.adapt(a, &d).await?; 146 | let o = adapted_to_vec(r).await?; 147 | assert_eq!(String::from_utf8(o)?, "hello\n"); 148 | Ok(()) 149 | } 150 | 151 | #[tokio::test] 152 | async fn pdf_gz() -> Result<()> { 153 | let adapter = DecompressAdapter; 154 | 155 | let filepath = test_data_dir().join("short.pdf.gz"); 156 | 157 | let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?)); 158 | let r = loop_adapt(&adapter, d, a).await?; 159 | let o = adapted_to_vec(r).await?; 160 | assert_eq!( 161 | String::from_utf8(o)?, 162 | "PREFIX:Page 1: hello world 163 | PREFIX:Page 1: this is just a test. 164 | PREFIX:Page 1: 165 | PREFIX:Page 1: 1 166 | PREFIX:Page 1: 167 | PREFIX:Page 1: 168 | " 169 | ); 170 | Ok(()) 171 | } 172 | } 173 | -------------------------------------------------------------------------------- /src/adapters/ffmpeg.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | use super::{custom::map_exe_error, writing::async_writeln}; 3 | use anyhow::*; 4 | use async_trait::async_trait; 5 | use lazy_static::lazy_static; 6 | use regex::Regex; 7 | use serde::{Deserialize, Serialize}; 8 | use std::process::Stdio; 9 | use tokio::io::AsyncWrite; 10 | use tokio::io::{AsyncBufReadExt, BufReader}; 11 | use tokio::process::Command; 12 | use writing::WritingFileAdapter; 13 | // todo: 14 | // maybe todo: read list of extensions from 15 | // ffmpeg -demuxers | tail -n+5 | awk '{print $2}' | while read demuxer; do echo MUX=$demuxer; ffmpeg -h demuxer=$demuxer | grep 'Common extensions'; done 2>/dev/null 16 | // but really, the probability of getting useful information from a .flv is low 17 | static EXTENSIONS: &[&str] = &["mkv", "mp4", "avi", "mp3", "ogg", "flac", "webm"]; 18 | 19 | lazy_static! { 20 | static ref METADATA: AdapterMeta = AdapterMeta { 21 | name: "ffmpeg".to_owned(), 22 | version: 1, 23 | description: 24 | "Uses ffmpeg to extract video metadata/chapters, subtitles, lyrics, and other metadata" 25 | .to_owned(), 26 | recurses: false, 27 | fast_matchers: EXTENSIONS 28 | .iter() 29 | .map(|s| FastFileMatcher::FileExtension(s.to_string())) 30 | .collect(), 31 | slow_matchers: None, 32 | disabled_by_default: false, 33 | keep_fast_matchers_if_accurate: true 34 | }; 35 | } 36 | 37 | #[derive(Default, Clone)] 38 | pub struct FFmpegAdapter; 39 | 40 | impl FFmpegAdapter { 41 | pub fn new() -> Self { 42 | Self 43 | } 44 | } 45 | impl GetMetadata for FFmpegAdapter { 46 | fn metadata(&self) -> &AdapterMeta { 47 | &METADATA 48 | } 49 | } 50 | 51 | #[derive(Serialize, Deserialize)] 52 | struct FFprobeOutput { 53 | streams: Vec<FFprobeStream>, 54 | } 55 | #[derive(Serialize, Deserialize)] 56 | struct FFprobeStream { 57 | index: i32, // stream index 58 | } 59 | 60 | #[async_trait] 61 | impl WritingFileAdapter for FFmpegAdapter { 62 | async fn adapt_write( 63 | ai: AdaptInfo, 64 | _detection_reason: &FileMatcher, 65 | mut oup: Pin<Box<dyn AsyncWrite + Send>>, 66 | ) -> Result<()> { 67 | let AdaptInfo { 68 | is_real_file, 69 | filepath_hint, 70 | line_prefix, 71 | .. 72 | } = ai; 73 | if !is_real_file { 74 | // we *could* probably adapt this to also work based on streams, 75 | // it would require using a BufReader to read at least part of the file to memory 76 | // but really when would you want to search for videos within archives? 
77 | // So instead, we only run this adapter if the file is an actual file on disk for now 78 | async_writeln!(oup, "{line_prefix}[rga: skipping video in archive]\n")?; 79 | return Ok(()); 80 | } 81 | let inp_fname = filepath_hint; 82 | let spawn_fail = |e| map_exe_error(e, "ffprobe", "Make sure you have ffmpeg installed."); 83 | let subtitle_streams = { 84 | let probe = Command::new("ffprobe") 85 | .args(vec![ 86 | "-v", 87 | "error", // show all errors 88 | "-select_streams", 89 | "s", // show only subtitle streams 90 | "-of", 91 | "json", // use json as output format 92 | "-show_entries", 93 | "stream=index", // show index of subtitle streams 94 | ]) 95 | .arg("-i") 96 | .arg(&inp_fname) 97 | .output() 98 | .await 99 | .map_err(spawn_fail)?; 100 | if !probe.status.success() { 101 | return Err(format_err!( 102 | "ffprobe failed: {:?}\n{}", 103 | probe.status, 104 | String::from_utf8_lossy(&probe.stderr) 105 | )); 106 | } 107 | let p: FFprobeOutput = serde_json::from_slice(&probe.stdout)?; 108 | p.streams 109 | }; 110 | { 111 | // extract file metadata (especially chapter names in a greppable format) 112 | let mut probe = Command::new("ffprobe") 113 | .args(vec![ 114 | "-v", 115 | "error", 116 | "-show_format", 117 | "-show_streams", 118 | "-of", 119 | "flat", 120 | // "-show_data", 121 | "-show_error", 122 | "-show_programs", 123 | "-show_chapters", 124 | // "-count_frames", 125 | //"-count_packets", 126 | ]) 127 | .arg("-i") 128 | .arg(&inp_fname) 129 | .stdout(Stdio::piped()) 130 | .spawn()?; 131 | let mut lines = BufReader::new(probe.stdout.as_mut().unwrap()).lines(); 132 | while let Some(line) = lines.next_line().await? { 133 | let line = line.replace("\\r\\n", "\n").replace("\\n", "\n"); // just unescape newlines 134 | async_writeln!(oup, "metadata: {line}")?; 135 | } 136 | let exit = probe.wait().await?; 137 | if !exit.success() { 138 | return Err(format_err!("ffprobe failed: {:?}", exit)); 139 | } 140 | } 141 | if !subtitle_streams.is_empty() { 142 | let time_re = Regex::new(r".*\d.*-->.*\d.*").unwrap(); 143 | for probe_stream in subtitle_streams.iter() { 144 | // extract subtitles 145 | let mut cmd = Command::new("ffmpeg"); 146 | cmd.arg("-hide_banner") 147 | .arg("-loglevel") 148 | .arg("panic") 149 | .arg("-i") 150 | .arg(&inp_fname) 151 | .arg("-map") 152 | .arg(format!("0:{}", probe_stream.index)) // 0 for first input 153 | .arg("-f") 154 | .arg("webvtt") 155 | .arg("-"); 156 | let mut cmd = cmd.stdout(Stdio::piped()).spawn().map_err(spawn_fail)?; 157 | let stdo = cmd.stdout.as_mut().expect("is piped"); 158 | let mut time: String = "".to_owned(); 159 | // rewrite subtitle times so they are shown as a prefix in every line 160 | let mut lines = BufReader::new(stdo).lines(); 161 | while let Some(line) = lines.next_line().await? 
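// (Worked example added for clarity: given the WebVTT cue timing line
// "09:55.195 --> 09:56.730" followed by the subtitle text "Hello", the loop
// below remembers the timing line and emits "09:55.195 --> 09:56.730: Hello",
// so every subtitle line is prefixed with the time at which it is shown.)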
{ 162 | // 09:55.195 --> 09:56.730 163 | if time_re.is_match(&line) { 164 | time = line.to_owned(); 165 | } else if line.is_empty() { 166 | async_writeln!(oup)?; 167 | } else { 168 | async_writeln!(oup, "{time}: {line}")?; 169 | } 170 | } 171 | } 172 | } 173 | Ok(()) 174 | } 175 | } 176 | -------------------------------------------------------------------------------- /src/adapters/mbox.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | use anyhow::Result; 4 | use async_stream::stream; 5 | use lazy_static::lazy_static; 6 | use mime2ext::mime2ext; 7 | use regex::bytes::Regex; 8 | use tokio::io::AsyncReadExt; 9 | 10 | use std::{collections::VecDeque, io::Cursor}; 11 | 12 | static EXTENSIONS: &[&str] = &["mbox", "mbx", "eml"]; 13 | static MIME_TYPES: &[&str] = &["application/mbox", "message/rfc822"]; 14 | lazy_static! { 15 | static ref METADATA: AdapterMeta = AdapterMeta { 16 | name: "mail".to_owned(), 17 | version: 1, 18 | description: 19 | "Reads mailbox/mail files and runs extractors on the contents and attachments." 20 | .to_owned(), 21 | recurses: true, 22 | fast_matchers: EXTENSIONS 23 | .iter() 24 | .map(|s| FastFileMatcher::FileExtension(s.to_string())) 25 | .collect(), 26 | slow_matchers: Some( 27 | MIME_TYPES 28 | .iter() 29 | .map(|s| FileMatcher::MimeType(s.to_string())) 30 | .collect() 31 | ), 32 | disabled_by_default: true, 33 | keep_fast_matchers_if_accurate: true 34 | }; 35 | static ref FROM_REGEX: Regex = Regex::new("\r?\nFrom [^\n]+\n").unwrap(); 36 | } 37 | #[derive(Default)] 38 | pub struct MboxAdapter; 39 | 40 | impl MboxAdapter { 41 | pub fn new() -> Self { 42 | Self 43 | } 44 | } 45 | impl GetMetadata for MboxAdapter { 46 | fn metadata(&self) -> &AdapterMeta { 47 | &METADATA 48 | } 49 | } 50 | 51 | #[async_trait] 52 | impl FileAdapter for MboxAdapter { 53 | async fn adapt( 54 | &self, 55 | ai: AdaptInfo, 56 | _detection_reason: &FileMatcher, 57 | ) -> Result<AdaptedFilesIterBox> { 58 | let AdaptInfo { 59 | filepath_hint, 60 | mut inp, 61 | line_prefix, 62 | archive_recursion_depth, 63 | config, 64 | postprocess, 65 | .. 66 | } = ai; 67 | 68 | let mut content = Vec::new(); 69 | let s = stream! 
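// (Descriptive note added for clarity: an mbox file is a concatenation of
// messages, each introduced by a separator line beginning with "From ";
// FROM_REGEX above splits the buffered input on those separators and each
// chunk is then parsed with `mailparse`. Mail clients conventionally quote
// body lines that start with "From " as ">From" (see
// exampledir/test/test.mbx), so such lines do not match the separator.)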
{ 70 | inp.read_to_end(&mut content).await?; 71 | 72 | let mut ais = vec![]; 73 | for mail_bytes in FROM_REGEX.splitn(&content, usize::MAX) { 74 | let mail_content = mail_bytes.splitn(2, |x| *x == b'\n').nth(1).unwrap(); 75 | let mail = mailparse::parse_mail(mail_content); 76 | if mail.is_err() { 77 | continue; 78 | } 79 | let mail = mail.unwrap(); 80 | 81 | let mut todos = VecDeque::new(); 82 | todos.push_back(mail); 83 | 84 | while let Some(mail) = todos.pop_front() { 85 | let mut path = filepath_hint.clone(); 86 | let filename = mail.get_content_disposition().params.get("filename").cloned(); 87 | match &*mail.ctype.mimetype { 88 | x if x.starts_with("multipart/") => { 89 | todos.extend(mail.subparts); 90 | continue; 91 | } 92 | mime => { 93 | if let Some(name) = filename { 94 | path.push(name); 95 | } else if let Some(extension) = mime2ext(mime) { 96 | path.push(format!("data.{extension}")); 97 | } else { 98 | path.push("data"); 99 | } 100 | } 101 | } 102 | 103 | let mut config = config.clone(); 104 | config.accurate = true; 105 | 106 | let raw_body = mail.get_body_raw(); 107 | if raw_body.is_err() { 108 | continue; 109 | } 110 | let ai2: AdaptInfo = AdaptInfo { 111 | filepath_hint: path, 112 | is_real_file: false, 113 | archive_recursion_depth: archive_recursion_depth + 1, 114 | inp: Box::pin(Cursor::new(raw_body.unwrap())), 115 | line_prefix: line_prefix.to_string(), 116 | config, 117 | postprocess, 118 | }; 119 | ais.push(ai2); 120 | } 121 | } 122 | for a in ais { 123 | yield(Ok(a)); 124 | } 125 | }; 126 | Ok(Box::pin(s)) 127 | } 128 | } 129 | 130 | #[cfg(test)] 131 | mod tests { 132 | use super::*; 133 | use crate::preproc::loop_adapt; 134 | use crate::test_utils::*; 135 | use pretty_assertions::assert_eq; 136 | use tokio::fs::File; 137 | use tokio_stream::StreamExt; 138 | 139 | #[tokio::test] 140 | async fn mail_simple() -> Result<()> { 141 | let adapter = MboxAdapter; 142 | 143 | let filepath = test_data_dir().join("github_email.eml"); 144 | 145 | let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?)); 146 | let mut r = adapter.adapt(a, &d).await?; 147 | let mut count = 0; 148 | while let Some(file) = r.next().await { 149 | let mut file = file?; 150 | let mut buf = Vec::new(); 151 | file.inp.read_to_end(&mut buf).await?; 152 | match file 153 | .filepath_hint 154 | .components() 155 | .last() 156 | .unwrap() 157 | .as_os_str() 158 | .to_str() 159 | .unwrap() 160 | { 161 | "data.txt" | "data.html" => { 162 | assert!(String::from_utf8(buf)?.contains("Thank you for your contribution")); 163 | } 164 | x => panic!("unexpected filename {x:?}"), 165 | } 166 | count += 1; 167 | } 168 | assert_eq!(2, count); 169 | Ok(()) 170 | } 171 | 172 | #[tokio::test] 173 | async fn mbox_simple() -> Result<()> { 174 | let adapter = MboxAdapter; 175 | 176 | let filepath = test_data_dir().join("test.mbx"); 177 | 178 | let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?)); 179 | let mut r = adapter.adapt(a, &d).await?; 180 | let mut count = 0; 181 | while let Some(file) = r.next().await { 182 | let mut file = file?; 183 | assert_eq!( 184 | "data.html", 185 | file.filepath_hint.components().last().unwrap().as_os_str() 186 | ); 187 | let mut buf = Vec::new(); 188 | file.inp.read_to_end(&mut buf).await?; 189 | assert_eq!( 190 | "\r\n \r\n \r\n \r\n \r\n

>From

\r\n

Another word >From
\r\n

\r\n \r\n", 191 | String::from_utf8(buf)?.trim() 192 | ); 193 | count += 1; 194 | } 195 | assert_eq!(3, count); 196 | Ok(()) 197 | } 198 | 199 | #[tokio::test] 200 | async fn mbox_attachment() -> Result<()> { 201 | init_logging(); 202 | 203 | let adapter = MboxAdapter; 204 | 205 | let filepath = test_data_dir().join("mail_with_attachment.mbox"); 206 | 207 | let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?)); 208 | let mut r = loop_adapt(&adapter, d, a).await?; 209 | let mut count = 0; 210 | while let Some(file) = r.next().await { 211 | let mut file = file?; 212 | let path = file 213 | .filepath_hint 214 | .components() 215 | .last() 216 | .unwrap() 217 | .as_os_str() 218 | .to_str() 219 | .unwrap(); 220 | let mut buf = Vec::new(); 221 | file.inp.read_to_end(&mut buf).await?; 222 | match path { 223 | "data.html.txt" => { 224 | assert_eq!( 225 | "PREFIX:regular text\nPREFIX:\n", 226 | String::from_utf8(buf).unwrap_or("err".to_owned()) 227 | ); 228 | } 229 | "short.pdf.txt" => { 230 | assert_eq!( 231 | "PREFIX:Page 1: hello world\nPREFIX:Page 1: this is just a test.\nPREFIX:Page 1: \nPREFIX:Page 1: 1\nPREFIX:Page 1: \nPREFIX:Page 1: \n", 232 | String::from_utf8(buf).unwrap_or("err".to_owned()) 233 | ); 234 | } 235 | _ => { 236 | panic!("unrelated {path:?}"); 237 | } 238 | } 239 | count += 1; 240 | } 241 | assert_eq!(2, count); // one message + one attachment 242 | Ok(()) 243 | } 244 | } 245 | -------------------------------------------------------------------------------- /src/adapters/postproc.rs: -------------------------------------------------------------------------------- 1 | //trait RunFnAdapter: GetMetadata {} 2 | 3 | //impl FileAdapter for T where T: RunFnAdapter {} 4 | 5 | use anyhow::Result; 6 | use async_stream::stream; 7 | use async_trait::async_trait; 8 | use bytes::Bytes; 9 | use encoding_rs::Encoding; 10 | use encoding_rs_io::DecodeReaderBytesBuilder; 11 | use tokio_util::io::SyncIoBridge; 12 | 13 | use std::io::Cursor; 14 | use std::path::PathBuf; 15 | use std::pin::Pin; 16 | use tokio::io::{AsyncRead, AsyncReadExt}; 17 | use tokio_util::io::ReaderStream; 18 | use tokio_util::io::StreamReader; 19 | 20 | use crate::adapted_iter::AdaptedFilesIterBox; 21 | use crate::adapted_iter::one_file; 22 | use crate::matching::FastFileMatcher; 23 | 24 | use super::{AdaptInfo, AdapterMeta, FileAdapter, GetMetadata}; 25 | 26 | fn add_newline(ar: impl AsyncRead + Send) -> impl AsyncRead + Send { 27 | ar.chain(Cursor::new(b"\n")) 28 | } 29 | 30 | pub struct PostprocPrefix {} 31 | impl GetMetadata for PostprocPrefix { 32 | fn metadata(&self) -> &super::AdapterMeta { 33 | lazy_static::lazy_static! { 34 | static ref METADATA: AdapterMeta = AdapterMeta { 35 | name: "postprocprefix".to_owned(), 36 | version: 1, 37 | description: "Adds the line prefix to each line (e.g. 
the filename within a zip)".to_owned(), 38 | recurses: false, 39 | fast_matchers: vec![], 40 | slow_matchers: None, 41 | keep_fast_matchers_if_accurate: false, 42 | disabled_by_default: false 43 | }; 44 | } 45 | &METADATA 46 | } 47 | } 48 | #[async_trait] 49 | impl FileAdapter for PostprocPrefix { 50 | async fn adapt( 51 | &self, 52 | a: super::AdaptInfo, 53 | _detection_reason: &crate::matching::FileMatcher, 54 | ) -> Result<AdaptedFilesIterBox> { 55 | let read = add_newline(postproc_prefix( 56 | &a.line_prefix, 57 | postproc_encoding(&a.line_prefix, a.inp).await?, 58 | )); 59 | // keep adapt info (filename etc) except replace inp 60 | let ai = AdaptInfo { 61 | inp: Box::pin(read), 62 | postprocess: false, 63 | ..a 64 | }; 65 | Ok(one_file(ai)) 66 | } 67 | } 68 | 69 | /*struct ReadErr { 70 | err: Fn() -> std::io::Error, 71 | } 72 | impl Read for ReadErr { 73 | fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> { 74 | Err(self.err()) 75 | } 76 | }*/ 77 | 78 | /** 79 | * Detects and converts encodings other than utf-8 to utf-8. 80 | * If the input stream does not contain valid text, returns the string `[rga: binary data]` instead 81 | */ 82 | async fn postproc_encoding( 83 | _line_prefix: &str, 84 | inp: Pin<Box<dyn AsyncRead + Send>>, 85 | ) -> Result<Pin<Box<dyn AsyncRead + Send>>> { 86 | // check for binary content in first 8kB 87 | // read the first 8kB into a buffer, check for null bytes, then return the buffer concatenated with the rest of the file 88 | let mut fourk = Vec::with_capacity(1 << 13); 89 | let mut beginning = inp.take(1 << 13); 90 | 91 | beginning.read_to_end(&mut fourk).await?; 92 | let has_binary = fourk.contains(&0u8); 93 | 94 | let enc = Encoding::for_bom(&fourk); 95 | let inp = Cursor::new(fourk).chain(beginning.into_inner()); 96 | match enc { 97 | Some((enc, _)) if enc != encoding_rs::UTF_8 => { 98 | // detected UTF16LE or UTF16BE, convert to UTF8 in separate thread 99 | // TODO: parse these options from ripgrep's configuration 100 | let encoding = None; // detect bom but usually assume utf8 101 | let bom_sniffing = true; 102 | let mut decode_builder = DecodeReaderBytesBuilder::new(); 103 | // https://github.com/BurntSushi/ripgrep/blob/a7d26c8f144a4957b75f71087a66692d0b25759a/grep-searcher/src/searcher/mod.rs#L706 104 | // this detects utf-16 BOMs and transcodes to utf-8 if they are present 105 | // it does not detect any other char encodings. that would require https://github.com/hsivonen/chardetng or similar but then binary detection is hard (?) 106 | let mut inp = decode_builder 107 | .encoding(encoding) 108 | .utf8_passthru(true) 109 | .strip_bom(bom_sniffing) 110 | .bom_override(true) 111 | .bom_sniffing(bom_sniffing) 112 | .build(SyncIoBridge::new(inp)); 113 | let oup = tokio::task::spawn_blocking(move || -> Result<Vec<u8>> { 114 | let mut oup = Vec::new(); 115 | std::io::Read::read_to_end(&mut inp, &mut oup)?; 116 | Ok(oup) 117 | }) 118 | .await??; 119 | Ok(Box::pin(Cursor::new(oup))) 120 | } 121 | _ => { 122 | if has_binary { 123 | log::debug!("detected binary"); 124 | return Ok(Box::pin(Cursor::new("[rga: binary data]"))); 125 | } 126 | Ok(Box::pin(inp)) 127 | } 128 | } 129 | } 130 | 131 | /// Adds the given prefix to each line in an `AsyncRead`. 
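/// A minimal usage sketch (mirroring the `test_postproc_prefix` unit test
/// below; shown as `ignore` since it needs an async context to run):
///
/// ```ignore
/// use tokio::io::AsyncReadExt;
/// let reader = postproc_prefix("prefix: ", std::io::Cursor::new(b"Hello\nWorld" as &[u8]));
/// tokio::pin!(reader);
/// let mut out = Vec::new();
/// reader.read_to_end(&mut out).await.unwrap();
/// assert_eq!(out, b"prefix: Hello\nprefix: World");
/// ```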
132 | pub fn postproc_prefix<T: AsyncRead + Send>( 133 | line_prefix: &str, 134 | inp: T, 135 | ) -> impl AsyncRead + Send + use<T> { 136 | let line_prefix_n = format!("\n{line_prefix}"); // clone since we need it later 137 | let line_prefix_o = Bytes::copy_from_slice(line_prefix.as_bytes()); 138 | let regex = regex::bytes::Regex::new("\n").unwrap(); 139 | let inp_stream = ReaderStream::new(inp); 140 | let oup_stream = stream! { 141 | yield Ok(line_prefix_o); 142 | for await chunk in inp_stream { 143 | match chunk { 144 | Err(e) => yield Err(e), 145 | Ok(chunk) => { 146 | if chunk.contains(&b'\n') { 147 | yield Ok(Bytes::copy_from_slice(&regex.replace_all(&chunk, line_prefix_n.as_bytes()))); 148 | } else { 149 | yield Ok(chunk); 150 | } 151 | } 152 | } 153 | } 154 | }; 155 | Box::pin(StreamReader::new(oup_stream)) 156 | } 157 | 158 | #[derive(Default)] 159 | pub struct PostprocPageBreaks {} 160 | 161 | impl GetMetadata for PostprocPageBreaks { 162 | fn metadata(&self) -> &super::AdapterMeta { 163 | lazy_static::lazy_static! { 164 | static ref METADATA: AdapterMeta = AdapterMeta { 165 | name: "postprocpagebreaks".to_owned(), 166 | version: 1, 167 | description: "Adds the page number to each line for an input file that specifies page breaks as ascii page break character.\nMainly to be used internally by the poppler adapter.".to_owned(), 168 | recurses: false, 169 | fast_matchers: vec![FastFileMatcher::FileExtension("asciipagebreaks".to_string())], 170 | slow_matchers: None, 171 | keep_fast_matchers_if_accurate: false, 172 | disabled_by_default: false 173 | }; 174 | } 175 | &METADATA 176 | } 177 | } 178 | #[async_trait] 179 | impl FileAdapter for PostprocPageBreaks { 180 | async fn adapt( 181 | &self, 182 | a: super::AdaptInfo, 183 | _detection_reason: &crate::matching::FileMatcher, 184 | ) -> Result<AdaptedFilesIterBox> { 185 | let read = postproc_pagebreaks(postproc_encoding(&a.line_prefix, a.inp).await?); 186 | // keep adapt info (filename etc) except replace inp 187 | let ai = AdaptInfo { 188 | inp: Box::pin(read), 189 | archive_recursion_depth: a.archive_recursion_depth + 1, 190 | filepath_hint: a 191 | .filepath_hint 192 | .parent() 193 | .map(PathBuf::from) 194 | .unwrap_or_default() 195 | .join(a.filepath_hint.file_stem().unwrap_or_default()), 196 | ..a 197 | }; 198 | Ok(one_file(ai)) 199 | } 200 | } 201 | /// Adds the prefix "Page N: " to each line, 202 | /// where N starts at one and is incremented for each ASCII Form Feed character in the input stream. 203 | /// ASCII form feeds are the page delimiters output by `pdftotext`. 204 | pub fn postproc_pagebreaks(input: impl AsyncRead + Send) -> impl AsyncRead + Send { 205 | let regex_linefeed = regex::bytes::Regex::new(r"\x0c").unwrap(); 206 | let regex_newline = regex::bytes::Regex::new("\n").unwrap(); 207 | let mut page_count: i32 = 1; 208 | let mut page_prefix: String = format!("\nPage {page_count}: "); 209 | 210 | let input_stream = ReaderStream::new(input); 211 | let output_stream = stream! 
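// (Worked example added for clarity, taken from the unit tests below: the
// input "Hello\nWorld\x0cFoo Bar\n\x0cTest\x0c" becomes
// "Page 1: Hello\nPage 1: World\nPage 2: Foo Bar\nPage 2: \nPage 3: Test".
// Each \x0c form feed bumps the page counter, and the trailing form feed
// produces no "Page 4: " prefix because prefixes are held in `pending` until
// more text actually follows.)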
{ 212 | yield std::io::Result::Ok(Bytes::copy_from_slice(format!("Page {page_count}: ").as_bytes())); 213 | // store Page X: line prefixes in pending and only write it to the output when there is more text to be written 214 | // this is needed since pdftotext outputs a \x0c at the end of the last page 215 | let mut pending: Option<Bytes> = None; 216 | 217 | for await read_chunk in input_stream { 218 | let read_chunk = read_chunk?; 219 | let page_chunks = regex_linefeed.split(&read_chunk); 220 | for (chunk_idx, page_chunk) in page_chunks.enumerate() { 221 | if chunk_idx != 0 { 222 | page_count += 1; 223 | page_prefix = format!("\nPage {page_count}: "); 224 | if let Some(p) = pending.take() { 225 | yield Ok(p); 226 | } 227 | pending = Some(Bytes::copy_from_slice(page_prefix.as_bytes())); 228 | } 229 | if !page_chunk.is_empty() { 230 | if let Some(p) = pending.take() { 231 | yield Ok(p); 232 | } 233 | yield Ok(Bytes::copy_from_slice(&regex_newline.replace_all(page_chunk, page_prefix.as_bytes()))); 234 | } 235 | 236 | } 237 | } 238 | 239 | 240 | }; 241 | Box::pin(StreamReader::new(output_stream)) 242 | } 243 | 244 | #[cfg(test)] 245 | mod tests { 246 | use crate::preproc::loop_adapt; 247 | use crate::test_utils::*; 248 | 249 | use super::*; 250 | use anyhow::Result; 251 | use pretty_assertions::assert_eq; 252 | use tokio::fs::File; 253 | use tokio::pin; 254 | use tokio_test::io::Builder; 255 | use tokio_test::io::Mock; 256 | 257 | #[tokio::test] 258 | async fn test_with_pagebreaks() { 259 | let mut output: Vec<u8> = Vec::new(); 260 | let mock: Mock = Builder::new() 261 | .read(b"Hello\nWorld\x0cFoo Bar\n\x0cTest\x0c") 262 | .build(); 263 | let res = postproc_pagebreaks(mock).read_to_end(&mut output).await; 264 | println!("{}", String::from_utf8_lossy(&output)); 265 | assert!(res.is_ok()); 266 | assert_eq!( 267 | String::from_utf8_lossy(&output), 268 | "Page 1: Hello\nPage 1: World\nPage 2: Foo Bar\nPage 2: \nPage 3: Test" 269 | ); 270 | } 271 | 272 | #[tokio::test] 273 | async fn test_with_pagebreaks_chunks() { 274 | let mut output: Vec<u8> = Vec::new(); 275 | let mock: Mock = Builder::new() 276 | .read(b"Hello\nWo") 277 | .read(b"rld\x0c") 278 | .read(b"Foo Bar\n") 279 | .read(b"\x0cTest\x0c") 280 | .build(); 281 | let res = postproc_pagebreaks(mock).read_to_end(&mut output).await; 282 | println!("{}", String::from_utf8_lossy(&output)); 283 | assert!(res.is_ok()); 284 | assert_eq!( 285 | String::from_utf8_lossy(&output), 286 | "Page 1: Hello\nPage 1: World\nPage 2: Foo Bar\nPage 2: \nPage 3: Test" 287 | ); 288 | } 289 | 290 | #[tokio::test] 291 | async fn test_pdf_twoblank() -> Result<()> { 292 | let adapter = poppler_adapter(); 293 | let fname = test_data_dir().join("twoblankpages.pdf"); 294 | let rd = File::open(&fname).await?; 295 | let (a, d) = simple_adapt_info(&fname, Box::pin(rd)); 296 | let res = loop_adapt(&adapter, d, a).await?; 297 | 298 | let buf = adapted_to_vec(res).await?; 299 | 300 | assert_eq!( 301 | String::from_utf8(buf)?, 302 | "PREFIX:Page 1: 303 | PREFIX:Page 2: 304 | PREFIX:Page 3: HelloWorld 305 | PREFIX:Page 3: 306 | PREFIX:Page 3: 307 | ", 308 | ); 309 | 310 | Ok(()) 311 | } 312 | 313 | #[tokio::test] 314 | async fn test_postproc_prefix() { 315 | let mut output: Vec<u8> = Vec::new(); 316 | let mock: Mock = Builder::new().read(b"Hello\nWorld").build(); 317 | let res = postproc_prefix("prefix: ", mock) 318 | .read_to_end(&mut output) 319 | .await; 320 | println!("{}", String::from_utf8_lossy(&output)); 321 | assert!(res.is_ok()); 322 | assert_eq!(output, b"prefix: Hello\nprefix: World");
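        // Editor's sketch (a hypothetical extra assertion, not part of the original
        // test suite): because postproc_prefix yields the prefix once before reading
        // any input, an empty reader still produces the bare prefix.
        //
        //   let empty: Mock = Builder::new().build();
        //   let mut out: Vec<u8> = Vec::new();
        //   postproc_prefix("prefix: ", empty).read_to_end(&mut out).await.unwrap();
        //   assert_eq!(out, b"prefix: ");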
323 | } 324 | 325 | async fn test_from_strs( 326 | pagebreaks: bool, 327 | line_prefix: &str, 328 | a: &'static str, 329 | b: &str, 330 | ) -> Result<()> { 331 | test_from_bytes(pagebreaks, line_prefix, a.as_bytes(), b).await 332 | } 333 | 334 | async fn test_from_bytes( 335 | pagebreaks: bool, 336 | line_prefix: &str, 337 | a: &'static [u8], 338 | b: &str, 339 | ) -> Result<()> { 340 | let mut oup = Vec::new(); 341 | let inp = Box::pin(Cursor::new(a)); 342 | let inp = postproc_encoding("", inp).await?; 343 | if pagebreaks { 344 | postproc_pagebreaks(inp).read_to_end(&mut oup).await?; 345 | } else { 346 | let x = postproc_prefix(line_prefix, inp); 347 | pin!(x); 348 | x.read_to_end(&mut oup).await?; 349 | } 350 | let c = String::from_utf8_lossy(&oup); 351 | assert_eq!(c, b, "source: {}", String::from_utf8_lossy(a)); 352 | 353 | Ok(()) 354 | } 355 | 356 | #[tokio::test] 357 | async fn test_utf16() -> Result<()> { 358 | let utf16lebom: &[u8] = &[ 359 | 0xff, 0xfe, 0x68, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, 0x00, 0x6f, 0x00, 0x20, 0x00, 360 | 0x77, 0x00, 0x6f, 0x00, 0x72, 0x00, 0x6c, 0x00, 0x64, 0x00, 0x20, 0x00, 0x3d, 0xd8, 361 | 0xa9, 0xdc, 0x0a, 0x00, 362 | ]; 363 | let utf16bebom: &[u8] = &[ 364 | 0xfe, 0xff, 0x00, 0x68, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, 0x00, 0x6f, 0x00, 0x20, 365 | 0x00, 0x77, 0x00, 0x6f, 0x00, 0x72, 0x00, 0x6c, 0x00, 0x64, 0x00, 0x20, 0xd8, 0x3d, 366 | 0xdc, 0xa9, 0x00, 0x0a, 367 | ]; 368 | test_from_bytes(false, "", utf16lebom, "hello world 💩\n").await?; 369 | test_from_bytes(false, "", utf16bebom, "hello world 💩\n").await?; 370 | Ok(()) 371 | } 372 | 373 | #[tokio::test] 374 | async fn post1() -> Result<()> { 375 | let inp = "What is this\nThis is a test\nFoo"; 376 | let oup = "Page 1: What is this\nPage 1: This is a test\nPage 1: Foo"; 377 | 378 | test_from_strs(true, "", inp, oup).await?; 379 | 380 | println!("\n\n\n\n"); 381 | 382 | let inp = "What is this\nThis is a test\nFoo\x0c\nHelloooo\nHow are you?\x0c\nGreat!"; 383 | let oup = "Page 1: What is this\nPage 1: This is a test\nPage 1: Foo\nPage 2: \nPage 2: Helloooo\nPage 2: How are you?\nPage 3: \nPage 3: Great!"; 384 | 385 | test_from_strs(true, "", inp, oup).await?; 386 | 387 | let inp = "What is this\nThis is a test\nFoo\x0c\nHelloooo\nHow are you?\x0c\nGreat!"; 388 | let oup = "foo.pdf:What is this\nfoo.pdf:This is a test\nfoo.pdf:Foo\x0c\nfoo.pdf:Helloooo\nfoo.pdf:How are you?\x0c\nfoo.pdf:Great!"; 389 | 390 | test_from_strs(false, "foo.pdf:", inp, oup).await?; 391 | 392 | Ok(()) 393 | } 394 | 395 | #[tokio::test] 396 | async fn test_binary_content() -> Result<()> { 397 | test_from_strs( 398 | false, 399 | "foo:", 400 | "this is a test \n\n \0 foo", 401 | "foo:[rga: binary data]", 402 | ) 403 | .await?; 404 | test_from_strs(false, "foo:", "\0", "foo:[rga: binary data]").await?; 405 | Ok(()) 406 | } 407 | 408 | /*#[test] 409 | fn chardet() -> Result<()> { 410 | let mut d = chardetng::EncodingDetector::new(); 411 | let mut v = Vec::new(); 412 | std::fs::File::open("/home/phire/passwords-2018.kdbx.old").unwrap().read_to_end(&mut v).unwrap(); 413 | d.feed(&v, false); 414 | println!("foo {:?}", d.guess(None, true)); 415 | Ok(()) 416 | }*/ 417 | } 418 | -------------------------------------------------------------------------------- /src/adapters/sqlite.rs: -------------------------------------------------------------------------------- 1 | use super::{writing::WritingFileAdapter, *}; 2 | use anyhow::Result; 3 | use async_trait::async_trait; 4 | use lazy_static::lazy_static; 5 | use log::*; 6 | use 
rusqlite::types::ValueRef; 7 | use rusqlite::*; 8 | use std::{convert::TryInto, io::Write}; 9 | use tokio::io::AsyncWrite; 10 | 11 | use tokio_util::io::SyncIoBridge; 12 | 13 | static EXTENSIONS: &[&str] = &["db", "db3", "sqlite", "sqlite3"]; 14 | 15 | lazy_static! { 16 | static ref METADATA: AdapterMeta = AdapterMeta { 17 | name: "sqlite".to_owned(), 18 | version: 1, 19 | description: 20 | "Uses sqlite bindings to convert sqlite databases into a simple plain text format" 21 | .to_owned(), 22 | recurses: false, // set to true if we decide to make sqlite blobs searchable (gz blob in db is kinda common I think) 23 | fast_matchers: EXTENSIONS 24 | .iter() 25 | .map(|s| FastFileMatcher::FileExtension(s.to_string())) 26 | .collect(), 27 | slow_matchers: Some(vec![FileMatcher::MimeType( 28 | "application/x-sqlite3".to_owned() 29 | )]), 30 | keep_fast_matchers_if_accurate: false, 31 | disabled_by_default: false 32 | }; 33 | } 34 | 35 | #[derive(Default, Clone)] 36 | pub struct SqliteAdapter; 37 | 38 | impl SqliteAdapter { 39 | pub fn new() -> Self { 40 | Self 41 | } 42 | } 43 | impl GetMetadata for SqliteAdapter { 44 | fn metadata(&self) -> &AdapterMeta { 45 | &METADATA 46 | } 47 | } 48 | 49 | fn format_blob(b: ValueRef) -> String { 50 | use ValueRef::*; 51 | match b { 52 | Null => "NULL".to_owned(), 53 | Integer(i) => format!("{}", i), 54 | Real(i) => format!("{}", i), 55 | Text(i) => format!("'{}'", String::from_utf8_lossy(i).replace('\'', "''")), 56 | Blob(b) => format!( 57 | "[blob {}B]", 58 | size_format::SizeFormatterSI::new( 59 | // can't be larger than 2GB anyways 60 | b.len().try_into().unwrap() 61 | ) 62 | ), 63 | } 64 | } 65 | 66 | fn synchronous_dump_sqlite(ai: AdaptInfo, mut s: impl Write) -> Result<()> { 67 | let AdaptInfo { 68 | is_real_file, 69 | filepath_hint, 70 | line_prefix, 71 | .. 72 | } = ai; 73 | if !is_real_file { 74 | // db is in an archive 75 | // todo: read to memory and then use that blob if size < max 76 | writeln!(s, "{line_prefix}[rga: skipping sqlite in archive]",)?; 77 | return Ok(()); 78 | } 79 | let inp_fname = filepath_hint; 80 | let conn = Connection::open_with_flags(&inp_fname, OpenFlags::SQLITE_OPEN_READ_ONLY) 81 | .with_context(|| format!("opening sqlite connection to {}", inp_fname.display()))?; 82 | let tables: Vec = conn 83 | .prepare("select name from sqlite_master where type='table'") 84 | .context("while preparing query")? 85 | .query_map([], |r| r.get::<_, String>(0)) 86 | .context("while executing query")? 87 | .filter_map(|e| e.ok()) 88 | .collect(); 89 | debug!("db has {} tables", tables.len()); 90 | for table in tables { 91 | // can't use query param at that position 92 | let mut sel = conn.prepare(&format!( 93 | "select * from {}", 94 | rusqlite::vtab::escape_double_quote(&table) 95 | ))?; 96 | let col_names: Vec = sel 97 | .column_names() 98 | .into_iter() 99 | .map(|e| e.to_owned()) 100 | .collect(); 101 | let mut z = sel.query([])?; 102 | // writeln!(oup, "{}: {}", table, cols.join(", "))?; 103 | 104 | // kind of shitty (lossy) output. maybe output real csv or something? 105 | while let Some(row) = z.next()? { 106 | let row_str = col_names 107 | .iter() 108 | .enumerate() 109 | .map(|(i, e)| Ok(format!("{}={}", e, format_blob(row.get_ref(i)?)))) 110 | .collect::>>()? 
111 | .join(", "); 112 | writeln!(s, "{line_prefix}{table}: {row_str}",)?; 113 | } 114 | } 115 | Ok(()) 116 | } 117 | 118 | #[async_trait] 119 | impl WritingFileAdapter for SqliteAdapter { 120 | async fn adapt_write( 121 | ai: AdaptInfo, 122 | _detection_reason: &FileMatcher, 123 | oup: Pin>, 124 | ) -> Result<()> { 125 | if ai.filepath_hint.file_name().and_then(|e| e.to_str()) == Some("Thumbs.db") { 126 | // skip windows thumbnail cache 127 | return Ok(()); 128 | } 129 | let oup_sync = SyncIoBridge::new(oup); 130 | tokio::task::spawn_blocking(|| synchronous_dump_sqlite(ai, oup_sync)) 131 | .await? 132 | .context("in synchronous sqlite task")?; 133 | Ok(()) 134 | } 135 | } 136 | 137 | #[cfg(test)] 138 | mod test { 139 | use super::*; 140 | use crate::test_utils::*; 141 | use pretty_assertions::assert_eq; 142 | 143 | #[tokio::test] 144 | async fn simple() -> Result<()> { 145 | let adapter: Box = Box::::default(); 146 | let fname = test_data_dir().join("hello.sqlite3"); 147 | let (a, d) = simple_fs_adapt_info(&fname).await?; 148 | let res = adapter.adapt(a, &d).await?; 149 | 150 | let buf = adapted_to_vec(res).await?; 151 | 152 | assert_eq!( 153 | String::from_utf8(buf)?, 154 | "PREFIX:tbl: greeting='hello', from='sqlite database!'\nPREFIX:tbl2: x=123, y=456.789\n", 155 | ); 156 | 157 | Ok(()) 158 | } 159 | } 160 | -------------------------------------------------------------------------------- /src/adapters/tar.rs: -------------------------------------------------------------------------------- 1 | use crate::{ 2 | adapted_iter::AdaptedFilesIterBox, 3 | adapters::AdapterMeta, 4 | matching::{FastFileMatcher, FileMatcher}, 5 | print_bytes, 6 | }; 7 | use anyhow::*; 8 | use async_stream::stream; 9 | use async_trait::async_trait; 10 | use lazy_static::lazy_static; 11 | use log::*; 12 | use std::path::PathBuf; 13 | 14 | use tokio_stream::StreamExt; 15 | 16 | use super::{AdaptInfo, FileAdapter, GetMetadata}; 17 | 18 | static EXTENSIONS: &[&str] = &["tar"]; 19 | 20 | lazy_static! { 21 | static ref METADATA: AdapterMeta = AdapterMeta { 22 | name: "tar".to_owned(), 23 | version: 1, 24 | description: "Reads a tar file as a stream and recurses down into its contents".to_owned(), 25 | recurses: true, 26 | fast_matchers: EXTENSIONS 27 | .iter() 28 | .map(|s| FastFileMatcher::FileExtension(s.to_string())) 29 | .collect(), 30 | slow_matchers: None, 31 | keep_fast_matchers_if_accurate: true, 32 | disabled_by_default: false 33 | }; 34 | } 35 | #[derive(Default, Clone)] 36 | pub struct TarAdapter; 37 | 38 | impl TarAdapter { 39 | pub fn new() -> Self { 40 | Self 41 | } 42 | } 43 | impl GetMetadata for TarAdapter { 44 | fn metadata(&self) -> &AdapterMeta { 45 | &METADATA 46 | } 47 | } 48 | 49 | #[async_trait] 50 | impl FileAdapter for TarAdapter { 51 | async fn adapt( 52 | &self, 53 | ai: AdaptInfo, 54 | _detection_reason: &FileMatcher, 55 | ) -> Result { 56 | let AdaptInfo { 57 | filepath_hint, 58 | inp, 59 | line_prefix, 60 | archive_recursion_depth, 61 | config, 62 | postprocess, 63 | .. 64 | } = ai; 65 | let mut archive = ::tokio_tar::Archive::new(inp); 66 | 67 | let mut entries = archive.entries()?; 68 | let s = stream! 
{ 69 | while let Some(entry) = entries.next().await { 70 | let file = entry?; 71 | if tokio_tar::EntryType::Regular == file.header().entry_type() { 72 | let path = PathBuf::from(file.path()?.to_owned()); 73 | debug!( 74 | "{}|{}: {}", 75 | filepath_hint.display(), 76 | path.display(), 77 | print_bytes(file.header().size().unwrap_or(0) as f64), 78 | ); 79 | let line_prefix = &format!("{}{}: ", line_prefix, path.display()); 80 | let ai2: AdaptInfo = AdaptInfo { 81 | filepath_hint: path, 82 | is_real_file: false, 83 | archive_recursion_depth: archive_recursion_depth + 1, 84 | inp: Box::pin(file), 85 | line_prefix: line_prefix.to_string(), 86 | config: config.clone(), 87 | postprocess, 88 | }; 89 | yield Ok(ai2); 90 | } 91 | } 92 | }; 93 | 94 | Ok(Box::pin(s)) 95 | } 96 | } 97 | 98 | #[cfg(test)] 99 | mod tests { 100 | use super::*; 101 | use crate::{preproc::loop_adapt, test_utils::*}; 102 | use pretty_assertions::assert_eq; 103 | use tokio::fs::File; 104 | 105 | #[tokio::test] 106 | async fn test_simple_tar() -> Result<()> { 107 | let filepath = test_data_dir().join("hello.tar"); 108 | 109 | let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?)); 110 | 111 | let adapter = TarAdapter::new(); 112 | let r = loop_adapt(&adapter, d, a).await.context("adapt")?; 113 | let o = adapted_to_vec(r).await.context("adapted_to_vec")?; 114 | assert_eq!( 115 | String::from_utf8(o).context("parsing utf8")?, 116 | "PREFIX:dir/file-b.pdf: Page 1: hello world 117 | PREFIX:dir/file-b.pdf: Page 1: this is just a test. 118 | PREFIX:dir/file-b.pdf: Page 1: 119 | PREFIX:dir/file-b.pdf: Page 1: 1 120 | PREFIX:dir/file-b.pdf: Page 1: 121 | PREFIX:dir/file-b.pdf: Page 1: 122 | PREFIX:dir/file-a.pdf: Page 1: hello world 123 | PREFIX:dir/file-a.pdf: Page 1: this is just a test. 124 | PREFIX:dir/file-a.pdf: Page 1: 125 | PREFIX:dir/file-a.pdf: Page 1: 1 126 | PREFIX:dir/file-a.pdf: Page 1: 127 | PREFIX:dir/file-a.pdf: Page 1: 128 | " 129 | ); 130 | Ok(()) 131 | } 132 | } 133 | -------------------------------------------------------------------------------- /src/adapters/writing.rs: -------------------------------------------------------------------------------- 1 | use std::pin::Pin; 2 | 3 | use crate::{adapted_iter::one_file, join_handle_to_stream, to_io_err}; 4 | 5 | use super::{AdaptInfo, FileAdapter, GetMetadata}; 6 | use anyhow::{Context, Result}; 7 | use async_trait::async_trait; 8 | use tokio::io::{AsyncReadExt, AsyncWrite}; 9 | 10 | #[async_trait] 11 | pub trait WritingFileAdapter: GetMetadata + Send + Sync + Clone { 12 | async fn adapt_write( 13 | a: super::AdaptInfo, 14 | detection_reason: &crate::matching::FileMatcher, 15 | oup: Pin>, 16 | ) -> Result<()>; 17 | } 18 | 19 | macro_rules! 
async_writeln { 20 | ($dst: expr_2021) => { 21 | { 22 | tokio::io::AsyncWriteExt::write_all(&mut $dst, b"\n").await 23 | } 24 | }; 25 | ($dst: expr_2021, $fmt: expr_2021) => { 26 | { 27 | use std::io::Write; 28 | let mut buf = Vec::::new(); 29 | writeln!(buf, $fmt)?; 30 | tokio::io::AsyncWriteExt::write_all(&mut $dst, &buf).await 31 | } 32 | }; 33 | ($dst: expr_2021, $fmt: expr_2021, $($arg: tt)*) => { 34 | { 35 | use std::io::Write; 36 | let mut buf = Vec::::new(); 37 | writeln!(buf, $fmt, $( $arg )*)?; 38 | tokio::io::AsyncWriteExt::write_all(&mut $dst, &buf).await 39 | } 40 | }; 41 | } 42 | pub(crate) use async_writeln; 43 | 44 | #[async_trait] 45 | impl FileAdapter for T 46 | where 47 | T: WritingFileAdapter, 48 | { 49 | async fn adapt( 50 | &self, 51 | a: super::AdaptInfo, 52 | detection_reason: &crate::matching::FileMatcher, 53 | ) -> Result { 54 | let name = self.metadata().name.clone(); 55 | let (w, r) = tokio::io::duplex(128 * 1024); 56 | let d2 = detection_reason.clone(); 57 | let archive_recursion_depth = a.archive_recursion_depth + 1; 58 | let filepath_hint = format!("{}.txt", a.filepath_hint.to_string_lossy()); 59 | let postprocess = a.postprocess; 60 | let line_prefix = a.line_prefix.clone(); 61 | let config = a.config.clone(); 62 | let joiner = tokio::spawn(async move { 63 | let x = d2; 64 | T::adapt_write(a, &x, Box::pin(w)) 65 | .await 66 | .with_context(|| format!("in {}.adapt_write", name)) 67 | .map_err(to_io_err) 68 | }); 69 | 70 | Ok(one_file(AdaptInfo { 71 | is_real_file: false, 72 | filepath_hint: filepath_hint.into(), 73 | archive_recursion_depth, 74 | config, 75 | inp: Box::pin(r.chain(join_handle_to_stream(joiner))), 76 | line_prefix, 77 | postprocess, 78 | })) 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/adapters/zip.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | use crate::print_bytes; 3 | use anyhow::*; 4 | use async_stream::stream; 5 | use lazy_static::lazy_static; 6 | use log::*; 7 | 8 | // TODO: allow users to configure file extensions instead of hard coding the list 9 | // https://github.com/phiresky/ripgrep-all/pull/208#issuecomment-2173241243 10 | static EXTENSIONS: &[&str] = &["zip", "jar", "xpi", "kra", "snagx"]; 11 | 12 | lazy_static! { 13 | static ref METADATA: AdapterMeta = AdapterMeta { 14 | name: "zip".to_owned(), 15 | version: 1, 16 | description: "Reads a zip file as a stream and recurses down into its contents".to_owned(), 17 | recurses: true, 18 | fast_matchers: EXTENSIONS 19 | .iter() 20 | .map(|s| FastFileMatcher::FileExtension(s.to_string())) 21 | .collect(), 22 | slow_matchers: Some(vec![FileMatcher::MimeType("application/zip".to_owned())]), 23 | keep_fast_matchers_if_accurate: false, 24 | disabled_by_default: false 25 | }; 26 | } 27 | #[derive(Default, Clone)] 28 | pub struct ZipAdapter; 29 | 30 | impl ZipAdapter { 31 | pub fn new() -> Self { 32 | Self 33 | } 34 | } 35 | impl GetMetadata for ZipAdapter { 36 | fn metadata(&self) -> &AdapterMeta { 37 | &METADATA 38 | } 39 | } 40 | 41 | #[async_trait] 42 | impl FileAdapter for ZipAdapter { 43 | async fn adapt( 44 | &self, 45 | ai: AdaptInfo, 46 | _detection_reason: &FileMatcher, 47 | ) -> Result { 48 | // let (s, r) = mpsc::channel(1); 49 | let AdaptInfo { 50 | inp, 51 | filepath_hint, 52 | archive_recursion_depth, 53 | postprocess, 54 | line_prefix, 55 | config, 56 | is_real_file, 57 | .. 
58 | } = ai; 59 | if is_real_file { 60 | use async_zip::read::fs::ZipFileReader; 61 | 62 | let zip = ZipFileReader::new(&filepath_hint).await?; 63 | let s = stream! { 64 | for i in 0..zip.file().entries().len() { 65 | let file = zip.get_entry(i)?; 66 | let reader = zip.entry(i).await?; 67 | if file.filename().ends_with('/') { 68 | continue; 69 | } 70 | debug!( 71 | "{}{}|{}: {} ({} packed)", 72 | line_prefix, 73 | filepath_hint.display(), 74 | file.filename(), 75 | print_bytes(file.uncompressed_size() as f64), 76 | print_bytes(file.compressed_size() as f64) 77 | ); 78 | let new_line_prefix = format!("{}{}: ", line_prefix, file.filename()); 79 | let fname = PathBuf::from(file.filename()); 80 | tokio::pin!(reader); 81 | // SAFETY: this should be solvable without unsafe but idk how :( 82 | // the issue is that ZipEntryReader borrows from ZipFileReader, but we need to yield it here into the stream 83 | // but then it can't borrow from the ZipFile 84 | let reader2 = unsafe { 85 | std::intrinsics::transmute::< 86 | Pin<&mut (dyn AsyncRead + Send)>, 87 | Pin<&'static mut (dyn AsyncRead + Send)>, 88 | >(reader) 89 | }; 90 | yield Ok(AdaptInfo { 91 | filepath_hint: fname, 92 | is_real_file: false, 93 | inp: Box::pin(reader2), 94 | line_prefix: new_line_prefix, 95 | archive_recursion_depth: archive_recursion_depth + 1, 96 | postprocess, 97 | config: config.clone(), 98 | }); 99 | } 100 | }; 101 | 102 | Ok(Box::pin(s)) 103 | } else { 104 | use async_zip::read::stream::ZipFileReader; 105 | let mut zip = ZipFileReader::new(inp); 106 | 107 | let s = stream! { 108 | trace!("begin zip"); 109 | while let Some(mut entry) = zip.next_entry().await? { 110 | trace!("zip next entry"); 111 | let file = entry.entry(); 112 | if file.filename().ends_with('/') { 113 | zip = entry.skip().await?; 114 | 115 | continue; 116 | } 117 | debug!( 118 | "{}{}|{}: {} ({} packed)", 119 | line_prefix, 120 | filepath_hint.display(), 121 | file.filename(), 122 | print_bytes(file.uncompressed_size() as f64), 123 | print_bytes(file.compressed_size() as f64) 124 | ); 125 | let new_line_prefix = format!("{}{}: ", line_prefix, file.filename()); 126 | let fname = PathBuf::from(file.filename()); 127 | let reader = entry.reader(); 128 | tokio::pin!(reader); 129 | // SAFETY: this should be solvable without unsafe but idk how :( 130 | // the issue is that ZipEntryReader borrows from ZipFileReader, but we need to yield it here into the stream 131 | // but then it can't borrow from the ZipFile 132 | let reader2 = unsafe { 133 | std::intrinsics::transmute::< 134 | Pin<&mut (dyn AsyncRead + Send)>, 135 | Pin<&'static mut (dyn AsyncRead + Send)>, 136 | >(reader) 137 | }; 138 | yield Ok(AdaptInfo { 139 | filepath_hint: fname, 140 | is_real_file: false, 141 | inp: Box::pin(reader2), 142 | line_prefix: new_line_prefix, 143 | archive_recursion_depth: archive_recursion_depth + 1, 144 | postprocess, 145 | config: config.clone(), 146 | }); 147 | zip = entry.done().await.context("going to next file in zip but entry was not read fully")?; 148 | 149 | } 150 | trace!("zip over"); 151 | }; 152 | 153 | Ok(Box::pin(s)) 154 | } 155 | } 156 | } 157 | 158 | /*struct ZipAdaptIter { 159 | inp: AdaptInfo, 160 | } 161 | impl<'a> AdaptedFilesIter for ZipAdaptIter<'a> { 162 | fn next<'b>(&'b mut self) -> Option> { 163 | let line_prefix = &self.inp.line_prefix; 164 | let filepath_hint = &self.inp.filepath_hint; 165 | let archive_recursion_depth = &self.inp.archive_recursion_depth; 166 | let postprocess = self.inp.postprocess; 167 | 
::zip::read::read_zipfile_from_stream(&mut self.inp.inp) 168 | .unwrap() 169 | .and_then(|file| { 170 | if file.is_dir() { 171 | return None; 172 | } 173 | debug!( 174 | "{}{}|{}: {} ({} packed)", 175 | line_prefix, 176 | filepath_hint.to_string_lossy(), 177 | file.name(), 178 | print_bytes(file.size() as f64), 179 | print_bytes(file.compressed_size() as f64) 180 | ); 181 | let line_prefix = format!("{}{}: ", line_prefix, file.name()); 182 | Some(AdaptInfo { 183 | filepath_hint: PathBuf::from(file.name()), 184 | is_real_file: false, 185 | inp: Box::new(file), 186 | line_prefix, 187 | archive_recursion_depth: archive_recursion_depth + 1, 188 | postprocess, 189 | config: RgaConfig::default(), //config.clone(), 190 | }) 191 | }) 192 | } 193 | }*/ 194 | 195 | #[cfg(test)] 196 | mod test { 197 | use async_zip::{Compression, ZipEntryBuilder, write::ZipFileWriter}; 198 | 199 | use super::*; 200 | use crate::{preproc::loop_adapt, test_utils::*}; 201 | use pretty_assertions::assert_eq; 202 | 203 | #[async_recursion::async_recursion] 204 | async fn create_zip(fname: &str, content: &str, add_inner: bool) -> Result> { 205 | let v = Vec::new(); 206 | let mut cursor = std::io::Cursor::new(v); 207 | let mut zip = ZipFileWriter::new(&mut cursor); 208 | 209 | let options = ZipEntryBuilder::new(fname.to_string(), Compression::Stored); 210 | zip.write_entry_whole(options, content.as_bytes()).await?; 211 | 212 | if add_inner { 213 | let opts = ZipEntryBuilder::new("inner.zip".to_string(), Compression::Stored); 214 | zip.write_entry_whole( 215 | opts, 216 | &create_zip("inner.txt", "inner text file", false).await?, 217 | ) 218 | .await?; 219 | } 220 | zip.close().await?; 221 | Ok(cursor.into_inner()) 222 | } 223 | 224 | #[tokio::test] 225 | async fn only_seek_zip_fs() -> Result<()> { 226 | let zip = test_data_dir().join("only-seek-zip.zip"); 227 | let (a, d) = simple_fs_adapt_info(&zip).await?; 228 | let _v = adapted_to_vec(loop_adapt(&ZipAdapter::new(), d, a).await?).await?; 229 | // assert_eq!(String::from_utf8(v)?, ""); 230 | 231 | Ok(()) 232 | } 233 | /*#[tokio::test] 234 | async fn only_seek_zip_mem() -> Result<()> { 235 | let zip = test_data_dir().join("only-seek-zip.zip"); 236 | let (a, d) = simple_adapt_info(&zip, Box::pin(File::open(&zip).await?)); 237 | let v = adapted_to_vec(loop_adapt(&ZipAdapter::new(), d, a)?).await?; 238 | // assert_eq!(String::from_utf8(v)?, ""); 239 | 240 | Ok(()) 241 | }*/ 242 | #[tokio::test] 243 | async fn recurse() -> Result<()> { 244 | let zipfile = create_zip("outer.txt", "outer text file", true).await?; 245 | let adapter = ZipAdapter::new(); 246 | 247 | let (a, d) = simple_adapt_info( 248 | &PathBuf::from("outer.zip"), 249 | Box::pin(std::io::Cursor::new(zipfile)), 250 | ); 251 | let buf = adapted_to_vec(loop_adapt(&adapter, d, a).await?).await?; 252 | 253 | assert_eq!( 254 | String::from_utf8(buf)?, 255 | "PREFIX:outer.txt: outer text file\nPREFIX:inner.zip: inner.txt: inner text file\n", 256 | ); 257 | 258 | Ok(()) 259 | } 260 | } 261 | -------------------------------------------------------------------------------- /src/bin/rga-fzf-open.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Context; 2 | 3 | use std::process::Command; 4 | 5 | // TODO: add --rg-params=..., --rg-preview-params=... and --fzf-params=... 
params 6 | // TODO: remove passthrough_args 7 | fn main() -> anyhow::Result<()> { 8 | env_logger::init(); 9 | let mut args = std::env::args().skip(1); 10 | let query = args.next().context("no query")?; 11 | let fname = args.next().context("no filename")?; 12 | // let instance_id = std::env::var("RGA_FZF_INSTANCE").unwrap_or("unk".to_string()); 13 | 14 | if fname.ends_with(".pdf") { 15 | use std::io::ErrorKind::*; 16 | 17 | let worked = Command::new("evince") 18 | .arg("--find") 19 | .arg(&query) 20 | .arg(&fname) 21 | .spawn() 22 | .map_or_else( 23 | |err| match err.kind() { 24 | NotFound => Ok(false), 25 | _ => Err(err), 26 | }, 27 | |_| Ok(true), 28 | )?; 29 | if worked { 30 | return Ok(()); 31 | } 32 | } 33 | Ok(open::that_detached(&fname)?) 34 | } 35 | -------------------------------------------------------------------------------- /src/bin/rga-fzf.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Context; 2 | use rga::adapters::custom::map_exe_error; 3 | use ripgrep_all as rga; 4 | 5 | use std::process::{Command, Stdio}; 6 | 7 | // TODO: add --rg-params=..., --rg-preview-params=... and --fzf-params=... params 8 | // TODO: remove passthrough_args 9 | fn main() -> anyhow::Result<()> { 10 | env_logger::init(); 11 | let mut passthrough_args: Vec = std::env::args().skip(1).collect(); 12 | let inx = passthrough_args.iter().position(|e| !e.starts_with('-')); 13 | let initial_query = if let Some(inx) = inx { 14 | passthrough_args.remove(inx) 15 | } else { 16 | "".to_string() 17 | }; 18 | 19 | let exe = std::env::current_exe().context("Could not get executable location")?; 20 | let preproc_exe = exe.with_file_name("rga"); 21 | let preproc_exe = preproc_exe 22 | .to_str() 23 | .context("rga executable is in non-unicode path")?; 24 | let open_exe = exe.with_file_name("rga-fzf-open"); 25 | let open_exe = open_exe 26 | .to_str() 27 | .context("rga-fzf-open executable is in non-unicode path")?; 28 | 29 | let rg_prefix = format!("{preproc_exe} --files-with-matches --rga-cache-max-blob-len=10M"); 30 | 31 | let child = Command::new("fzf") 32 | .arg(format!( 33 | "--preview={preproc_exe} --pretty --context 5 {{q}} --rga-fzf-path=_{{}}" 34 | )) 35 | .arg("--preview-window=70%:wrap") 36 | .arg("--phony") 37 | .arg("--query") 38 | .arg(&initial_query) 39 | .arg("--print-query") 40 | .arg(format!("--bind=change:reload: {rg_prefix} {{q}}")) 41 | .arg(format!("--bind=ctrl-m:execute:{open_exe} {{q}} {{}}")) 42 | .env( 43 | "FZF_DEFAULT_COMMAND", 44 | format!("{} '{}'", rg_prefix, &initial_query), 45 | ) 46 | .env("RGA_FZF_INSTANCE", format!("{}", std::process::id())) // may be useful to open stuff in the same tab 47 | .stdout(Stdio::piped()) 48 | .spawn() 49 | .map_err(|e| map_exe_error(e, "fzf", "Please make sure you have fzf installed."))?; 50 | 51 | let output = child.wait_with_output()?; 52 | let mut x = output.stdout.split(|e| e == &b'\n'); 53 | let final_query = 54 | std::str::from_utf8(x.next().context("fzf output empty")?).context("fzf query not utf8")?; 55 | let selected_file = std::str::from_utf8(x.next().context("fzf output not two line")?) 
56 | .context("fzf ofilename not utf8")?; 57 | println!("query='{final_query}', file='{selected_file}'"); 58 | 59 | Ok(()) 60 | } 61 | -------------------------------------------------------------------------------- /src/bin/rga-preproc.rs: -------------------------------------------------------------------------------- 1 | use rga::adapters::*; 2 | use rga::preproc::*; 3 | use rga::print_dur; 4 | use ripgrep_all as rga; 5 | 6 | use anyhow::Context; 7 | use log::debug; 8 | use std::time::Instant; 9 | use tokio::fs::File; 10 | 11 | #[tokio::main] 12 | async fn main() -> anyhow::Result<()> { 13 | env_logger::init(); 14 | let mut arg_arr: Vec = std::env::args_os().collect(); 15 | let last = arg_arr.pop().expect("No filename specified"); 16 | let config = rga::config::parse_args(arg_arr, true)?; 17 | //clap::App::new("rga-preproc").arg(Arg::from_usage()) 18 | let path = { 19 | let filepath = last; 20 | std::env::current_dir()?.join(filepath) 21 | }; 22 | 23 | let i = File::open(&path) 24 | .await 25 | .context("Specified input file not found")?; 26 | let mut o = tokio::io::stdout(); 27 | let ai = AdaptInfo { 28 | inp: Box::pin(i), 29 | filepath_hint: path, 30 | is_real_file: true, 31 | line_prefix: "".to_string(), 32 | archive_recursion_depth: 0, 33 | postprocess: !config.no_prefix_filenames, 34 | config, 35 | }; 36 | 37 | let start = Instant::now(); 38 | let mut oup = rga_preproc(ai).await.context("during preprocessing")?; 39 | debug!("finding and starting adapter took {}", print_dur(start)); 40 | let res = tokio::io::copy(&mut oup, &mut o).await; 41 | if let Err(e) = res { 42 | if e.kind() == std::io::ErrorKind::BrokenPipe { 43 | // happens if e.g. ripgrep detects binary data in the pipe so it cancels reading 44 | debug!("output cancelled (broken pipe)"); 45 | } else { 46 | Err(e).context("copying adapter output to stdout")?; 47 | } 48 | } 49 | debug!("running adapter took {} total", print_dur(start)); 50 | Ok(()) 51 | } 52 | -------------------------------------------------------------------------------- /src/bin/rga.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use rga::adapters::custom::map_exe_error; 3 | use rga::adapters::*; 4 | use rga::config::{RgaConfig, split_args}; 5 | use rga::matching::*; 6 | use rga::print_dur; 7 | use ripgrep_all as rga; 8 | use structopt::StructOpt; 9 | 10 | use schemars::schema_for; 11 | use std::process::Command; 12 | use std::time::Instant; 13 | 14 | fn list_adapters(args: RgaConfig) -> Result<()> { 15 | let (enabled_adapters, disabled_adapters) = get_all_adapters(args.custom_adapters); 16 | 17 | println!("Adapters:\n"); 18 | let print = |adapter: std::sync::Arc| { 19 | let meta = adapter.metadata(); 20 | let matchers = meta 21 | .fast_matchers 22 | .iter() 23 | .map(|m| match m { 24 | FastFileMatcher::FileExtension(ext) => format!(".{ext}"), 25 | }) 26 | .collect::>() 27 | .join(", "); 28 | let slow_matchers = meta 29 | .slow_matchers 30 | .as_ref() 31 | .unwrap_or(&vec![]) 32 | .iter() 33 | .filter_map(|m| match m { 34 | FileMatcher::MimeType(x) => Some(x.to_string()), 35 | FileMatcher::Fast(_) => None, 36 | }) 37 | .collect::>() 38 | .join(", "); 39 | print!( 40 | " - **{name}**\n {desc} \n Extensions: {matchers} \n Mime Types: {mime} \n", 41 | name = meta.name, 42 | desc = meta.description.replace('\n', "\n "), 43 | matchers = matchers, 44 | mime = slow_matchers, 45 | ); 46 | println!(); 47 | }; 48 | for adapter in enabled_adapters { 49 | print(adapter) 50 | } 51 | println!( 52 | "The 
following adapters are disabled by default, and can be enabled using '--rga-adapters=+foo,bar':\n" 53 | ); 54 | for adapter in disabled_adapters { 55 | print(adapter) 56 | } 57 | Ok(()) 58 | } 59 | fn main() -> anyhow::Result<()> { 60 | // set debugging as early as possible 61 | if std::env::args().any(|e| e == "--debug") { 62 | // TODO: Audit that the environment access only happens in single-threaded code. 63 | unsafe { std::env::set_var("RUST_LOG", "debug") }; 64 | } 65 | 66 | env_logger::init(); 67 | 68 | let (config, mut passthrough_args) = split_args(false)?; 69 | 70 | if config.print_config_schema { 71 | println!("{}", serde_json::to_string_pretty(&schema_for!(RgaConfig))?); 72 | return Ok(()); 73 | } 74 | if config.list_adapters { 75 | return list_adapters(config); 76 | } 77 | if let Some(path) = config.fzf_path { 78 | if path == "_" { 79 | // fzf found no result, ignore everything and return 80 | println!("[no file found]"); 81 | return Ok(()); 82 | } 83 | passthrough_args.push(std::ffi::OsString::from(&path[1..])); 84 | } 85 | 86 | if passthrough_args.is_empty() { 87 | // rg would show help. Show own help instead. 88 | RgaConfig::clap().print_help()?; 89 | println!(); 90 | return Ok(()); 91 | } 92 | 93 | let adapters = get_adapters_filtered(config.custom_adapters.clone(), &config.adapters)?; 94 | 95 | let pre_glob = if !config.accurate { 96 | let extensions = adapters 97 | .iter() 98 | .flat_map(|a| &a.metadata().fast_matchers) 99 | .flat_map(|m| match m { 100 | FastFileMatcher::FileExtension(ext) => vec![ext.clone(), ext.to_ascii_uppercase()], 101 | }) 102 | .collect::>() 103 | .join(","); 104 | format!("*.{{{extensions}}}") 105 | } else { 106 | "*".to_owned() 107 | }; 108 | 109 | add_exe_to_path()?; 110 | 111 | let rg_args = vec![ 112 | "--no-line-number", 113 | // smart case by default because within weird files 114 | // we probably can't really trust casing anyways 115 | "--smart-case", 116 | ]; 117 | 118 | let exe = std::env::current_exe().expect("Could not get executable location"); 119 | let preproc_exe = exe.with_file_name("rga-preproc"); 120 | 121 | let before = Instant::now(); 122 | let mut cmd = Command::new("rg"); 123 | cmd.args(rg_args) 124 | .arg("--pre") 125 | .arg(preproc_exe) 126 | .arg("--pre-glob") 127 | .arg(pre_glob) 128 | .args(passthrough_args); 129 | log::debug!("rg command to run: {:?}", cmd); 130 | let mut child = cmd 131 | .spawn() 132 | .map_err(|e| map_exe_error(e, "rg", "Please make sure you have ripgrep installed."))?; 133 | 134 | let result = child.wait()?; 135 | 136 | log::debug!("running rg took {}", print_dur(before)); 137 | if !result.success() { 138 | std::process::exit(result.code().unwrap_or(1)); 139 | } 140 | Ok(()) 141 | } 142 | 143 | /// add the directory that contains `rga` to PATH, so rga-preproc can find pandoc etc (if we are on Windows where we include dependent binaries) 144 | fn add_exe_to_path() -> Result<()> { 145 | use std::env; 146 | let mut exe = env::current_exe().expect("Could not get executable location"); 147 | // let preproc_exe = exe.with_file_name("rga-preproc"); 148 | exe.pop(); // dirname 149 | 150 | let path = env::var_os("PATH").unwrap_or_default(); 151 | let paths = env::split_paths(&path).collect::>(); 152 | // prepend: prefer bundled versions to system-installed versions of binaries 153 | // solves https://github.com/phiresky/ripgrep-all/issues/32 154 | // may be somewhat of a security issue if rga binary is in installed in unprivileged locations 155 | let paths = [&[exe.to_owned(), exe.join("lib")], 
&paths[..]].concat(); 156 | let new_path = env::join_paths(paths)?; 157 | // TODO: Audit that the environment access only happens in single-threaded code. 158 | unsafe { env::set_var("PATH", new_path) }; 159 | Ok(()) 160 | } 161 | -------------------------------------------------------------------------------- /src/caching_writer.rs: -------------------------------------------------------------------------------- 1 | use std::{future::Future, pin::Pin}; 2 | 3 | use anyhow::{Context, Result}; 4 | use async_compression::tokio::write::ZstdEncoder; 5 | use async_stream::stream; 6 | 7 | use crate::to_io_err; 8 | use log::*; 9 | use tokio::io::{AsyncRead, AsyncWriteExt}; 10 | use tokio_stream::StreamExt; 11 | use tokio_util::io::{ReaderStream, StreamReader}; 12 | 13 | type FinishHandler = 14 | dyn FnOnce((u64, Option<Vec<u8>>)) -> Pin<Box<dyn Future<Output = Result<()>> + Send>> + Send; 15 | /** 16 | * wrap an AsyncRead so that it is passthrough, 17 | * but also the written data is compressed and written into a buffer, 18 | * unless more than max_cache_size bytes is written, then the cache is dropped and it is pure passthrough. 19 | */ 20 | pub fn async_read_and_write_to_cache<'a>( 21 | inp: impl AsyncRead + Send + 'a, 22 | max_cache_size: usize, 23 | compression_level: i32, 24 | on_finish: Box<FinishHandler>, 25 | ) -> Result<Pin<Box<dyn AsyncRead + Send + 'a>>> { 26 | let inp = Box::pin(inp); 27 | let mut zstd_writer = Some(ZstdEncoder::with_quality( 28 | Vec::new(), 29 | async_compression::Level::Precise(compression_level), 30 | )); 31 | let mut bytes_written = 0; 32 | 33 | let s = stream! { 34 | let mut stream = ReaderStream::new(inp); 35 | while let Some(bytes) = stream.next().await { 36 | trace!("read bytes: {:?}", bytes); 37 | if let Ok(bytes) = &bytes { 38 | if let Some(writer) = zstd_writer.as_mut() { 39 | writer.write_all(bytes).await?; 40 | bytes_written += bytes.len() as u64; 41 | let compressed_len = writer.get_ref().len(); 42 | trace!("wrote {} to zstd, len now {}", bytes.len(), compressed_len); 43 | if compressed_len > max_cache_size { 44 | debug!("cache longer than max, dropping"); 45 | //writer.finish(); 46 | zstd_writer.take(); 47 | } 48 | } 49 | } 50 | yield bytes; 51 | } 52 | trace!("eof"); 53 | // EOF, call on_finish 54 | let finish = { 55 | match zstd_writer.take() { Some(mut writer) => { 56 | writer.shutdown().await?; 57 | let res = writer.into_inner(); 58 | trace!("EOF"); 59 | if res.len() <= max_cache_size { 60 | trace!("writing {} bytes to cache", res.len()); 61 | (bytes_written, Some(res)) 62 | } else { 63 | trace!("cache longer than max, dropping"); 64 | (bytes_written, None) 65 | } 66 | } _ => { 67 | (bytes_written, None) 68 | }} 69 | }; 70 | 71 | // EOF, finish!
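        // Editor's note (an illustrative sketch, not from the original source):
        // `finish` is the (bytes_written, Option<compressed_zstd_bytes>) tuple that
        // the FinishHandler type alias above describes. A caller wanting to persist
        // small results could pass a boxed async closure along these lines, where
        // `cache` and `key` are hypothetical stand-ins for rga's real cache layer
        // (see preproc_cache.rs):
        //
        //   let on_finish: Box<FinishHandler> = Box::new(move |(_len, blob)| {
        //       Box::pin(async move {
        //           if let Some(zstd_bytes) = blob {
        //               cache.set(&key, zstd_bytes).await?; // hypothetical API
        //           }
        //           Ok(())
        //       })
        //   });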
72 | on_finish(finish).await.context("write_to_cache on_finish") 73 | .map_err(to_io_err)?; 74 | 75 | }; 76 | 77 | Ok(Box::pin(StreamReader::new(s))) 78 | } 79 | -------------------------------------------------------------------------------- /src/config.rs: -------------------------------------------------------------------------------- 1 | use crate::{adapters::custom::CustomAdapterConfig, project_dirs}; 2 | use anyhow::{Context, Result}; 3 | use derive_more::FromStr; 4 | use log::*; 5 | use schemars::JsonSchema; 6 | use serde::{Deserialize, Serialize}; 7 | use std::ffi::OsString; 8 | use std::io::Read; 9 | use std::{fs::File, io::Write, iter::IntoIterator, path::PathBuf, str::FromStr}; 10 | use structopt::StructOpt; 11 | 12 | #[derive(Debug, Deserialize, Serialize)] 13 | struct ReadableBytesCount(i64); 14 | 15 | fn is_default(t: &T) -> bool { 16 | t == &T::default() 17 | } 18 | #[derive(JsonSchema, Debug, Serialize, Deserialize, Copy, Clone, PartialEq, FromStr)] 19 | pub struct CacheCompressionLevel(pub i32); 20 | 21 | impl std::fmt::Display for CacheCompressionLevel { 22 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 23 | write!(f, "{}", self.0) 24 | } 25 | } 26 | impl Default for CacheCompressionLevel { 27 | fn default() -> Self { 28 | Self(12) 29 | } 30 | } 31 | #[derive(JsonSchema, Debug, Serialize, Deserialize, Copy, Clone, PartialEq, FromStr)] 32 | pub struct MaxArchiveRecursion(pub i32); 33 | 34 | impl std::fmt::Display for MaxArchiveRecursion { 35 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 36 | write!(f, "{}", self.0) 37 | } 38 | } 39 | impl Default for MaxArchiveRecursion { 40 | fn default() -> Self { 41 | Self(5) 42 | } 43 | } 44 | 45 | #[derive(JsonSchema, Debug, Serialize, Deserialize, Clone, PartialEq, FromStr)] 46 | pub struct CachePath(pub String); 47 | 48 | impl std::fmt::Display for CachePath { 49 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 50 | write!(f, "{}", self.0) 51 | } 52 | } 53 | impl Default for CachePath { 54 | fn default() -> Self { 55 | let pd = project_dirs().expect("could not get cache path"); 56 | let app_cache = pd.cache_dir(); 57 | Self(app_cache.to_str().expect("cache path not utf8").to_owned()) 58 | } 59 | } 60 | 61 | #[derive(JsonSchema, Debug, Serialize, Deserialize, Copy, Clone, PartialEq, Eq)] 62 | pub struct CacheMaxBlobLen(pub usize); 63 | 64 | impl std::fmt::Display for CacheMaxBlobLen { 65 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 66 | write!(f, "{}", self.0) 67 | } 68 | } 69 | impl Default for CacheMaxBlobLen { 70 | fn default() -> Self { 71 | Self(2000000) 72 | } 73 | } 74 | 75 | impl FromStr for CacheMaxBlobLen { 76 | type Err = anyhow::Error; 77 | fn from_str(s: &str) -> Result { 78 | let suffix = s.chars().last(); 79 | if let Some(suffix) = suffix { 80 | Ok(Self(match suffix { 81 | 'k' | 'M' | 'G' => usize::from_str(s.trim_end_matches(suffix)) 82 | .with_context(|| "Could not parse int".to_string()) 83 | .map(|e| { 84 | e * match suffix { 85 | 'k' => 1000, 86 | 'M' => 1_000_000, 87 | 'G' => 1_000_000_000, 88 | _ => panic!("impossible"), 89 | } 90 | }), 91 | _ => usize::from_str(s).with_context(|| "Could not parse int".to_string()), 92 | }?)) 93 | } else { 94 | Err(anyhow::format_err!("empty byte input")) 95 | } 96 | } 97 | } 98 | 99 | /// # rga configuration 100 | /// 101 | /// This is kind of a "polyglot" struct serving multiple purposes: 102 | /// 103 | /// 1. Declare the command line arguments using structopt+clap 104 | /// 1. 
Provide information for manpage / readme generation. 105 | /// 1. Describe the config file format (output as JSON schema via schemars). 106 | #[derive(StructOpt, Debug, Deserialize, Serialize, JsonSchema, Default, Clone)] 107 | #[structopt( 108 | name = "ripgrep-all", 109 | rename_all = "kebab-case", 110 | about = env!("CARGO_PKG_DESCRIPTION"), 111 | author = env!("CARGO_PKG_HOMEPAGE"), 112 | long_about="rga: ripgrep, but also search in PDFs, E-Books, Office documents, zip, tar.gz, etc.", 113 | // TODO: long_about does not seem to work to only show this on short help 114 | after_help = "-h shows a concise overview, --help shows more detail and advanced options.\n\nAll other options not shown here are passed directly to rg, especially [PATTERN] and [PATH ...]", 115 | usage = "rga [RGA OPTIONS] [RG OPTIONS] PATTERN [PATH ...]" 116 | )] 117 | pub struct RgaConfig { 118 | /// Use more accurate but slower matching by mime type. 119 | /// 120 | /// By default, rga will match files using file extensions. 121 | /// Some programs, such as sqlite3, don't care about the file extension at all, so users sometimes use any or no extension at all. 122 | /// With this flag, rga will try to detect the mime type of input files using the magic bytes (similar to the `file` utility), and use that to choose the adapter. 123 | /// Detection is only done on the first 8KiB of the file, since we can't always seek on the input (in archives). 124 | #[serde(default, skip_serializing_if = "is_default")] 125 | #[structopt(long = "--rga-accurate")] 126 | pub accurate: bool, 127 | 128 | /// Change which adapters to use and in which priority order (descending). 129 | /// 130 | /// - "foo,bar" means use only adapters foo and bar. 131 | /// - "-bar,baz" means use all default adapters except for bar and baz. 132 | /// - "+bar,baz" means use all default adapters and also bar and baz. 133 | #[serde(default, skip_serializing_if = "is_default")] 134 | #[structopt( 135 | long = "--rga-adapters", 136 | require_equals = true, 137 | require_delimiter = true 138 | )] 139 | pub adapters: Vec, 140 | 141 | #[serde(default, skip_serializing_if = "is_default")] 142 | #[structopt(flatten)] 143 | pub cache: CacheConfig, 144 | 145 | /// Maximum depth of nested archives to recurse into. 146 | /// 147 | /// When searching in archives, rga will recurse into archives inside archives. 148 | /// This option limits the depth. 149 | #[serde(default, skip_serializing_if = "is_default")] 150 | #[structopt( 151 | default_value, 152 | long = "--rga-max-archive-recursion", 153 | require_equals = true, 154 | hidden_short_help = true 155 | )] 156 | pub max_archive_recursion: MaxArchiveRecursion, 157 | 158 | /// Don't prefix lines of files within archive with the path inside the archive. 159 | /// 160 | /// Inside archives, by default rga prefixes the content of each file with the file path within the archive. 161 | /// This is usually useful, but can cause problems because then the inner path is also searched for the pattern. 162 | #[serde(default, skip_serializing_if = "is_default")] 163 | #[structopt(long = "--rga-no-prefix-filenames")] 164 | pub no_prefix_filenames: bool, 165 | 166 | #[serde(default, skip_serializing_if = "is_default")] 167 | #[structopt(skip)] // config file only 168 | pub custom_adapters: Option>, 169 | 170 | #[serde(skip)] 171 | #[structopt(long = "--rga-config-file", require_equals = true)] 172 | pub config_file_path: Option, 173 | 174 | /// Same as passing path directly, except if argument is empty. 
175 | /// 176 | /// Kinda hacky, but if no file is found, `fzf` calls `rga` with empty string as path, which causes "No such file or directory from rg". 177 | /// So filter those cases and return specially. 178 | #[serde(skip)] // CLI only 179 | #[structopt(long = "--rga-fzf-path", require_equals = true, hidden = true)] 180 | pub fzf_path: Option, 181 | 182 | #[serde(skip)] // CLI only 183 | #[structopt(long = "--rga-list-adapters", help = "List all known adapters")] 184 | pub list_adapters: bool, 185 | 186 | #[serde(skip)] // CLI only 187 | #[structopt( 188 | long = "--rga-print-config-schema", 189 | help = "Print the JSON Schema of the configuration file" 190 | )] 191 | pub print_config_schema: bool, 192 | 193 | #[serde(skip)] // CLI only 194 | #[structopt(long, help = "Show help for ripgrep itself")] 195 | pub rg_help: bool, 196 | 197 | #[serde(skip)] // CLI only 198 | #[structopt(long, help = "Show version of ripgrep itself")] 199 | pub rg_version: bool, 200 | } 201 | 202 | #[derive(StructOpt, Debug, Deserialize, Serialize, JsonSchema, Default, Clone, PartialEq)] 203 | pub struct CacheConfig { 204 | /// Disable caching of results. 205 | /// 206 | /// By default, rga caches the extracted text, if it is small enough, to a database. 207 | /// This way, repeated searches on the same set of files will be much faster. 208 | /// The location of the DB varies by platform: 209 | /// - `${XDG_CACHE_DIR-~/.cache}/ripgrep-all` on Linux 210 | /// - `~/Library/Caches/ripgrep-all` on macOS 211 | /// - `C:\Users\username\AppData\Local\ripgrep-all` on Windows 212 | /// 213 | /// If you pass this flag, all caching will be disabled. 214 | #[serde(default, skip_serializing_if = "is_default")] 215 | #[structopt(long = "--rga-no-cache")] 216 | pub disabled: bool, 217 | 218 | /// Max compressed size to cache. 219 | /// 220 | /// Longest byte length (after compression) to store in cache. 221 | /// Longer adapter outputs will not be cached and recomputed every time. 222 | /// 223 | /// Allowed suffixes on command line: k M G 224 | #[serde(default, skip_serializing_if = "is_default")] 225 | #[structopt( 226 | default_value, 227 | long = "--rga-cache-max-blob-len", 228 | hidden_short_help = true, 229 | require_equals = true, 230 | // parse(try_from_str = parse_readable_bytes_str) 231 | )] 232 | pub max_blob_len: CacheMaxBlobLen, 233 | 234 | /// ZSTD compression level to apply to adapter outputs before storing in cache DB. 235 | /// 236 | /// Ranges from 1 - 22. 237 | #[serde(default, skip_serializing_if = "is_default")] 238 | #[structopt( 239 | default_value, 240 | long = "--rga-cache-compression-level", 241 | hidden_short_help = true, 242 | require_equals = true, 243 | help = "" 244 | )] 245 | pub compression_level: CacheCompressionLevel, 246 | 247 | /// Path to store cache DB. 
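    /// (Editor's note, assuming the serde field layout of this struct: since the
    /// same struct also describes the config file, this setting corresponds to a
    /// config.jsonc entry along the lines of `{"cache": {"path": "/some/cache/dir"}}`;
    /// the path shown is hypothetical.)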
248 | #[serde(default, skip_serializing_if = "is_default")] 249 | #[structopt( 250 | default_value, 251 | long = "--rga-cache-path", 252 | hidden_short_help = true, 253 | require_equals = true 254 | )] 255 | pub path: CachePath, 256 | } 257 | 258 | static RGA_CONFIG: &str = "RGA_CONFIG"; 259 | 260 | use serde_json::Value; 261 | fn json_merge(a: &mut Value, b: &Value) { 262 | match (a, b) { 263 | (&mut Value::Object(ref mut a), Value::Object(b)) => { 264 | for (k, v) in b { 265 | json_merge(a.entry(k.clone()).or_insert(Value::Null), v); 266 | } 267 | } 268 | (a, b) => { 269 | *a = b.clone(); 270 | } 271 | } 272 | } 273 | 274 | fn read_config_file(path_override: Option) -> Result<(String, Value)> { 275 | let proj = project_dirs()?; 276 | let config_dir = proj.config_dir(); 277 | let config_filename = path_override 278 | .as_ref() 279 | .map(PathBuf::from) 280 | .unwrap_or_else(|| config_dir.join("config.jsonc")); 281 | let config_filename_str = config_filename.to_string_lossy().into_owned(); 282 | if config_filename.exists() { 283 | let config_file_contents = { 284 | let raw = std::fs::read_to_string(config_filename).with_context(|| { 285 | format!("Could not read config file json {config_filename_str}") 286 | })?; 287 | let mut s = String::new(); 288 | json_comments::StripComments::new(raw.as_bytes()) 289 | .read_to_string(&mut s) 290 | .context("strip comments")?; 291 | s 292 | }; 293 | { 294 | // just for error messages, actual deserialization happens after merging with cmd args 295 | serde_json::from_str::(&config_file_contents).with_context(|| { 296 | format!("Error in config file {config_filename_str}: {config_file_contents}") 297 | })?; 298 | } 299 | let config_json: serde_json::Value = 300 | serde_json::from_str(&config_file_contents).context("Could not parse config json")?; 301 | Ok((config_filename_str, config_json)) 302 | } else if let Some(p) = path_override.as_ref() { 303 | Err(anyhow::anyhow!("Config file not found: {}", p))? 
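        // Editor's note (illustrative, not from the original source): json_merge
        // above overwrites leaf values with the later argument, so in parse_args
        // below the effective precedence is: command-line args > RGA_CONFIG env
        // var > config file. For example:
        //
        //   let mut a = serde_json::json!({"cache": {"disabled": false, "path": "/a"}});
        //   json_merge(&mut a, &serde_json::json!({"cache": {"disabled": true}}));
        //   assert_eq!(a, serde_json::json!({"cache": {"disabled": true, "path": "/a"}}));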
304 | } else { 305 | // write default config 306 | std::fs::create_dir_all(config_dir)?; 307 | let mut schemafile = File::create(config_dir.join("config.v1.schema.json"))?; 308 | 309 | schemafile.write_all( 310 | serde_json::to_string_pretty(&schemars::schema_for!(RgaConfig))?.as_bytes(), 311 | )?; 312 | 313 | let mut configfile = File::create(config_filename)?; 314 | configfile.write_all(include_str!("../doc/config.default.jsonc").as_bytes())?; 315 | Ok(( 316 | config_filename_str, 317 | serde_json::Value::Object(Default::default()), 318 | )) 319 | } 320 | } 321 | fn read_config_env() -> Result { 322 | let val = std::env::var(RGA_CONFIG).ok(); 323 | if let Some(val) = val { 324 | serde_json::from_str(&val).context("could not parse config from env RGA_CONFIG") 325 | } else { 326 | serde_json::to_value(RgaConfig::default()).context("could not create default config") 327 | } 328 | } 329 | pub fn parse_args(args: I, is_rga_preproc: bool) -> Result 330 | where 331 | I: IntoIterator, 332 | I::Item: Into + Clone, 333 | { 334 | // TODO: don't read config file in rga-preproc for performance (called for every file) 335 | 336 | let arg_matches: RgaConfig = RgaConfig::from_iter(args); 337 | let args_config = serde_json::to_value(&arg_matches)?; 338 | 339 | let merged_config = { 340 | if is_rga_preproc { 341 | // only read from env and args 342 | let mut merged_config = read_config_env()?; 343 | json_merge(&mut merged_config, &args_config); 344 | log::debug!("Config: {}", serde_json::to_string(&merged_config)?); 345 | merged_config 346 | } else { 347 | // read from config file, env and args 348 | let (config_filename, config_file_config) = 349 | read_config_file(arg_matches.config_file_path)?; 350 | let env_var_config = read_config_env()?; 351 | let mut merged_config = config_file_config.clone(); 352 | json_merge(&mut merged_config, &env_var_config); 353 | json_merge(&mut merged_config, &args_config); 354 | log::debug!( 355 | "Configs:\n{}: {}\n{}: {}\nArgs: {}\nMerged: {}", 356 | config_filename, 357 | serde_json::to_string_pretty(&config_file_config)?, 358 | RGA_CONFIG, 359 | serde_json::to_string_pretty(&env_var_config)?, 360 | serde_json::to_string_pretty(&args_config)?, 361 | serde_json::to_string_pretty(&merged_config)? 362 | ); 363 | // pass to child processes 364 | // TODO: Audit that the environment access only happens in single-threaded code. 
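        // Editor's note: exporting RGA_CONFIG here is how the merged settings reach
        // the rga-preproc child processes that ripgrep spawns per file via `--pre`;
        // rga-preproc reads the variable back in read_config_env() instead of
        // re-parsing the config file. The same mechanism can be exercised by hand
        // (hypothetical invocation):
        //
        //   RGA_CONFIG='{"cache":{"disabled":true}}' rga-preproc some.pdf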
365 | unsafe { std::env::set_var(RGA_CONFIG, merged_config.to_string()) }; 366 | merged_config 367 | } 368 | }; 369 | 370 | let mut res: RgaConfig = serde_json::from_value(merged_config.clone()) 371 | .map_err(|e| { 372 | println!("{e:?}"); 373 | e 374 | }) 375 | .with_context(|| { 376 | format!( 377 | "Error parsing merged config: {}", 378 | serde_json::to_string_pretty(&merged_config).expect("no tostring") 379 | ) 380 | })?; 381 | { 382 | // readd values with [serde(skip)] 383 | res.fzf_path = arg_matches.fzf_path; 384 | res.list_adapters = arg_matches.list_adapters; 385 | res.print_config_schema = arg_matches.print_config_schema; 386 | res.rg_help = arg_matches.rg_help; 387 | res.rg_version = arg_matches.rg_version; 388 | } 389 | Ok(res) 390 | } 391 | 392 | /// Split arguments into the ones we care about and the ones rg cares about 393 | pub fn split_args(is_rga_preproc: bool) -> Result<(RgaConfig, Vec)> { 394 | let mut app = RgaConfig::clap(); 395 | 396 | app.p.create_help_and_version(); 397 | let mut firstarg = true; 398 | // debug!("{:#?}", app.p.flags); 399 | let (our_args, mut passthrough_args): (Vec, Vec) = std::env::args_os() 400 | .partition(|os_arg| { 401 | if firstarg { 402 | // hacky, but .enumerate() would be ugly because partition is too simplistic 403 | firstarg = false; 404 | return true; 405 | } 406 | if let Some(arg) = os_arg.to_str() { 407 | arg.starts_with("--rga-") 408 | || arg.starts_with("--rg-") 409 | || arg == "--help" 410 | || arg == "-h" 411 | || arg == "--version" 412 | || arg == "-V" 413 | } else { 414 | // args that are not unicode can only be filenames, pass them to rg 415 | false 416 | } 417 | }); 418 | debug!("rga (our) args: {:?}", our_args); 419 | let matches = parse_args(our_args, is_rga_preproc).context("Could not parse config")?; 420 | if matches.rg_help { 421 | passthrough_args.insert(0, "--help".into()); 422 | } 423 | if matches.rg_version { 424 | passthrough_args.insert(0, "--version".into()); 425 | } 426 | debug!("rga (passthrough) args: {:?}", passthrough_args); 427 | Ok((matches, passthrough_args)) 428 | } 429 | -------------------------------------------------------------------------------- /src/expand.rs: -------------------------------------------------------------------------------- 1 | use std::borrow::Cow; 2 | 3 | use anyhow::Result; 4 | 5 | // from https://github.com/phiresky/timetrackrs/blob/1c3df09ba2c1fda6065f2927045bd28dea0738d3/src/expand.rs 6 | 7 | pub fn find_byte(needle: u8, haystack: &[u8]) -> Option { 8 | #[cfg(not(feature = "perf-literal"))] 9 | fn imp(needle: u8, haystack: &[u8]) -> Option { 10 | haystack.iter().position(|&b| b == needle) 11 | } 12 | 13 | #[cfg(feature = "perf-literal")] 14 | fn imp(needle: u8, haystack: &[u8]) -> Option { 15 | use memchr::memchr; 16 | memchr(needle, haystack) 17 | } 18 | 19 | imp(needle, haystack) 20 | } 21 | 22 | pub fn expand_str_ez<'a, F>(replacement: &'a str, lambda: F) -> Result 23 | where 24 | F: Fn(&str) -> Result>, 25 | { 26 | let mut dst = String::new(); 27 | expand_str_lambda(lambda, replacement, &mut dst)?; 28 | Ok(dst) 29 | } 30 | 31 | pub fn expand_str_lambda<'a, F>(cap: F, replacement: &'a str, dst: &mut String) -> Result<()> 32 | where 33 | F: Fn(&str) -> Result>, 34 | { 35 | let mut replacement = replacement; 36 | while !replacement.is_empty() { 37 | match find_byte(b'$', replacement.as_bytes()) { 38 | None => break, 39 | Some(i) => { 40 | dst.push_str(&replacement[..i]); 41 | replacement = &replacement[i..]; 42 | } 43 | } 44 | if 
--------------------------------------------------------------------------------
/src/expand.rs:
--------------------------------------------------------------------------------
1 | use std::borrow::Cow;
2 | 
3 | use anyhow::Result;
4 | 
5 | // from https://github.com/phiresky/timetrackrs/blob/1c3df09ba2c1fda6065f2927045bd28dea0738d3/src/expand.rs
6 | 
7 | pub fn find_byte(needle: u8, haystack: &[u8]) -> Option<usize> {
8 |     #[cfg(not(feature = "perf-literal"))]
9 |     fn imp(needle: u8, haystack: &[u8]) -> Option<usize> {
10 |         haystack.iter().position(|&b| b == needle)
11 |     }
12 | 
13 |     #[cfg(feature = "perf-literal")]
14 |     fn imp(needle: u8, haystack: &[u8]) -> Option<usize> {
15 |         use memchr::memchr;
16 |         memchr(needle, haystack)
17 |     }
18 | 
19 |     imp(needle, haystack)
20 | }
21 | 
22 | pub fn expand_str_ez<'a, F>(replacement: &'a str, lambda: F) -> Result<String>
23 | where
24 |     F: Fn(&str) -> Result<Cow<'a, str>>,
25 | {
26 |     let mut dst = String::new();
27 |     expand_str_lambda(lambda, replacement, &mut dst)?;
28 |     Ok(dst)
29 | }
30 | 
31 | pub fn expand_str_lambda<'a, F>(cap: F, replacement: &'a str, dst: &mut String) -> Result<()>
32 | where
33 |     F: Fn(&str) -> Result<Cow<'a, str>>,
34 | {
35 |     let mut replacement = replacement;
36 |     while !replacement.is_empty() {
37 |         match find_byte(b'$', replacement.as_bytes()) {
38 |             None => break,
39 |             Some(i) => {
40 |                 dst.push_str(&replacement[..i]);
41 |                 replacement = &replacement[i..];
42 |             }
43 |         }
44 |         if replacement.as_bytes().get(1).is_some_and(|&b| b == b'$') {
45 |             dst.push('$');
46 |             replacement = &replacement[2..];
47 |             continue;
48 |         }
49 |         debug_assert!(!replacement.is_empty());
50 |         let cap_ref = match find_cap_ref(replacement.as_bytes()) {
51 |             Some(cap_ref) => cap_ref,
52 |             None => {
53 |                 dst.push('$');
54 |                 replacement = &replacement[1..];
55 |                 continue;
56 |             }
57 |         };
58 |         replacement = &replacement[cap_ref.end..];
59 |         dst.push_str(cap(cap_ref.cap)?.as_ref());
60 |     }
61 |     dst.push_str(replacement);
62 |     Ok(())
63 | }
64 | 
65 | /// `CaptureRef` represents a reference to a capture group inside some text.
66 | /// The reference is either a capture group name or a number.
67 | ///
68 | /// It is also tagged with the position in the text following the
69 | /// capture reference.
70 | #[derive(Clone, Copy, Debug, Eq, PartialEq)]
71 | struct CaptureRef<'a> {
72 |     cap: &'a str,
73 |     end: usize,
74 | }
75 | 
76 | /// Parses a possible reference to a capture group name in the given text,
77 | /// starting at the beginning of `replacement`.
78 | ///
79 | /// If no such valid reference could be found, None is returned.
80 | fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef<'_>> {
81 |     let mut i = 0;
82 |     let rep: &[u8] = replacement;
83 |     if rep.len() <= 1 || rep[0] != b'$' {
84 |         return None;
85 |     }
86 |     i += 1;
87 |     if rep[i] == b'{' {
88 |         return find_cap_ref_braced(rep, i + 1);
89 |     }
90 |     let mut cap_end = i;
91 |     while rep.get(cap_end).is_some_and(is_valid_cap_letter) {
92 |         cap_end += 1;
93 |     }
94 |     if cap_end == i {
95 |         return None;
96 |     }
97 |     // We just verified that the range 0..cap_end is valid ASCII, so it must
98 |     // therefore be valid UTF-8. If we really cared, we could avoid this UTF-8
99 |     // check with either unsafe or by parsing the number straight from &[u8].
100 |     let cap = std::str::from_utf8(&rep[i..cap_end]).expect("valid UTF-8 capture name");
101 |     Some(CaptureRef { cap, end: cap_end })
102 | }
103 | 
104 | fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef<'_>> {
105 |     let start = i;
106 |     while rep.get(i).is_some_and(|&b| b != b'}') {
107 |         i += 1;
108 |     }
109 |     if rep.get(i).is_none_or(|&b| b != b'}') {
110 |         return None;
111 |     }
112 |     // When looking at braced names, we don't put any restrictions on the name,
113 |     // so it's possible it could be invalid UTF-8. But a capture group name
114 |     // can never be invalid UTF-8, so if we have invalid UTF-8, then we can
115 |     // safely return None.
116 |     let cap = match std::str::from_utf8(&rep[start..i]) {
117 |         Err(_) => return None,
118 |         Ok(cap) => cap,
119 |     };
120 |     Some(CaptureRef { cap, end: i + 1 })
121 | }
122 | 
123 | /// Returns true if and only if the given byte is allowed in a capture name.
124 | fn is_valid_cap_letter(b: &u8) -> bool {
125 |     matches!(b, b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_')
126 | }
127 | 
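// A usage sketch for the expander above: `$name`/`${name}` references are
// resolved through the lambda, and `$$` escapes a literal dollar sign. The
// capture names and values here are made up for illustration.
#[cfg(test)]
mod expand_sketch {
    use super::expand_str_ez;
    use std::borrow::Cow;

    #[test]
    fn expands_refs_and_escapes() -> anyhow::Result<()> {
        let out = expand_str_ez("page $page of ${total}, price $$5", |cap| {
            Ok(Cow::Borrowed(match cap {
                "page" => "1",
                "total" => "2",
                other => panic!("unexpected capture {other}"),
            }))
        })?;
        assert_eq!(out, "page 1 of 2, price $5");
        Ok(())
    }
}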
--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
1 | #![warn(clippy::all)]
2 | 
3 | pub mod adapted_iter;
4 | pub mod adapters;
5 | mod caching_writer;
6 | pub mod config;
7 | pub mod expand;
8 | pub mod matching;
9 | pub mod preproc;
10 | pub mod preproc_cache;
11 | pub mod recurse;
12 | #[cfg(test)]
13 | pub mod test_utils;
14 | use anyhow::Context;
15 | use anyhow::Result;
16 | use async_stream::stream;
17 | use directories_next::ProjectDirs;
18 | use std::time::Instant;
19 | use tokio::io::AsyncRead;
20 | use tokio::task::JoinHandle;
21 | use tokio_util::io::StreamReader;
22 | 
23 | pub fn project_dirs() -> Result<ProjectDirs> {
24 |     directories_next::ProjectDirs::from("", "", "ripgrep-all")
25 |         .context("no home directory found! :(")
26 | }
27 | 
28 | // no "significant digits" format specifier in rust??
29 | // https://stackoverflow.com/questions/60497397/how-do-you-format-a-float-to-the-first-significant-decimal-and-with-specified-pr
30 | fn meh(float: f32, precision: usize) -> usize {
31 |     // compute absolute value
32 |     let a = float.abs();
33 | 
34 |     // if abs value is greater than 1, then precision becomes less than "standard"
35 | 
36 |     if a >= 1. {
37 |         // reduce by number of digits, minimum 0
38 |         let n = (1. + a.log10().floor()) as usize;
39 |         precision.saturating_sub(n)
40 |     // if abs value is less than 1 (but non-zero), then precision becomes greater than "standard"
41 |     } else if a > 0. {
42 |         // increase number of digits
43 |         let n = -(1. + a.log10().floor()) as usize;
44 |         precision + n
45 |     // special case for 0
46 |     } else {
47 |         0
48 |     }
49 | }
50 | 
51 | pub fn print_dur(start: Instant) -> String {
52 |     let mut dur = Instant::now().duration_since(start).as_secs_f32();
53 |     let mut suffix = "";
54 |     if dur < 0.1 {
55 |         suffix = "m";
56 |         dur *= 1000.0;
57 |     }
58 |     let precision = meh(dur, 3);
59 |     format!("{dur:.precision$}{suffix}s")
60 | }
61 | 
62 | pub fn print_bytes(bytes: impl Into<f64>) -> String {
63 |     pretty_bytes::converter::convert(bytes.into())
64 | }
65 | 
66 | pub fn to_io_err(e: anyhow::Error) -> std::io::Error {
67 |     std::io::Error::new(std::io::ErrorKind::Other, e)
68 | }
69 | 
70 | #[cfg(test)]
71 | #[ctor::ctor]
72 | fn init() {
73 |     env_logger::init();
74 | }
75 | 
76 | /** returns an AsyncRead that is empty but returns an io error if the given task had an io error or join error */
77 | pub fn join_handle_to_stream(join: JoinHandle<Result<()>>) -> impl AsyncRead {
78 |     let st = stream! {
79 |         join.await.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))??;
80 |         yield std::io::Result::Ok(&b""[..])
81 |     };
82 | 
83 |     StreamReader::new(st)
84 | }
85 | 
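// A quick check of the helper above: `meh` computes how many decimal places
// are needed to show roughly `precision` significant digits.
#[cfg(test)]
mod sig_digits_sketch {
    #[test]
    fn precision_scales_with_magnitude() {
        assert_eq!(super::meh(1.234, 3), 2); // printed as 1.23
        assert_eq!(super::meh(12.34, 3), 1); // printed as 12.3
        assert_eq!(super::meh(123.4, 3), 0); // printed as 123
        assert_eq!(super::meh(0.1234, 3), 3); // printed as 0.123
        assert_eq!(super::meh(0.0, 3), 0); // special case for 0
    }
}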
--------------------------------------------------------------------------------
/src/matching.rs:
--------------------------------------------------------------------------------
1 | /**
2 |  * Module for matching adapters to files based on file name or mime type
3 |  */
4 | use crate::adapters::*;
5 | 
6 | use anyhow::*;
7 | 
8 | use regex::{Regex, RegexSet};
9 | 
10 | use std::iter::Iterator;
11 | 
12 | use std::sync::Arc;
13 | 
14 | // match only based on file path
15 | #[derive(Clone, Debug)]
16 | pub enum FastFileMatcher {
17 |     // MimeType(Regex),
18 |     /**
19 |      * without the leading dot, e.g. "jpg" or "tar.gz". Matched as /.*\.ext$/
20 |      *
21 |      */
22 |     FileExtension(String),
23 |     // todo: maybe add others, e.g. regex on whole filename or even paths
24 |     // todo: maybe allow matching a directory (e.g. /var/lib/postgres)
25 | }
26 | 
27 | #[derive(Clone, Debug)]
28 | pub enum FileMatcher {
29 |     /// any type of fast matcher
30 |     Fast(FastFileMatcher),
31 |     ///
32 |     /// match by exact mime type extracted using tree_magic
33 |     /// TODO: allow match ignoring suffix etc?
34 |     MimeType(String),
35 | }
36 | 
37 | impl From<FastFileMatcher> for FileMatcher {
38 |     fn from(t: FastFileMatcher) -> Self {
39 |         Self::Fast(t)
40 |     }
41 | }
42 | 
43 | pub struct FileMeta {
44 |     // filename is not actually a utf8 string, but since we can't do regex on OsStr and can't get a &[u8] from OsStr either,
45 |     // and since we probably only want to do matching on ascii stuff anyways, this is the filename as a string with non-valid bytes removed
46 |     pub lossy_filename: String,
47 |     // only given when slow matching is enabled
48 |     pub mimetype: Option<&'static str>,
49 | }
50 | 
51 | pub fn extension_to_regex(extension: &str) -> Regex {
52 |     Regex::new(&format!("(?i)\\.{}$", &regex::escape(extension)))
53 |         .expect("we know this regex compiles")
54 | }
55 | 
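// A tiny check of the matcher construction above: the extension is regex-
// escaped and matched case-insensitively against the end of the filename.
#[cfg(test)]
mod extension_regex_sketch {
    use super::extension_to_regex;

    #[test]
    fn case_insensitive_suffix_match() {
        let re = extension_to_regex("tar.gz");
        assert!(re.is_match("Backup.TAR.GZ")); // (?i) ignores case
        assert!(!re.is_match("targz")); // the dot is escaped, so it must be literal
        assert!(!re.is_match("x.tar.gz.tmp")); // anchored at the end
    }
}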
56 | #[allow(clippy::type_complexity)]
57 | pub fn adapter_matcher(
58 |     adapters: &[Arc<dyn FileAdapter>],
59 |     slow: bool,
60 | ) -> Result<impl Fn(FileMeta) -> Option<(Arc<dyn FileAdapter>, FileMatcher)> + use<>> {
61 |     // need order later
62 |     let adapter_names: Vec<String> = adapters.iter().map(|e| e.metadata().name.clone()).collect();
63 |     let mut fname_regexes = vec![];
64 |     let mut mime_regexes = vec![];
65 |     for adapter in adapters.iter() {
66 |         let metadata = adapter.metadata();
67 |         use FileMatcher::*;
68 |         for matcher in metadata.get_matchers(slow) {
69 |             match matcher.as_ref() {
70 |                 MimeType(re) => {
71 |                     mime_regexes.push((re.clone(), adapter.clone(), MimeType(re.clone())))
72 |                 }
73 |                 Fast(FastFileMatcher::FileExtension(re)) => fname_regexes.push((
74 |                     extension_to_regex(re),
75 |                     adapter.clone(),
76 |                     Fast(FastFileMatcher::FileExtension(re.clone())),
77 |                 )),
78 |             };
79 |         }
80 |     }
81 |     let fname_regex_set = RegexSet::new(fname_regexes.iter().map(|p| p.0.as_str()))?;
82 |     let mime_regex_set = RegexSet::new(mime_regexes.iter().map(|p| p.0.as_str()))?;
83 |     Ok(move |meta: FileMeta| {
84 |         let fname_matches: Vec<_> = fname_regex_set
85 |             .matches(&meta.lossy_filename)
86 |             .into_iter()
87 |             .collect();
88 |         let mime_matches: Vec<_> = if slow {
89 |             mime_regex_set
90 |                 .matches(meta.mimetype.expect("No mimetype?"))
91 |                 .into_iter()
92 |                 .collect()
93 |         } else {
94 |             vec![]
95 |         };
96 |         if fname_matches.len() + mime_matches.len() > 1 {
97 |             // get first according to original priority list...
98 |             // todo: kinda ugly
99 |             let fa = fname_matches
100 |                 .iter()
101 |                 .map(|e| (fname_regexes[*e].1.clone(), fname_regexes[*e].2.clone()));
102 |             let fb = mime_matches
103 |                 .iter()
104 |                 .map(|e| (mime_regexes[*e].1.clone(), mime_regexes[*e].2.clone()));
105 |             let mut v = vec![];
106 |             v.extend(fa);
107 |             v.extend(fb);
108 |             v.sort_by_key(|e| {
109 |                 adapter_names
110 |                     .iter()
111 |                     .position(|r| r == &e.0.metadata().name)
112 |                     .expect("impossib7")
113 |             });
114 |             eprintln!(
115 |                 "Warning: found multiple adapters for {}:",
116 |                 meta.lossy_filename
117 |             );
118 |             for mmatch in v.iter() {
119 |                 eprintln!(" - {}", mmatch.0.metadata().name);
120 |             }
121 |             return Some(v[0].clone());
122 |         }
123 |         if mime_matches.is_empty() {
124 |             if fname_matches.is_empty() {
125 |                 None
126 |             } else {
127 |                 let (_, adapter, matcher) = &fname_regexes[fname_matches[0]];
128 |                 Some((adapter.clone(), matcher.clone()))
129 |             }
130 |         } else {
131 |             let (_, adapter, matcher) = &mime_regexes[mime_matches[0]];
132 |             Some((adapter.clone(), matcher.clone()))
133 |         }
134 |     })
135 | }
136 | 
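// A hedged mini-sketch of the dispatch mechanism above: RegexSet::matches
// yields the indices of all patterns that hit, and adapter_matcher maps those
// indices back into its adapter list, sorting by the original priority order.
#[cfg(test)]
mod regex_set_sketch {
    use regex::RegexSet;

    #[test]
    fn match_indices_point_into_the_pattern_list() {
        let set = RegexSet::new([r"(?i)\.tar\.gz$", r"(?i)\.gz$"]).unwrap();
        let matches: Vec<usize> = set.matches("backup.tar.gz").into_iter().collect();
        assert_eq!(matches, vec![0, 1]); // two candidates; priority picks the first
    }
}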
--------------------------------------------------------------------------------
/src/preproc.rs:
--------------------------------------------------------------------------------
1 | use crate::adapted_iter::AdaptedFilesIterBox;
2 | use crate::adapters::*;
3 | use crate::caching_writer::async_read_and_write_to_cache;
4 | use crate::config::RgaConfig;
5 | use crate::matching::*;
6 | use crate::preproc_cache::CacheKey;
7 | use crate::recurse::concat_read_streams;
8 | use crate::{
9 |     preproc_cache::{PreprocCache, open_cache_db},
10 |     print_bytes,
11 | };
12 | use anyhow::*;
13 | use async_compression::tokio::bufread::ZstdDecoder;
14 | use async_stream::stream;
15 | // use futures::future::{BoxFuture, FutureExt};
16 | use log::*;
17 | use postproc::PostprocPrefix;
18 | use std::future::Future;
19 | use std::io::Cursor;
20 | use std::path::Path;
21 | use std::pin::Pin;
22 | use std::sync::Arc;
23 | use tokio::io::AsyncBufReadExt;
24 | use tokio::io::BufReader;
25 | use tokio::io::{AsyncBufRead, AsyncReadExt};
26 | 
27 | pub type ActiveAdapters = Vec<Arc<dyn FileAdapter>>;
28 | 
29 | async fn choose_adapter(
30 |     config: &RgaConfig,
31 |     filepath_hint: &Path,
32 |     archive_recursion_depth: i32,
33 |     inp: &mut (impl AsyncBufRead + Unpin),
34 | ) -> Result<Option<(Arc<dyn FileAdapter>, FileMatcher, ActiveAdapters)>> {
35 |     let active_adapters = get_adapters_filtered(config.custom_adapters.clone(), &config.adapters)?;
36 |     let adapters = adapter_matcher(&active_adapters, config.accurate)?;
37 |     let filename = filepath_hint
38 |         .file_name()
39 |         .ok_or_else(|| format_err!("Empty filename"))?;
40 |     debug!("Archive recursion depth: {}", archive_recursion_depth);
41 | 
42 |     let mimetype = if config.accurate {
43 |         let buf = inp.fill_buf().await?; // fill but do not consume!
44 |         if buf.starts_with(b"From \x0d") || buf.starts_with(b"From -") {
45 |             Some("application/mbox")
46 |         } else {
47 |             let mimetype = tree_magic::from_u8(buf);
48 |             debug!("mimetype: {:?}", mimetype);
49 |             Some(mimetype)
50 |         }
51 |     } else {
52 |         None
53 |     };
54 |     let adapter = adapters(FileMeta {
55 |         mimetype,
56 |         lossy_filename: filename.to_string_lossy().to_string(),
57 |     });
58 |     Ok(adapter.map(|e| (e.0, e.1, active_adapters)))
59 | }
60 | 
61 | enum Ret {
62 |     Recurse(AdaptInfo, Arc<dyn FileAdapter>, FileMatcher, ActiveAdapters),
63 |     Passthrough(AdaptInfo),
64 | }
65 | async fn buf_choose_adapter(ai: AdaptInfo) -> Result<Ret> {
66 |     let mut inp = BufReader::with_capacity(1 << 16, ai.inp);
67 |     let adapter = choose_adapter(
68 |         &ai.config,
69 |         &ai.filepath_hint,
70 |         ai.archive_recursion_depth,
71 |         &mut inp,
72 |     )
73 |     .await?;
74 |     let ai = AdaptInfo {
75 |         inp: Box::pin(inp),
76 |         ..ai
77 |     };
78 |     let (a, b, c) = match adapter {
79 |         Some(x) => x,
80 |         None => {
81 |             // allow passthrough if the file is in an archive or accurate matching is enabled
82 |             // otherwise it should have been filtered out by rg pre-glob since rg can handle those better than us
83 |             let allow_cat = !ai.is_real_file || ai.config.accurate;
84 |             if allow_cat {
85 |                 if ai.postprocess {
86 |                     (
87 |                         Arc::new(PostprocPrefix {}) as Arc<dyn FileAdapter>,
88 |                         FileMatcher::Fast(FastFileMatcher::FileExtension("default".to_string())),
89 |                         Vec::new(),
90 |                     )
91 |                 } else {
92 |                     return Ok(Ret::Passthrough(ai));
93 |                 }
94 |             } else {
95 |                 return Err(format_err!(
96 |                     "No adapter found for file {:?}, passthrough disabled.",
97 |                     ai.filepath_hint
98 |                         .file_name()
99 |                         .ok_or_else(|| format_err!("Empty filename"))?
100 |                 ));
101 |             }
102 |         }
103 |     };
104 |     Ok(Ret::Recurse(ai, a, b, c))
105 | }
106 | 
107 | /**
108 |  * preprocess a file as defined in `ai`.
109 |  *
110 |  * If a cache is passed, read/write to it.
111 |  *
112 |  */
113 | pub async fn rga_preproc(ai: AdaptInfo) -> Result<ReadBox> {
114 |     debug!("path (hint) to preprocess: {:?}", ai.filepath_hint);
115 | 
116 |     // todo: figure out when using a bufreader is a good idea and when it is not
117 |     // seems to be good for File::open() reads, but not sure about within archives (tar, zip)
118 |     let (ai, adapter, detection_reason, active_adapters) = match buf_choose_adapter(ai).await? {
119 |         Ret::Recurse(ai, a, b, c) => (ai, a, b, c),
120 |         Ret::Passthrough(ai) => {
121 |             return Ok(ai.inp);
122 |         }
123 |     };
124 |     let path_hint_copy = ai.filepath_hint.clone();
125 |     adapt_caching(ai, adapter, detection_reason, active_adapters)
126 |         .await
127 |         .with_context(|| format!("run_adapter({})", &path_hint_copy.to_string_lossy()))
128 | }
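// A small illustration of the peek trick in choose_adapter above: fill_buf
// exposes buffered bytes without consuming them, so sniffing the mime type
// leaves the full stream intact for the adapter that runs afterwards.
#[cfg(test)]
mod peek_sketch {
    use tokio::io::{AsyncBufReadExt, AsyncReadExt, BufReader};

    #[tokio::test]
    async fn fill_buf_does_not_consume() -> std::io::Result<()> {
        let mut r = BufReader::new(&b"From -rest of the mbox"[..]);
        assert!(r.fill_buf().await?.starts_with(b"From -"));
        let mut all = Vec::new();
        r.read_to_end(&mut all).await?; // the sniffed bytes are still here
        assert_eq!(all, b"From -rest of the mbox");
        Ok(())
    }
}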
129 | 
130 | async fn adapt_caching(
131 |     ai: AdaptInfo,
132 |     adapter: Arc<dyn FileAdapter>,
133 |     detection_reason: FileMatcher,
134 |     active_adapters: ActiveAdapters,
135 | ) -> Result<ReadBox> {
136 |     let meta = adapter.metadata();
137 |     debug!(
138 |         "Chose adapter '{}' because of matcher {:?}",
139 |         &meta.name, &detection_reason
140 |     );
141 |     eprintln!(
142 |         "{} adapter: {}",
143 |         ai.filepath_hint.to_string_lossy(),
144 |         &meta.name
145 |     );
146 |     let cache_compression_level = ai.config.cache.compression_level;
147 |     let cache_max_blob_len = ai.config.cache.max_blob_len;
148 | 
149 |     let cache = if ai.is_real_file && !ai.config.cache.disabled {
150 |         Some(open_cache_db(Path::new(&ai.config.cache.path.0)).await?)
151 |     } else {
152 |         None
153 |     };
154 | 
155 |     let mut cache = cache.context("No cache?")?;
156 |     let cache_key = CacheKey::new(
157 |         ai.postprocess,
158 |         &ai.filepath_hint,
159 |         adapter.as_ref(),
160 |         &active_adapters,
161 |     )?;
162 |     // let dbg_ctx = format!("adapter {}", &adapter.metadata().name);
163 |     let cached = cache.get(&cache_key).await.context("cache.get")?;
164 |     match cached {
165 |         Some(cached) => Ok(Box::pin(ZstdDecoder::new(Cursor::new(cached)))),
166 |         None => {
167 |             debug!("cache MISS, running adapter with caching...");
168 |             let inp = loop_adapt(adapter.as_ref(), detection_reason, ai).await?;
169 |             let inp = concat_read_streams(inp);
170 |             let inp = async_read_and_write_to_cache(
171 |                 inp,
172 |                 cache_max_blob_len.0,
173 |                 cache_compression_level.0,
174 |                 Box::new(move |(uncompressed_size, compressed)| {
175 |                     Box::pin(async move {
176 |                         debug!(
177 |                             "uncompressed output: {}",
178 |                             print_bytes(uncompressed_size as f64)
179 |                         );
180 |                         if let Some(cached) = compressed {
181 |                             debug!("compressed output: {}", print_bytes(cached.len() as f64));
182 |                             cache
183 |                                 .set(&cache_key, cached)
184 |                                 .await
185 |                                 .context("writing to cache")?
186 |                         }
187 |                         Ok(())
188 |                     })
189 |                 }),
190 |             )?;
191 | 
192 |             Ok(Box::pin(inp))
193 |         }
194 |     }
195 | }
196 | 
197 | async fn read_discard(mut x: ReadBox) -> Result<()> {
198 |     let mut buf = [0u8; 1 << 16];
199 |     loop {
200 |         let n = x.read(&mut buf).await?;
201 |         if n == 0 {
202 |             break;
203 |         }
204 |     }
205 |     Ok(())
206 | }
207 | 
208 | pub fn loop_adapt(
209 |     adapter: &dyn FileAdapter,
210 |     detection_reason: FileMatcher,
211 |     ai: AdaptInfo,
212 | ) -> Pin<Box<dyn Future<Output = Result<AdaptedFilesIterBox>> + Send + '_>> {
213 |     Box::pin(async move { loop_adapt_inner(adapter, detection_reason, ai).await })
214 | }
215 | pub async fn loop_adapt_inner(
216 |     adapter: &dyn FileAdapter,
217 |     detection_reason: FileMatcher,
218 |     ai: AdaptInfo,
219 | ) -> anyhow::Result<AdaptedFilesIterBox> {
220 |     let fph = ai.filepath_hint.clone();
221 |     let inp = adapter.adapt(ai, &detection_reason).await;
222 |     let inp = if adapter.metadata().name == "postprocprefix" {
223 |         // don't add confusing error context
224 |         inp?
225 |     } else {
226 |         inp.with_context(|| {
227 |             format!(
228 |                 "adapting {} via {} failed",
229 |                 fph.to_string_lossy(),
230 |                 adapter.metadata().name
231 |             )
232 |         })?
233 |     };
234 |     let s = stream! {
235 |         for await file in inp {
236 |             trace!("next file");
237 |             match buf_choose_adapter(file?).await? {
238 |                 Ret::Recurse(ai, adapter, detection_reason, _active_adapters) => {
239 |                     if ai.archive_recursion_depth >= ai.config.max_archive_recursion.0 {
240 |                         // some adapters (esp. zip) assume that the entry is read fully and might hang otherwise
241 |                         read_discard(ai.inp).await?;
242 |                         let s = format!("{}[rga: max archive recursion reached ({})]\n", ai.line_prefix, ai.archive_recursion_depth).into_bytes();
243 |                         yield Ok(AdaptInfo {
244 |                             inp: Box::pin(Cursor::new(s)),
245 |                             ..ai
246 |                         });
247 |                         continue;
248 |                     }
249 |                     debug!(
250 |                         "Chose adapter '{}' because of matcher {:?}",
251 |                         &adapter.metadata().name, &detection_reason
252 |                     );
253 |                     eprintln!(
254 |                         "{} adapter: {}",
255 |                         ai.filepath_hint.to_string_lossy(),
256 |                         &adapter.metadata().name
257 |                     );
258 |                     for await ifile in loop_adapt(adapter.as_ref(), detection_reason, ai).await? {
259 |                         yield ifile;
260 |                     }
261 |                 }
262 |                 Ret::Passthrough(ai) => {
263 |                     debug!("no adapter for {}, ending recursion", ai.filepath_hint.to_string_lossy());
264 |                     yield Ok(ai);
265 |                 }
266 |             }
267 |             trace!("done with files");
268 |         }
269 |         trace!("stream ended");
270 |     };
271 |     Ok(Box::pin(s))
272 | }
273 | 
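// A hedged round-trip sketch of the cache encoding used in adapt_caching
// above: extracted text is stored zstd-compressed, and a cache hit is served
// back as ZstdDecoder::new(Cursor::new(blob)). This uses the same
// async_compression types as the code above, plus its bufread ZstdEncoder.
#[cfg(test)]
mod cache_codec_sketch {
    use async_compression::tokio::bufread::{ZstdDecoder, ZstdEncoder};
    use std::io::Cursor;
    use tokio::io::AsyncReadExt;

    #[tokio::test]
    async fn zstd_round_trip() -> std::io::Result<()> {
        let mut compressed = Vec::new();
        ZstdEncoder::new(&b"adapted text"[..])
            .read_to_end(&mut compressed)
            .await?;
        let mut out = Vec::new();
        ZstdDecoder::new(Cursor::new(compressed))
            .read_to_end(&mut out)
            .await?;
        assert_eq!(out, b"adapted text");
        Ok(())
    }
}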
--------------------------------------------------------------------------------
/src/preproc_cache.rs:
--------------------------------------------------------------------------------
1 | use crate::{adapters::FileAdapter, preproc::ActiveAdapters};
2 | use anyhow::{Context, Result};
3 | use log::warn;
4 | use path_clean::PathClean;
5 | use rusqlite::{OptionalExtension, named_params};
6 | use std::{path::Path, time::UNIX_EPOCH};
7 | use tokio_rusqlite::Connection;
8 | 
9 | static SCHEMA_VERSION: i32 = 3;
10 | #[derive(Clone)]
11 | pub struct CacheKey {
12 |     config_hash: String,
13 |     adapter: String,
14 |     adapter_version: i32,
15 |     active_adapters: String,
16 |     file_path: String,
17 |     file_mtime_unix_ms: i64,
18 | }
19 | impl CacheKey {
20 |     pub fn new(
21 |         postprocess: bool,
22 |         filepath_hint: &Path,
23 |         adapter: &dyn FileAdapter,
24 |         active_adapters: &ActiveAdapters,
25 |     ) -> Result<CacheKey> {
26 |         let meta = std::fs::metadata(filepath_hint)
27 |             .with_context(|| format!("reading metadata for {}", filepath_hint.to_string_lossy()))?;
28 |         let modified = meta.modified().expect("weird OS that can't into mtime");
29 |         let file_mtime_unix_ms = modified.duration_since(UNIX_EPOCH)?.as_millis() as i64;
30 |         let active_adapters = if adapter.metadata().recurses {
31 |             serde_json::to_string(
32 |                 &active_adapters
33 |                     .iter()
34 |                     .map(|a| format!("{}.v{}", a.metadata().name, a.metadata().version))
35 |                     .collect::<Vec<_>>(),
36 |             )?
37 |         } else {
38 |             "null".to_string()
39 |         };
40 |         Ok(Self {
41 |             config_hash: if postprocess {
42 |                 "a41e2e9".to_string()
43 |             } else {
44 |                 "f1502a3".to_string()
45 |             }, // todo: when we add more config options that affect caching, create a struct and actually hash it
46 |             adapter: adapter.metadata().name.clone(),
47 |             adapter_version: adapter.metadata().version,
48 |             file_path: filepath_hint.clean().to_string_lossy().to_string(),
49 |             file_mtime_unix_ms,
50 |             active_adapters,
51 |         })
52 |     }
53 | }
54 | 
55 | #[async_trait::async_trait]
56 | pub trait PreprocCache {
57 |     async fn get(&self, key: &CacheKey) -> Result<Option<Vec<u8>>>;
58 |     async fn set(&mut self, key: &CacheKey, value: Vec<u8>) -> Result<()>;
59 | }
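// A sketch of the active_adapters fingerprint computed in CacheKey::new
// above: for recursing adapters the key embeds every active adapter name and
// version, so changing the adapter set invalidates recursive cache entries.
// The adapter names and versions below are illustrative.
#[cfg(test)]
mod fingerprint_sketch {
    #[test]
    fn fingerprint_format() -> anyhow::Result<()> {
        let names = vec![format!("{}.v{}", "zip", 1), format!("{}.v{}", "tar", 1)];
        assert_eq!(serde_json::to_string(&names)?, r#"["zip.v1","tar.v1"]"#);
        Ok(())
    }
}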
60 | 
61 | async fn connect_pragmas(db: &Connection) -> Result<()> {
62 |     // https://phiresky.github.io/blog/2020/sqlite-performance-tuning/
63 |     //let want_page_size = 32768;
64 |     //db.execute(&format!("pragma page_size = {};", want_page_size))
65 |     //    .context("setup pragma 1")?;
66 |     db.call(|db| {
67 |         // db.busy_timeout(Duration::from_secs(10))?;
68 |         db.pragma_update(None, "journal_mode", "wal")?;
69 |         db.pragma_update(None, "foreign_keys", "on")?;
70 |         db.pragma_update(None, "temp_store", "memory")?;
71 |         db.pragma_update(None, "synchronous", "off")?; // integrity isn't very important here
72 |         db.pragma_update(None, "mmap_size", "2000000000")?;
73 |         db.execute("
74 |             create table if not exists preproc_cache (
75 |                 config_hash text not null,
76 |                 adapter text not null,
77 |                 adapter_version integer not null,
78 |                 created_unix_ms integer not null default (unixepoch() * 1000),
79 |                 active_adapters text not null, -- 'null' if adapter cannot recurse
80 |                 file_path text not null,
81 |                 file_mtime_unix_ms integer not null,
82 |                 text_content_zstd blob not null
83 |             ) strict", []
84 |         )?;
85 | 
86 |         db.execute("create unique index if not exists preproc_cache_idx on preproc_cache (config_hash, adapter, adapter_version, file_path, active_adapters)", [])?;
87 | 
88 |         Ok(())
89 |     })
90 |     .await.context("connect_pragmas")?;
91 |     let jm: i64 = db
92 |         .call(|db| Ok(db.pragma_query_value(None, "application_id", |r| r.get(0))?))
93 |         .await?;
94 |     if jm != 924716026 {
95 |         // (probably) newly created db
96 |         db.call(|db| Ok(db.pragma_update(None, "application_id", "924716026")?))
97 |             .await?;
98 |     }
99 |     Ok(())
100 | }
101 | 
102 | struct SqliteCache {
103 |     db: Connection,
104 | }
105 | impl SqliteCache {
106 |     async fn new(path: &Path) -> Result<SqliteCache> {
107 |         let db = Connection::open(path.join("cache.sqlite3")).await?;
108 |         db.call(|db| {
109 |             let schema_version: i32 = db.pragma_query_value(None, "user_version", |r| r.get(0))?;
110 |             if schema_version != SCHEMA_VERSION {
111 |                 warn!("Cache schema version mismatch, clearing cache");
112 |                 db.execute("drop table if exists preproc_cache", [])?;
113 |                 db.pragma_update(None, "user_version", format!("{SCHEMA_VERSION}"))?;
114 |             }
115 |             Ok(())
116 |         })
117 |         .await?;
118 | 
119 |         connect_pragmas(&db).await?;
120 | 
121 |         Ok(Self { db })
122 |     }
123 | }
124 | 
125 | #[async_trait::async_trait]
126 | impl PreprocCache for SqliteCache {
127 |     async fn get(&self, key: &CacheKey) -> Result<Option<Vec<u8>>> {
128 |         let key = (*key).clone(); // todo: without cloning
129 |         Ok(self
130 |             .db
131 |             .call(move |db| {
132 |                 Ok(db
133 |                     .query_row(
134 |                         "select text_content_zstd from preproc_cache where
135 |                             adapter = :adapter
136 |                             and config_hash = :config_hash
137 |                             and adapter_version = :adapter_version
138 |                             and active_adapters = :active_adapters
139 |                             and file_path = :file_path
140 |                             and file_mtime_unix_ms = :file_mtime_unix_ms
141 |                         ",
142 |                         named_params! {
143 |                             ":config_hash": &key.config_hash,
144 |                             ":adapter": &key.adapter,
145 |                             ":adapter_version": &key.adapter_version,
146 |                             ":active_adapters": &key.active_adapters,
147 |                             ":file_path": &key.file_path,
148 |                             ":file_mtime_unix_ms": &key.file_mtime_unix_ms
149 |                         },
150 |                         |r| r.get::<_, Vec<u8>>(0),
151 |                     )
152 |                     .optional()?)
153 |             })
154 |             .await
155 |             .context("reading from cache")?)
156 |     }
157 | 
158 |     async fn set(&mut self, key: &CacheKey, value: Vec<u8>) -> Result<()> {
159 |         let key = (*key).clone(); // todo: without cloning
160 |         log::trace!(
161 |             "Writing to cache: {}, {}, {} bytes",
162 |             key.adapter,
163 |             key.file_path,
164 |             value.len()
165 |         );
166 |         Ok(self
167 |             .db
168 |             .call(move |db| {
169 |                 db.execute(
170 |                     "insert into preproc_cache (config_hash, adapter, adapter_version, active_adapters, file_path, file_mtime_unix_ms, text_content_zstd) values
171 |                         (:config_hash, :adapter, :adapter_version, :active_adapters, :file_path, :file_mtime_unix_ms, :text_content_zstd)
172 |                         on conflict (config_hash, adapter, adapter_version, active_adapters, file_path) do update set
173 |                             file_mtime_unix_ms = :file_mtime_unix_ms,
174 |                             created_unix_ms = unixepoch() * 1000,
175 |                             text_content_zstd = :text_content_zstd",
176 |                     named_params! {
177 |                         ":config_hash": &key.config_hash,
178 |                         ":adapter": &key.adapter,
179 |                         ":adapter_version": &key.adapter_version,
180 |                         ":active_adapters": &key.active_adapters,
181 |                         ":file_path": &key.file_path,
182 |                         ":file_mtime_unix_ms": &key.file_mtime_unix_ms,
183 |                         ":text_content_zstd": value
184 |                     })?;
185 |                 Ok(())
186 |             })
187 |             .await?)
188 |     }
189 | }
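// A hedged sketch of the cache API round-trip, using open_cache_db defined
// just below. The key fields are filled with made-up values here; real keys
// come from CacheKey::new above. The private fields are reachable because
// this test module is a child of the file's module.
#[cfg(test)]
mod roundtrip_sketch {
    use super::*;

    fn dummy_key() -> CacheKey {
        CacheKey {
            config_hash: "f1502a3".to_string(),
            adapter: "poppler".to_string(),
            adapter_version: 1,
            active_adapters: "null".to_string(),
            file_path: "/tmp/short.pdf".to_string(),
            file_mtime_unix_ms: 0,
        }
    }

    #[tokio::test]
    async fn set_then_get() -> Result<()> {
        let dir = tempfile::tempdir()?;
        let mut cache = open_cache_db(dir.path()).await?;
        cache.set(&dummy_key(), b"zstd blob".to_vec()).await?;
        assert_eq!(cache.get(&dummy_key()).await?, Some(b"zstd blob".to_vec()));
        Ok(())
    }
}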
190 | /// opens a default cache
191 | pub async fn open_cache_db(path: &Path) -> Result<impl PreprocCache> {
192 |     std::fs::create_dir_all(path)?;
193 |     SqliteCache::new(path).await
194 | }
195 | 
196 | #[cfg(test)]
197 | mod test {
198 | 
199 |     use crate::preproc_cache::*;
200 | 
201 |     #[tokio::test]
202 |     async fn test_read_write() -> anyhow::Result<()> {
203 |         let path = tempfile::tempdir()?;
204 |         let _db = open_cache_db(&path.path().join("foo.sqlite3")).await?;
205 |         // db.set();
206 |         Ok(())
207 |     }
208 | }
209 | 
--------------------------------------------------------------------------------
/src/recurse.rs:
--------------------------------------------------------------------------------
1 | use tokio_util::io::{ReaderStream, StreamReader};
2 | 
3 | use crate::{adapted_iter::AdaptedFilesIterBox, adapters::*, to_io_err};
4 | use async_stream::stream;
5 | 
6 | pub fn concat_read_streams(input: AdaptedFilesIterBox) -> ReadBox {
7 |     let s = stream! {
8 |         for await output in input {
9 |             let o = output.map_err(to_io_err)?.inp;
10 |             let stream = ReaderStream::new(o);
11 |             for await bytes in stream {
12 |                 yield bytes;
13 |             }
14 |         }
15 |     };
16 |     Box::pin(StreamReader::new(s))
17 | }
18 | 
--------------------------------------------------------------------------------
/src/test_utils.rs:
--------------------------------------------------------------------------------
1 | use crate::{
2 |     adapted_iter::AdaptedFilesIterBox,
3 |     adapters::{
4 |         AdaptInfo, ReadBox,
5 |         custom::{BUILTIN_SPAWNING_ADAPTERS, CustomSpawningFileAdapter},
6 |     },
7 |     config::RgaConfig,
8 |     matching::{FastFileMatcher, FileMatcher},
9 |     recurse::concat_read_streams,
10 | };
11 | use anyhow::Result;
12 | use std::path::{Path, PathBuf};
13 | use tokio::{fs::File, io::AsyncReadExt};
14 | 
15 | pub use pretty_assertions::{assert_eq, assert_ne};
16 | pub fn test_data_dir() -> PathBuf {
17 |     let mut d = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
18 |     d.push("exampledir/test/");
19 |     d
20 | }
21 | 
22 | pub async fn simple_fs_adapt_info(filepath: &Path) -> Result<(AdaptInfo, FileMatcher)> {
23 |     Ok(simple_adapt_info_full(
24 |         filepath,
25 |         Box::pin(File::open(filepath).await?),
26 |         true,
27 |     ))
28 | }
29 | pub fn simple_adapt_info(filepath: &Path, inp: ReadBox) -> (AdaptInfo, FileMatcher) {
30 |     simple_adapt_info_full(filepath, inp, false)
31 | }
32 | 
33 | pub fn simple_adapt_info_full(
34 |     filepath: &Path,
35 |     inp: ReadBox,
36 |     is_real_file: bool,
37 | ) -> (AdaptInfo, FileMatcher) {
38 |     (
39 |         AdaptInfo {
40 |             filepath_hint: filepath.to_owned(),
41 |             is_real_file,
42 |             archive_recursion_depth: 0,
43 |             inp,
44 |             line_prefix: "PREFIX:".to_string(),
45 |             config: RgaConfig::default(),
46 |             postprocess: true,
47 |         },
48 |         FastFileMatcher::FileExtension(
49 |             filepath
50 |                 .extension()
51 |                 .unwrap_or_default()
52 |                 .to_string_lossy()
53 |                 .into_owned(),
54 |         )
55 |         .into(),
56 |     )
57 | }
58 | 
59 | pub async fn adapted_to_vec(adapted: AdaptedFilesIterBox) -> Result<Vec<u8>> {
60 |     let mut res = concat_read_streams(adapted);
61 | 
62 |     let mut buf = Vec::new();
63 |     res.read_to_end(&mut buf).await?;
64 |     Ok(buf)
65 | }
66 | 
67 | pub fn poppler_adapter() -> CustomSpawningFileAdapter {
68 |     let adapter = BUILTIN_SPAWNING_ADAPTERS
69 |         .iter()
70 |         .find(|e| e.name == "poppler")
71 |         .expect("no poppler adapter");
72 | 
73 |     adapter.to_adapter()
74 | }
75 | 
76 | #[cfg(test)]
77 | pub fn init_logging() {
78 |     let _ = env_logger::builder().is_test(true).try_init();
79 | }
80 | 
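// A hedged end-to-end sketch tying these helpers together: open a fixture,
// run it through the poppler adapter, and collect the output. It assumes the
// pdftotext binary is available (as the crate's own poppler tests do) and
// only checks that some text comes out, rather than asserting exact output.
#[cfg(test)]
mod helpers_sketch {
    use super::*;
    use crate::adapters::FileAdapter;

    #[tokio::test]
    async fn poppler_extracts_some_text() -> Result<()> {
        let filepath = test_data_dir().join("short.pdf");
        let (ai, matcher) = simple_fs_adapt_info(&filepath).await?;
        let files = poppler_adapter().adapt(ai, &matcher).await?;
        let text = adapted_to_vec(files).await?;
        assert!(!text.is_empty());
        Ok(())
    }
}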
--------------------------------------------------------------------------------