├── .cargo └── audit.toml ├── .envrc ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md └── workflows │ ├── ci.yml │ └── release.yml ├── .gitignore ├── .typos.toml ├── .vscode └── launch.json ├── CHANGELOG.md ├── Cargo.lock ├── Cargo.toml ├── LICENSE.md ├── README.md ├── ci ├── macos-install-packages └── ubuntu-install-packages ├── doc ├── config.default.jsonc ├── demodir.png ├── notes.md ├── rga-fzf.gif └── update-readme.sh ├── exampledir ├── decompress │ ├── test.log │ ├── test.log.bz2 │ ├── test.log.gz │ ├── test.log.xz │ ├── test.log.zst │ └── testlogbutwithoutextension ├── demo │ ├── greeting.mkv │ ├── hello.odt │ ├── hello.sqlite3 │ └── somearchive.zip ├── droste.zip ├── encoding │ ├── utf16le.txt │ ├── utf8.txt │ └── zip.tar.gz ├── exif.png ├── formatting.epub ├── mail_nested.eml ├── mail_pdf_attach.eml ├── screenshot.png ├── short.pdf ├── sqlitedb ├── tar │ ├── exampledir.tar.gz │ ├── test.tar │ ├── test.tar.bz2 │ └── test.tar.zip ├── test.djvu ├── test.zip ├── test │ ├── github_email.eml │ ├── hello.gz │ ├── hello.sqlite3 │ ├── hello.tar │ ├── mail_with_attachment.mbox │ ├── only-seek-zip.zip │ ├── short.pdf │ ├── short.pdf.gz │ ├── test.mbx │ └── twoblankpages.pdf ├── wasteland.docx ├── wasteland.epub ├── wasteland.fb2 ├── wasteland.mkv ├── wasteland.mobi ├── wasteland.odt └── wasteland.pdf ├── flake.lock ├── flake.nix ├── rust-toolchain.toml ├── rustfmt.toml └── src ├── adapted_iter.rs ├── adapters.rs ├── adapters ├── custom.rs ├── decompress.rs ├── ffmpeg.rs ├── mbox.rs ├── postproc.rs ├── sqlite.rs ├── tar.rs ├── writing.rs └── zip.rs ├── bin ├── rga-fzf-open.rs ├── rga-fzf.rs ├── rga-preproc.rs └── rga.rs ├── caching_writer.rs ├── config.rs ├── expand.rs ├── lib.rs ├── matching.rs ├── preproc.rs ├── preproc_cache.rs ├── recurse.rs └── test_utils.rs /.cargo/audit.toml: -------------------------------------------------------------------------------- 1 | [yanked] 2 | enabled = false # doesn't work in Nix sandbox 3 | update_index = false # crates.io index managed by Nix 4 | -------------------------------------------------------------------------------- /.envrc: -------------------------------------------------------------------------------- 1 | use flake 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | 12 | 13 | **To Reproduce** 14 | 15 | Attach example file: 16 | 17 | Run command: 18 | 19 | **Output** 20 | 21 | **Screenshots** 22 | If applicable, add screenshots to help explain your problem. 23 | 24 | **Operating System and Version** 25 | 26 | 27 | **Output of `rga --version`** 28 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 
15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | # Based on https://github.com/actions-rs/meta/blob/master/recipes/quickstart.md 2 | # 3 | # While our "example" application has platform-specific code, 4 | # for simplicity we are compiling and testing everything in a nix-on-Linux environment only. 5 | 6 | on: [push, pull_request] 7 | 8 | name: ci 9 | 10 | jobs: 11 | nix-flake-check: 12 | name: nix flake check 13 | runs-on: ubuntu-latest 14 | steps: 15 | - name: Checkout sources 16 | uses: actions/checkout@v4 17 | 18 | - name: Install nix 19 | uses: cachix/install-nix-action@v21 20 | 21 | - name: Ensure the build succeeds 22 | run: nix build 23 | 24 | - name: Run `nix flake check` to run formatters, linters, and tests 25 | run: nix flake check --print-build-logs 26 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | # adapted from https://github.com/BurntSushi/ripgrep/blob/master/.github/workflows/release.yml 2 | # The way this works is a little weird. But basically, the create-release job 3 | # runs purely to initialize the GitHub release itself. Once done, the upload 4 | # URL of the release is saved as an artifact. 5 | # 6 | # The build-release job runs only once create-release is finished. It gets 7 | # the release upload URL by downloading the corresponding artifact (which was 8 | # uploaded by create-release). It then builds the release executables for each 9 | # supported platform and attaches them as release assets to the previously 10 | # created release. 11 | # 12 | # The key here is that we create the release only once. 13 | 14 | name: release 15 | on: 16 | push: 17 | # Enable when testing release infrastructure on a branch. 18 | # branches: 19 | # - ag/release 20 | tags: 21 | - "v[0-9]+.[0-9]+.[0-9]+*" 22 | jobs: 23 | create-release: 24 | permissions: write-all 25 | name: create-release 26 | runs-on: ubuntu-latest 27 | # env: 28 | # Set to force version number, e.g., when no tag exists. 29 | # RG_VERSION: TEST-0.0.0 30 | steps: 31 | - name: Create artifacts directory 32 | run: mkdir artifacts 33 | 34 | - name: Get the release version from the tag 35 | if: env.RG_VERSION == '' 36 | run: | 37 | # Apparently, this is the right way to get a tag name. Really? 
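# (Illustrative: pushing a tag like v0.10.9 sets GITHUB_REF=refs/tags/v0.10.9;
# the ${GITHUB_REF#refs/tags/} parameter expansion below strips the
# refs/tags/ prefix, leaving RG_VERSION=v0.10.9.)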
38 | # 39 | # See: https://github.community/t5/GitHub-Actions/How-to-get-just-the-tag-name/m-p/32167/highlight/true#M1027 40 | echo "RG_VERSION=${GITHUB_REF#refs/tags/}" >> $GITHUB_ENV 41 | echo "version is: ${{ env.RG_VERSION }}" 42 | 43 | - name: Create GitHub release 44 | id: release 45 | uses: actions/create-release@v1 46 | env: 47 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 48 | with: 49 | tag_name: ${{ env.RG_VERSION }} 50 | release_name: ${{ env.RG_VERSION }} 51 | 52 | - name: Save release upload URL to artifact 53 | run: echo "${{ steps.release.outputs.upload_url }}" > artifacts/release-upload-url 54 | 55 | - name: Save version number to artifact 56 | run: echo "${{ env.RG_VERSION }}" > artifacts/release-version 57 | 58 | - name: Upload artifacts 59 | uses: actions/upload-artifact@v4 60 | with: 61 | name: artifacts 62 | path: artifacts 63 | 64 | build-release: 65 | name: build-release 66 | needs: ["create-release"] 67 | runs-on: ${{ matrix.os }} 68 | env: 69 | # For some builds, we use cross to test on 32-bit and big-endian 70 | # systems. 71 | CARGO: cargo 72 | # When CARGO is set to CROSS, this is set to `--target matrix.target`. 73 | TARGET_FLAGS: 74 | # When CARGO is set to CROSS, TARGET_DIR includes matrix.target. 75 | TARGET_DIR: ./target 76 | # Emit backtraces on panics. 77 | RUST_BACKTRACE: 1 78 | strategy: 79 | matrix: 80 | build: [linux, linux-arm, macos, win-msvc] 81 | include: 82 | - build: linux 83 | os: ubuntu-22.04 84 | rust: nightly 85 | target: x86_64-unknown-linux-musl 86 | - build: linux-arm 87 | os: ubuntu-22.04 88 | rust: nightly 89 | target: arm-unknown-linux-gnueabihf 90 | - build: macos 91 | os: macos-latest 92 | rust: nightly 93 | target: x86_64-apple-darwin 94 | - build: win-msvc 95 | os: windows-2019 96 | rust: nightly 97 | target: x86_64-pc-windows-msvc 98 | #- build: win-gnu 99 | # os: windows-2019 100 | # rust: nightly-x86_64-gnu 101 | # target: x86_64-pc-windows-gnu 102 | 103 | steps: 104 | - name: Checkout repository 105 | uses: actions/checkout@v4 106 | with: 107 | fetch-depth: 1 108 | 109 | - name: Install packages (Ubuntu) 110 | if: matrix.os == 'ubuntu-22.04' 111 | run: | 112 | ci/ubuntu-install-packages 113 | 114 | - name: Install packages (macOS) 115 | if: matrix.os == 'macos-latest' 116 | run: | 117 | ci/macos-install-packages 118 | 119 | - name: Install Rust 120 | uses: actions-rs/toolchain@v1 121 | with: 122 | toolchain: ${{ matrix.rust }} 123 | profile: minimal 124 | override: true 125 | target: ${{ matrix.target }} 126 | 127 | - name: Use Cross 128 | shell: bash 129 | run: | 130 | cargo install cross 131 | echo "CARGO=cross" >> $GITHUB_ENV 132 | echo "TARGET_FLAGS=--target ${{ matrix.target }}" >> $GITHUB_ENV 133 | echo "TARGET_DIR=./target/${{ matrix.target }}" >> $GITHUB_ENV 134 | 135 | - name: Show command used for Cargo 136 | run: | 137 | echo "cargo command is: ${{ env.CARGO }}" 138 | echo "target flag is: ${{ env.TARGET_FLAGS }}" 139 | echo "target dir is: ${{ env.TARGET_DIR }}" 140 | 141 | - name: Get release download URL 142 | uses: actions/download-artifact@v4 143 | with: 144 | name: artifacts 145 | path: artifacts 146 | 147 | - name: Set release upload URL and release version 148 | shell: bash 149 | run: | 150 | echo "RELEASE_UPLOAD_URL=$(cat artifacts/release-upload-url)" >> $GITHUB_ENV 151 | echo "release upload url: $RELEASE_UPLOAD_URL" 152 | echo "RELEASE_VERSION=$(cat artifacts/release-version)" >> $GITHUB_ENV 153 | echo "release version: $RELEASE_VERSION" 154 | 155 | - name: Build release binary 156 | run: ${{ env.CARGO }} 
build --verbose --release ${{ env.TARGET_FLAGS }} 157 | 158 | - name: Strip release binary (linux and macos) 159 | if: matrix.build == 'linux' || matrix.build == 'macos' 160 | run: | 161 | strip "target/${{ matrix.target }}/release/rga" \ 162 | "target/${{ matrix.target }}/release/rga-preproc" \ 163 | "target/${{ matrix.target }}/release/rga-fzf" \ 164 | "target/${{ matrix.target }}/release/rga-fzf-open" 165 | 166 | - name: Strip release binary (arm) 167 | if: matrix.build == 'linux-arm' 168 | run: | 169 | docker run --rm -v \ 170 | "$PWD/target:/target:Z" \ 171 | rustembedded/cross:arm-unknown-linux-gnueabihf \ 172 | arm-linux-gnueabihf-strip \ 173 | /target/arm-unknown-linux-gnueabihf/release/rga \ 174 | /target/arm-unknown-linux-gnueabihf/release/rga-preproc \ 175 | /target/arm-unknown-linux-gnueabihf/release/rga-fzf \ 176 | /target/arm-unknown-linux-gnueabihf/release/rga-fzf-open 177 | 178 | - name: Build archive 179 | shell: bash 180 | run: | 181 | staging="ripgrep_all-${{ env.RELEASE_VERSION }}-${{ matrix.target }}" 182 | mkdir -p "$staging"/doc 183 | 184 | cp {README.md,LICENSE.md} "$staging/" 185 | cp CHANGELOG.md "$staging/doc/" 186 | 187 | if [ "${{ matrix.os }}" = "windows-2019" ]; then 188 | cp "target/${{ matrix.target }}/release/rga.exe" "$staging/" 189 | cp "target/${{ matrix.target }}/release/rga-preproc.exe" "$staging/" 190 | cp "target/${{ matrix.target }}/release/rga-fzf.exe" "$staging/" 191 | cp "target/${{ matrix.target }}/release/rga-fzf-open.exe" "$staging/" 192 | 7z a "$staging.zip" "$staging" 193 | echo "ASSET=$staging.zip" >> $GITHUB_ENV 194 | else 195 | cp "target/${{ matrix.target }}/release/rga" "$staging/" 196 | cp "target/${{ matrix.target }}/release/rga-preproc" "$staging/" 197 | cp "target/${{ matrix.target }}/release/rga-fzf" "$staging/" 198 | cp "target/${{ matrix.target }}/release/rga-fzf-open" "$staging/" 199 | tar czf "$staging.tar.gz" "$staging" 200 | echo "ASSET=$staging.tar.gz" >> $GITHUB_ENV 201 | fi 202 | 203 | - name: Upload release archive 204 | uses: actions/upload-release-asset@v1.0.1 205 | env: 206 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 207 | with: 208 | upload_url: ${{ env.RELEASE_UPLOAD_URL }} 209 | asset_path: ${{ env.ASSET }} 210 | asset_name: ${{ env.ASSET }} 211 | asset_content_type: application/octet-stream 212 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /result 2 | /target 3 | /exampledir.2 4 | /.idea 5 | /.pre-commit-config.yaml 6 | /.vscode/settings.json 7 | **/*.rs.bk 8 | -------------------------------------------------------------------------------- /.typos.toml: -------------------------------------------------------------------------------- 1 | [default.extend-words] 2 | als = "als" 3 | 4 | [files] 5 | extend-exclude = ["exampledir/*"] 6 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 
4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "type": "lldb", 9 | "request": "attach", 10 | "name": "Attach", 11 | "program": "${workspaceFolder}/target/release/rga-preproc" 12 | }, 13 | { 14 | "type": "lldb", 15 | "request": "launch", 16 | "name": "Debug unit tests in library 'rga'", 17 | "cargo": { 18 | "args": ["test", "--no-run", "--lib", "--package=rga"], 19 | "filter": { 20 | "name": "rga", 21 | "kind": "lib" 22 | } 23 | }, 24 | "args": [], 25 | "cwd": "${workspaceFolder}" 26 | }, 27 | { 28 | "type": "lldb", 29 | "request": "launch", 30 | "name": "Debug executable 'rga'", 31 | "cargo": { 32 | "args": ["build", "--bin=rga"], 33 | "filter": { 34 | "name": "rga", 35 | "kind": "bin" 36 | } 37 | }, 38 | "args": [], 39 | "cwd": "${workspaceFolder}" 40 | }, 41 | { 42 | "type": "lldb", 43 | "request": "launch", 44 | "name": "Debug unit tests in executable 'rga'", 45 | "cargo": { 46 | "args": ["test", "--no-run", "--bin=rga", "--package=ripgrep-all"], 47 | "filter": { 48 | "name": "rga", 49 | "kind": "bin" 50 | } 51 | }, 52 | "args": [], 53 | "cwd": "${workspaceFolder}" 54 | }, 55 | { 56 | "type": "lldb", 57 | "request": "launch", 58 | "name": "Debug executable 'rga-preproc'", 59 | "cargo": { 60 | "args": ["build", "--bin=rga-preproc"], 61 | "filter": { 62 | "name": "rga-preproc", 63 | "kind": "bin" 64 | } 65 | }, 66 | "args": ["exampledir/tar/test.tar.bz2"], 67 | "cwd": "${workspaceFolder}" 68 | }, 69 | { 70 | "type": "lldb", 71 | "request": "launch", 72 | "name": "Debug unit tests in executable 'rga-preproc'", 73 | "cargo": { 74 | "args": ["test", "--no-run", "--bin=rga-preproc", "--package=rga"], 75 | "filter": { 76 | "name": "rga-preproc", 77 | "kind": "bin" 78 | } 79 | }, 80 | "args": [], 81 | "cwd": "${workspaceFolder}" 82 | } 83 | ] 84 | } 85 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # 0.10.5 (2024-01-16) 2 | 3 | - return the same exit status as rg 4 | 5 | # 0.10.4 (2024-01-16) 6 | 7 | - add `--rga-no-prefix-filenames` flag (https://github.com/phiresky/ripgrep-all/issues/154) 8 | 9 | # 0.10.3 (2024-01-15) 10 | 11 | This was originally supposed to be version 1.0.0, but I don't feel confident enough in the stability to call it that. 12 | 13 | Highlights: 14 | 15 | - rga is now configurable via a config file (~/.config/ripgrep-all/config.jsonc) that is generated on first use, including schema. 16 | - Custom subprocess-spawning adapters can be defined via config file. 
See https://github.com/phiresky/ripgrep-all/wiki
17 | - External adapters can be shared with the community at https://github.com/phiresky/ripgrep-all/discussions
18 |
19 | Others:
20 |
21 | - mbox adapter (@FliegendeWurst https://github.com/phiresky/ripgrep-all/pull/104)
22 | - auto-generate parts of the readme
23 | - add loads of debug logs and performance timings when `--debug` is used
24 | - better error messages via `anyhow`
25 | - add cross-platform rga-fzf binary
26 | - change whole code base to be async
27 | - change adapter interface from `(&Read, &Write) -> ()` to `AsyncRead -> AsyncRead` to allow chaining of adapters
28 |
29 | # 0.9.6 (2020-05-19)
30 |
31 | - Fix windows builds
32 | - Case-insensitive file extension matching
33 | - Move to GitHub Actions instead of Travis
34 | - Fix searching for words that are hyphenated in PDFs (#44)
35 | - Always load the rga-preproc binary from the location where rga is
36 |
37 | # 0.9.5 (2020-04-08)
38 |
39 | - Allow search in pdf files without extension (https://github.com/phiresky/ripgrep-all/issues/39)
40 | - Prefer shipped binaries to system-installed ones (https://github.com/phiresky/ripgrep-all/issues/32)
41 | - Upgrade dependencies
42 |
43 | # 0.9.3 (2019-09-19)
44 |
45 | - Fix compilation on new Rust by updating rusqlite ([#25](https://github.com/phiresky/ripgrep-all/pull/25))
46 |
47 | # 0.9.2 (2019-06-17)
48 |
49 | - Fix file ending regex ([#13](https://github.com/phiresky/ripgrep-all/issues/13))
50 | - Fix decoding of UTF16 with BOM ([#5](https://github.com/phiresky/ripgrep-all/issues/5))
51 | - Shorten the output on failure to two lines (https://github.com/phiresky/ripgrep-all/issues/7); you can use `--no-messages` to completely suppress errors.
52 | - Better installation instructions in readme for each OS
53 | - Add windows binaries! Including all dependencies!
54 |
55 | # 0.9.1 (2019-06-16)
56 |
57 | - Add enabled adapters to cache key if caching for archive
58 | - Prevent empty trailing page output in pdf reader
59 |
60 | # 0.9.0 (2019-06-16)
61 |
62 | - Split the decompress and tar adapters so we can also read pure .bz2 files etc.
63 | - Add mime type detection to decompress so we can read e.g. /boot/initramfs.img, which is a bz2 file without a file extension
64 |
65 | # 0.8.9 (2019-06-15)
66 |
67 | - Finally fix linux binary package
68 | - add readme to crates.io
69 |
70 | # 0.8.7 (2019-06-15)
71 |
72 | Minor fixes
73 |
74 | - Correctly wrap help text
75 | - Show own help when no arguments given
76 | - Hopefully package the rga binary correctly
77 |
78 | # 0.8.5
79 |
80 | previous changes not documented
81 |
-------------------------------------------------------------------------------- /Cargo.toml: --------------------------------------------------------------------------------
1 | [package]
2 | authors = ["phiresky "]
3 | description = "rga: ripgrep, but also search in PDFs, E-Books, Office documents, zip, tar.gz, etc."
4 | edition = "2024" 5 | exclude = [ 6 | "exampledir/*", 7 | ] 8 | homepage = "https://github.com/phiresky/ripgrep-all" 9 | license = "AGPL-3.0-or-later" 10 | name = "ripgrep_all" 11 | readme = "README.md" 12 | repository = "https://github.com/phiresky/ripgrep-all" 13 | version = "0.10.9" 14 | 15 | [features] 16 | default = ["perf-literal"] 17 | perf-literal = ["regex/perf-literal"] 18 | 19 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 20 | 21 | [dependencies] 22 | anyhow = {version = "1.0.71", features = ["backtrace"]} 23 | async-compression = { version = "0.4.0", features = ["all", "all-algorithms", "tokio"] } 24 | async-stream = "0.3.5" 25 | async-trait = "0.1.68" 26 | async_zip = {version = "0.0.12", features = ["full"]} 27 | bincode = "1.3.3" 28 | bytes = "1.4.0" 29 | clap = {version = "4.3.0", features = ["wrap_help"]} 30 | crossbeam = "0.8.2" 31 | crossbeam-channel = "0.5.8" 32 | derive_more = "0.99.17" 33 | directories-next = "2.0.0" 34 | dyn-clonable = "0.9.0" 35 | dyn-clone = "1.0.11" 36 | encoding_rs = "0.8.32" 37 | encoding_rs_io = "0.1.7" 38 | env_logger = "0.10.0" 39 | glob = "0.3.1" 40 | json_comments = "0.2.1" 41 | lazy_static = "1.4.0" 42 | log = "0.4.17" 43 | mailparse = "0.14.0" 44 | memchr = "2.5.0" 45 | mime2ext = "0.1.52" 46 | open = "5" 47 | paste = "1.0.12" 48 | path-clean = "1.0.1" 49 | pretty-bytes = "0.2.2" 50 | regex = "1.8.2" 51 | rusqlite = {version = "0.30.0", features = ["vtab", "bundled"]} 52 | schemars = {version = "0.8.12", features = ["preserve_order"]} 53 | serde = {version = "1.0.163", features = ["derive"]} 54 | serde_json = "1.0.96" 55 | size_format = "1.0.2" 56 | structopt = "0.3.26" 57 | tempfile = "3.5.0" 58 | tokio = {version = "1.28.1", features = ["full"]} 59 | tokio-rusqlite = "0.5.0" 60 | tokio-stream = {version = "0.1.14", features = ["io-util", "tokio-util"]} 61 | astral-tokio-tar = "0.5.1" 62 | tokio-util = {version = "0.7.8", features = ["io", "full"]} 63 | tree_magic = {package = "tree_magic_mini", version = "3.0.3"} 64 | 65 | [dev-dependencies] 66 | async-recursion = "1.0.4" 67 | ctor = "0.2.0" 68 | pretty_assertions = "1.3.0" 69 | tempfile = "3.5.0" 70 | tokio-test = "0.4.2" 71 | 72 | [profile.release] 73 | debug = true 74 | lto = "thin" 75 | split-debuginfo = "packed" 76 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # rga: ripgrep, but also search in PDFs, E-Books, Office documents, zip, tar.gz, etc. 2 | 3 | rga is a line-oriented search tool that allows you to look for a regex in a multitude of file types. rga wraps the awesome [ripgrep] and enables it to search in pdf, docx, sqlite, jpg, movie subtitles (mkv, mp4), etc. 4 | 5 | [ripgrep]: https://github.com/BurntSushi/ripgrep 6 | 7 | [![github repo](https://img.shields.io/badge/repo-github.com%2Fphiresky%2Fripgrep--all-informational.svg)](https://github.com/phiresky/ripgrep-all) 8 | [![Crates.io](https://img.shields.io/crates/v/ripgrep-all.svg)](https://crates.io/crates/ripgrep-all) 9 | [![fearless concurrency](https://img.shields.io/badge/concurrency-fearless-success.svg)](https://www.reddit.com/r/rustjerk/top/?sort=top&t=all) 10 | 11 | For more detail, see this introductory blogpost: https://phiresky.github.io/blog/2019/rga--ripgrep-for-zip-targz-docx-odt-epub-jpg/ 12 | 13 | rga will recursively descend into archives and match text in every file type it knows. 
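rga accepts the same pattern and path arguments as plain ripgrep; everything it does not recognize is passed straight through to rg (see the USAGE section below). As an illustrative sketch (the pattern and path here are placeholders):

```
# searches recursively, including inside PDFs, archives, subtitles, ...
rga "fearless concurrency" my-documents/
```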
14 |
15 | Here is an [example directory](https://github.com/phiresky/ripgrep-all/tree/master/exampledir/demo) with different file types:
16 |
17 | ```
18 | demo/
19 | ├── greeting.mkv
20 | ├── hello.odt
21 | ├── hello.sqlite3
22 | └── somearchive.zip
23 |     ├── dir
24 |     │   ├── greeting.docx
25 |     │   └── inner.tar.gz
26 |     │       └── greeting.pdf
27 |     └── greeting.epub
28 | ```
29 |
30 | ![rga output](doc/demodir.png)
31 |
32 | ## Integration with fzf
33 |
34 | ![rga-fzf](doc/rga-fzf.gif)
35 |
36 | See [the wiki](https://github.com/phiresky/ripgrep-all/wiki/fzf-Integration) for instructions on integrating rga with fzf.
37 |
38 | ## INSTALLATION
39 |
40 | Linux x64, macOS and Windows binaries are available [in GitHub Releases][latestrelease].
41 |
42 | [latestrelease]: https://github.com/phiresky/ripgrep-all/releases/latest
43 |
44 | ### Linux
45 |
46 | #### Arch Linux
47 |
48 | `pacman -S ripgrep-all`
49 |
50 | #### Gentoo Linux
51 |
52 | `emerge sys-apps/ripgrep-all`
53 |
54 | #### Nix
55 |
56 | `nix-env -iA nixpkgs.ripgrep-all`
57 |
58 | #### Debian-based
59 |
60 | Download the [rga binary][latestrelease] and install the dependencies like this:
61 |
62 | `apt install ripgrep pandoc poppler-utils ffmpeg`
63 |
64 | If ripgrep is not included in your package sources, get it from [here](https://github.com/BurntSushi/ripgrep/releases).
65 |
66 | rga will search for the binaries it calls in \$PATH and in the directory rga itself is located in.
67 |
68 | ### Windows
69 |
70 | Note that installing via [chocolatey](https://chocolatey.org/packages/ripgrep-all) or [scoop](https://github.com/ScoopInstaller/Main/blob/master/bucket/rga.json) is the only supported download method. If you download the binary from releases manually, you will not get the dependencies (for example pdftotext from poppler).
71 |
72 | If you get an error like `VCRUNTIME140.DLL could not be found`, you need to install [vc_redist.x64.exe](https://support.microsoft.com/en-us/help/2977003/the-latest-supported-visual-c-downloads).
73 |
74 | #### Chocolatey
75 |
76 | ```
77 | choco install ripgrep-all
78 | ```
79 |
80 | #### Scoop
81 |
82 | ```
83 | scoop install rga
84 | ```
85 |
86 | ### Homebrew/Linuxbrew
87 |
88 | `rga` can be installed with [Homebrew](https://formulae.brew.sh/formula/ripgrep-all#default):
89 |
90 | `brew install rga`
91 |
92 | To install the optional dependencies, which are not strictly necessary but very useful:
93 |
94 | `brew install pandoc poppler ffmpeg`
95 |
96 | ### MacPorts
97 |
98 | `rga` can also be installed on macOS via [MacPorts](https://ports.macports.org/port/ripgrep-all/):
99 |
100 | `sudo port install ripgrep-all`
101 |
102 | ### Compile from source
103 |
104 | rga should compile with stable Rust (v1.75.0+, check with `rustc --version`). To build it, run the following (or the equivalent for your OS):
105 |
106 | ```
107 | ~$ apt install build-essential pandoc poppler-utils ffmpeg ripgrep cargo
108 | ~$ cargo install --locked ripgrep_all
109 | ~$ rga --version # this should work now
110 | ```
111 |
112 | ## Available Adapters
113 |
114 | rga works with _adapters_ that adapt various file formats. It comes with a few built-in adapters, which you can list with:
115 |
116 | ```
117 | rga --rga-list-adapters
118 | ```
119 |
120 | You can also add **custom adapters**. See [the wiki](https://github.com/phiresky/ripgrep-all/wiki) for more information.
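For illustration, a custom adapter entry in `config.jsonc` looks roughly like the sketch below. This is a sketch only: the exact field names are defined by the config schema (print it with `rga --rga-print-config-schema`), and the `djvutxt` invocation is a hypothetical example, not a tested adapter:

```jsonc
{
  "custom_adapters": [
    {
      // name used to refer to the adapter, e.g. in --rga-adapters
      "name": "djvu",
      // schema version of this adapter definition (assumed field)
      "version": 1,
      // file extensions this adapter should handle
      "extensions": ["djvu"],
      // binary to spawn; it should write plain text to stdout
      "binary": "djvutxt",
      "args": []
    }
  ]
}
```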
121 |
122 |
123 |
124 | Adapters:
125 |
126 | - **pandoc**
127 | Uses pandoc to convert binary/unreadable text documents to plain markdown-like text
128 | Runs: pandoc --from= --to=plain --wrap=none --markdown-headings=atx
129 | Extensions: .epub, .odt, .docx, .fb2, .ipynb, .html, .htm
130 |
131 | - **poppler**
132 | Uses pdftotext (from poppler-utils) to extract plain text from PDF files
133 | Runs: pdftotext - -
134 | Extensions: .pdf
135 | Mime Types: application/pdf
136 |
137 | - **postprocpagebreaks**
138 | Adds the page number to each line for an input file that specifies page breaks as the ASCII page break character.
139 | Mainly to be used internally by the poppler adapter.
140 | Extensions: .asciipagebreaks
141 |
142 | - **ffmpeg**
143 | Uses ffmpeg to extract video metadata/chapters, subtitles, lyrics, and other metadata
144 | Extensions: .mkv, .mp4, .avi, .mp3, .ogg, .flac, .webm
145 |
146 | - **zip**
147 | Reads a zip file as a stream and recurses down into its contents
148 | Extensions: .zip, .jar
149 | Mime Types: application/zip
150 |
151 | - **decompress**
152 | Reads a compressed file as a stream and runs a different extractor on the contents.
153 | Extensions: .als, .bz2, .gz, .tbz, .tbz2, .tgz, .xz, .zst
154 | Mime Types: application/gzip, application/x-bzip, application/x-xz, application/zstd
155 |
156 | - **tar**
157 | Reads a tar file as a stream and recurses down into its contents
158 | Extensions: .tar
159 |
160 | - **sqlite**
161 | Uses sqlite bindings to convert sqlite databases into a simple plain text format
162 | Extensions: .db, .db3, .sqlite, .sqlite3
163 | Mime Types: application/x-sqlite3
164 |
165 | The following adapters are disabled by default, and can be enabled using '--rga-adapters=+foo,bar':
166 |
167 | - **mail**
168 | Reads mailbox/mail files and runs extractors on the contents and attachments.
169 | Extensions: .mbox, .mbx, .eml
170 | Mime Types: application/mbox, message/rfc822
171 |
172 | ## USAGE:
173 |
174 | > rga \[RGA OPTIONS\] \[RG OPTIONS\] PATTERN \[PATH \...\]
175 |
176 |
177 | ## FLAGS:
178 |
179 | **\--rga-accurate**
180 |
181 | > Use more accurate but slower matching by mime type
182 |
183 | > By default, rga will match files using file extensions. Some programs,
184 | > such as sqlite3, don\'t care about the file extension, so users
185 | > sometimes use any or no extension at all. With this flag, rga will try
186 | > to detect the mime type of input files using the magic bytes (similar
187 | > to the \`file\` utility), and use that to choose the adapter.
188 | > Detection is only done on the first 8KiB of the file, since we can\'t
189 | > always seek on the input (in archives).
190 |
191 | **\--rga-no-cache**
192 |
193 | > Disable caching of results
194 |
195 | > By default, rga caches the extracted text, if it is small enough, to a
196 | > database in \${XDG_CACHE_DIR-\~/.cache}/ripgrep-all on Linux,
197 | > _\~/Library/Caches/ripgrep-all_ on macOS, or
198 | > C:\\Users\\username\\AppData\\Local\\ripgrep-all on Windows. This way,
199 | > repeated searches on the same set of files will be much faster. If you
200 | > pass this flag, all caching will be disabled.
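For example (an illustrative invocation; the pattern and path are placeholders):

```
rga --rga-no-cache "some pattern" ~/Documents
```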
201 |
202 | **-h**, **\--help**
203 |
204 | > Prints help information
205 |
206 | **\--rga-list-adapters**
207 |
208 | > List all known adapters
209 |
210 | **\--rga-print-config-schema**
211 |
212 | > Print the JSON Schema of the configuration file
213 |
214 | **\--rg-help**
215 |
216 | > Show help for ripgrep itself
217 |
218 | **\--rg-version**
219 |
220 | > Show version of ripgrep itself
221 |
222 | **-V**, **\--version**
223 |
224 | > Prints version information
225 |
226 | ## OPTIONS:
227 |
228 | **\--rga-adapters=**\<adapters\>\...
229 |
230 | > Change which adapters to use and in which priority order (descending)
231 |
232 | > \"foo,bar\" means use only adapters foo and bar. \"-bar,baz\" means
233 | > use all default adapters except for bar and baz. \"+bar,baz\" means
234 | > use all default adapters and also bar and baz.
235 |
236 | **\--rga-cache-compression-level=**\<level\>
237 |
238 | > ZSTD compression level to apply to adapter outputs before storing in
239 | > the cache db
240 |
241 | > Ranges from 1 - 22 \[default: 12\]
242 |
243 | **\--rga-config-file=**\<path\>
244 |
245 | **\--rga-max-archive-recursion=**\<depth\>
246 |
247 | > Maximum nestedness of archives to recurse into \[default: 5\]
248 |
249 | **\--rga-cache-max-blob-len=**\<len\>
250 |
251 | > Max compressed size to cache
252 |
253 | > Longest byte length (after compression) to store in the cache. Longer
254 | > adapter outputs will not be cached but will be recomputed every time.
255 |
256 | > Allowed suffixes on command line: k M G \[default: 2000000\]
257 |
258 | **\--rga-cache-path=**\<path\>
259 |
260 | > Path to store the cache db \[default: /home/phire/.cache/ripgrep-all\]
261 |
262 | **-h** shows a concise overview; **\--help** shows more detail and
263 | advanced options.
264 |
265 | All other options not shown here are passed directly to rg, especially
266 | \[PATTERN\] and \[PATH \...\]
267 |
268 |
269 |
270 | ## Config
271 | The config file location leverages the mechanisms defined by
272 | - the [XDG base directory](https://standards.freedesktop.org/basedir-spec/basedir-spec-latest.html) and
273 | the [XDG user directory](https://www.freedesktop.org/wiki/Software/xdg-user-dirs/) specifications on Linux (ex: `~/.config/ripgrep-all/config.jsonc`)
274 | - the [Known Folder](https://msdn.microsoft.com/en-us/library/windows/desktop/dd378457.aspx) API on Windows (ex: `C:\Users\Alice\AppData\Roaming\ripgrep-all\config.jsonc`)
275 | - the [Standard Directories](https://developer.apple.com/library/content/documentation/FileManagement/Conceptual/FileSystemProgrammingGuide/FileSystemOverview/FileSystemOverview.html#//apple_ref/doc/uid/TP40010672-CH2-SW6)
276 | guidelines on macOS (ex: `~/Library/Application Support/ripgrep-all/config.jsonc`)
277 |
278 |
279 | ## Development
280 |
281 | To enable debug logging:
282 |
283 | ```bash
284 | export RUST_LOG=debug
285 | export RUST_BACKTRACE=1
286 | ```
287 |
288 | Also remember to disable caching with `--rga-no-cache` or clear the cache
289 | (`~/Library/Caches/rga` on macOS, `~/.cache/rga` on other Unixes,
290 | or `C:\Users\username\AppData\Local\rga` on Windows)
291 | when debugging the adapters.
292 |
293 | ### Nix and Direnv
294 |
295 | You can use the provided [`flake.nix`](./flake.nix) to set up all build- and
296 | run-time dependencies:
297 |
298 | 1. Enable [Flakes](https://wiki.nixos.org/wiki/Flakes) in your Nix configuration.
299 | 1. Add [`direnv`](https://direnv.net/) to your profile:
300 | `nix profile install nixpkgs#direnv`
301 | 1. `cd` into the directory where you have cloned this repository.
302 | 1. 
Allow use of [`.envrc`](./.envrc): `direnv allow`
303 | 1. After the dependencies have been installed, your shell will now have all of
304 | the necessary development dependencies.
305 |
-------------------------------------------------------------------------------- /ci/macos-install-packages: --------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | brew install poppler
4 |
-------------------------------------------------------------------------------- /ci/ubuntu-install-packages: --------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | sudo apt-get update
4 | sudo apt-get install -y --no-install-recommends \
5 | poppler-utils
6 |
-------------------------------------------------------------------------------- /doc/config.default.jsonc: --------------------------------------------------------------------------------
1 | {
2 | // This file follows the JSON schema defined below.
3 | // If you use an editor that supports JSON schema (e.g. VS Code),
4 | // you should be getting IntelliSense and validation.
5 | "$schema": "./config.v1.schema.json",
6 | // The default config and schema will be regenerated if they are missing
7 | // https://github.com/phiresky/ripgrep-all/blob/master/doc/config.default.jsonc
8 |
9 | // The config options are the same as the command line options,
10 | // but with the --rga- prefix removed and - and . replaced with _.
11 | // e.g. --rga-no-cache becomes `"no_cache": true`.
12 | // The only exception is the `custom_adapters` option, which can only be set in this file.
13 |
14 | "custom_adapters": [
15 | // See https://github.com/phiresky/ripgrep-all/wiki for more information.
16 | // To verify that your custom adapters are picked up correctly, run `rga --rga-list-adapters`.
17 | ]
18 | }
19 |
-------------------------------------------------------------------------------- /doc/demodir.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/doc/demodir.png
-------------------------------------------------------------------------------- /doc/notes.md: --------------------------------------------------------------------------------
1 | ## schema -> ui generation
2 |
3 | https://json-schema.org/implementations.html#web-ui-generation
4 |
5 | - https://github.com/guillotinaweb/ngx-schema-form
6 | - https://github.com/hamzahamidi/ajsf angular igh
7 | - https://github.com/dashjoin/json-schema-form
8 | - https://github.com/json-editor/json-editor
9 | - https://github.com/jsonform/jsonform
10 | - https://github.com/vazco/uniforms
11 |
12 | ## json schema is ridiculous
13 |
14 | "mimetypes": {
15 | "description": "if not null and --rga-accurate is enabled, mime type matching is used instead of file name matching",
16 | "type": [
17 | "array",
18 | "null"
19 | ],
20 | "items": {
21 | "type": "string"
22 | }
23 | },
24 |
25 | what the fuck????
this is the only thing required to see that json schema has horrible design
26 |
-------------------------------------------------------------------------------- /doc/rga-fzf.gif: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/doc/rga-fzf.gif
-------------------------------------------------------------------------------- /doc/update-readme.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | content=$(
4 | cat <<END
6 | $(cargo run --bin rga -- --rga-list-adapters)
7 |
8 | $(help2man -N "cargo run --bin rga --" | pandoc -f man -t markdown --markdown-headings=atx | rg --multiline "## USAGE:(.|\n)*")
9 |
10 | END
11 | )
12 |
13 | rg --passthrough --multiline '.*update-readme.sh(.|\n)*update-readme.sh.*' README.md --replace "$content" | sponge README.md
14 | prettier --write README.md
15 |
-------------------------------------------------------------------------------- /exampledir/decompress/test.log: --------------------------------------------------------------------------------
1 | hello world
2 | this is a test
3 |
-------------------------------------------------------------------------------- /exampledir/decompress/test.log.bz2: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/decompress/test.log.bz2
-------------------------------------------------------------------------------- /exampledir/decompress/test.log.gz: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/decompress/test.log.gz
-------------------------------------------------------------------------------- /exampledir/decompress/test.log.xz: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/decompress/test.log.xz
-------------------------------------------------------------------------------- /exampledir/decompress/test.log.zst: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/decompress/test.log.zst
-------------------------------------------------------------------------------- /exampledir/decompress/testlogbutwithoutextension: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/decompress/testlogbutwithoutextension
-------------------------------------------------------------------------------- /exampledir/demo/greeting.mkv: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/demo/greeting.mkv
-------------------------------------------------------------------------------- /exampledir/demo/hello.odt: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/demo/hello.odt
-------------------------------------------------------------------------------- /exampledir/demo/hello.sqlite3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/demo/hello.sqlite3 -------------------------------------------------------------------------------- /exampledir/demo/somearchive.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/demo/somearchive.zip -------------------------------------------------------------------------------- /exampledir/droste.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/droste.zip -------------------------------------------------------------------------------- /exampledir/encoding/utf16le.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/encoding/utf16le.txt -------------------------------------------------------------------------------- /exampledir/encoding/utf8.txt: -------------------------------------------------------------------------------- 1 | hello wörld! 2 | -------------------------------------------------------------------------------- /exampledir/encoding/zip.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/encoding/zip.tar.gz -------------------------------------------------------------------------------- /exampledir/exif.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/exif.png -------------------------------------------------------------------------------- /exampledir/formatting.epub: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/formatting.epub -------------------------------------------------------------------------------- /exampledir/mail_nested.eml: -------------------------------------------------------------------------------- 1 | To: submit.t4eseGWSvG1JST3r@spam.spamcop.net 2 | From: 2012gdwu <2012gdwu@posteo.de> 3 | Subject: Postbank Spam 4 | Autocrypt: addr=2012gdwu@posteo.de; keydata= 5 | mDMEXXjwiRYJKwYBBAHaRw8BAQdAmjXRazNXXy5tK05Dwl5mSRbdth9JkQq92V/QVyqjdgm0 6 | I0FybmUgS2VsbGVyIDxhcm5lLmtlbGxlckBwb3N0ZW8uZGU+iJYEExYIAD4WIQR2UN3HoAGx 7 | KI0B7Eih+UCxBQvPLgUCXXjwiQIbAwUJCWYBgAULCQgHAgYVCgkICwIEFgIDAQIeAQIXgAAK 8 | CRCh+UCxBQvPLpPfAP4gs6Oky3+UO2LU2XxweeQO+YEWXK0QtM2+ajzrGaF3HAD+LBfmyB9+ 9 | Wom2KP0CwxUzI4d6zmiAMSKOnGGgzd65igm4OARdePCJEgorBgEEAZdVAQUBAQdAncxZ3Rox 10 | wmvm+/qCkCm9+PU2HmWr08M3qdqkf2L4IngDAQgHiH4EGBYIACYWIQR2UN3HoAGxKI0B7Eih 11 | +UCxBQvPLgUCXXjwiQIbDAUJCWYBgAAKCRCh+UCxBQvPLpQkAQCgYOlOftMNi+sfn+XQvfOc 12 | ULQWp+cgOBMcyVCdpJEQCwD9HBuwuHobl8FPm0PbRtlCn/7GY4WK+Hh4+3BKmhRn8wU= 13 | Message-ID: <1530ae05-33a7-fa40-9473-ca625a14385a@posteo.de> 14 | Date: Mon, 20 Jul 2020 07:35:55 +0200 15 | User-Agent: Mozilla/5.0 (X11; Linux x86_64; 
rv:68.0) Gecko/20100101 16 | Thunderbird/68.10.0 17 | MIME-Version: 1.0 18 | Content-Type: multipart/mixed; 19 | boundary="------------6670F92201FB126ED9472803" 20 | Content-Language: de-DE 21 | 22 | This is a multi-part message in MIME format. 23 | --------------6670F92201FB126ED9472803 24 | Content-Type: text/plain; charset=utf-8 25 | Content-Transfer-Encoding: 7bit 26 | 27 | here you go 28 | 29 | 30 | --------------6670F92201FB126ED9472803 31 | Content-Type: message/rfc822; 32 | name="postbank.eml" 33 | Content-Transfer-Encoding: 7bit 34 | Content-Disposition: attachment; 35 | filename="postbank.eml" 36 | 37 | Return-Path: 38 | Delivered-To: arne.keller@posteo.de 39 | Received: from proxy02.posteo.name ([127.0.0.1]) 40 | by dovecot12 (Dovecot) with LMTP id EaKBGxv9FF+9mwEAJesNpQ 41 | for ; Mon, 20 Jul 2020 04:15:27 +0200 42 | Received: from proxy02.posteo.de ([127.0.0.1]) 43 | by proxy02.posteo.name (Dovecot) with LMTP id 31UFGtHsFF+T4gMAGFAyLg 44 | ; Mon, 20 Jul 2020 04:15:27 +0200 45 | Received: from mailin05.posteo.de (unknown [10.0.1.5]) 46 | by proxy02.posteo.de (Postfix) with ESMTPS id 4B950v2JYGz11fk 47 | for ; Mon, 20 Jul 2020 04:15:27 +0200 (CEST) 48 | Received: from mx03.posteo.de (mailin05.posteo.de [127.0.0.1]) 49 | by mailin05.posteo.de (Postfix) with ESMTPS id 4270120F15 50 | for ; Mon, 20 Jul 2020 04:15:27 +0200 (CEST) 51 | X-Virus-Scanned: amavisd-new at posteo.de 52 | X-Spam-Flag: NO 53 | X-Spam-Score: 2.639 54 | X-Spam-Level: ** 55 | X-Spam-Status: No, score=2.639 tagged_above=-1000 required=8 56 | tests=[AV:Heuristics.Phishing.Email.SpoofedDomain=0.1, ALL_TRUSTED=-1, 57 | FROM_LOCAL_NOVOWEL=0.5, HK_RANDOM_ENVFROM=0.626, HK_RANDOM_FROM=0.999, 58 | HTML_FONT_LOW_CONTRAST=0.001, HTML_IMAGE_ONLY_24=1.282, 59 | HTML_MESSAGE=0.001, HTTPS_HTTP_MISMATCH=0.1, POSTEO_GENERICS_IO=0.01, 60 | T_FILL_THIS_FORM_SHORT=0.01, T_REMOTE_IMAGE=0.01] autolearn=disabled 61 | Received: from mout.web.de (mout.web.de [212.227.15.14]) 62 | by mx03.posteo.de (Postfix) with ESMTPS id 4B950t696Mz10nB 63 | for ; Mon, 20 Jul 2020 04:15:26 +0200 (CEST) 64 | Authentication-Results: mx03.posteo.de; dmarc=none (p=none dis=none) header.from=carcarry.de 65 | Received: from [212.227.15.17] ([212.227.15.17]) by mx-ha.web.de (mxweb010 66 | [212.227.15.17]) with ESMTPS (Nemesis) id 1MRloE-1kQNT22I4w-00T9hm for 67 | ; Mon, 20 Jul 2020 04:15:26 +0200 68 | Received: from mout.kundenserver.de ([212.227.17.24]) by mx-ha.web.de 69 | (mxweb010 [212.227.15.17]) with ESMTPS (Nemesis) id 1MINbE-1k0aRm2Hzw-00EOVM 70 | for <2012gdwu@web.de>; Mon, 20 Jul 2020 04:15:26 +0200 71 | Received: from 217.160.251.109 ([217.160.251.109]) by mrelayeu.kundenserver.de 72 | (mreue107 [212.227.15.183]) with ESMTPSA (Nemesis) id 73 | 1MPoPd-1kBHRt0o2F-00MqkS for <2012gdwu@web.de>; Mon, 20 Jul 2020 04:15:26 74 | +0200 75 | From: "=?utf-8?B?UE9TVEJBTs2fS82f?=" 76 | Subject: BsetSign App : Y7P32-HTXU2-FRDG7 77 | To: "2012gdwu" <2012gdwu@web.de> 78 | Content-Type: multipart/alternative; boundary="QHebeB08yNTYquFAhtQnxv=_cOW4Xd528c" 79 | MIME-Version: 1.0 80 | Date: Mon, 20 Jul 2020 02:15:26 +0000 81 | Message-ID: <1M3lHZ-1jyAPt0pTn-000u1I@mrelayeu.kundenserver.de> 82 | X-Provags-ID: V03:K1:68TECBVA88ZKh8HcSl/N+ElwlecL1tc+1AuDDyqm9em66WO295R 83 | IfuHqA9uG7+Vlyr99v+OneGltnr43KfsgRKj9GgOpDj2QelHphKFGPILAvvsQ8vOq6ucC2W 84 | BW3NEOh3JhitB6o4xLEmj+dbivC0ie728/cPMcjj6TwyBzw5nT1or8mBZWoEMSF/zcu+PIr 85 | gGpFY2puzzURN4oKX82/w== 86 | X-Spam-Flag: NO 87 | X-UI-Out-Filterresults: notjunk:1;V03:K0:c01ZANnvlk8=:ouSMGue72FUx2PJOSNnmEW 88 | 
qI8A89gf6q3aAdJBhLX1Bhd70xio64ljpha9X5ArOYg6Q2RH1JYyvfBSMoTo3HMy37H3L8kaq 89 | ReRCdSPOMD8+llZ/rRpPLl+7PofGOv+Hu3UO7gzgm9v0YqwLZIwh9P2w9TIu+GqVJWeDdmxrs 90 | RDPeHY8lsRL+8AFeSGNiWBYMEHDxKofTqS5Zh7mal1Bm4JbgEEIP36V4oL3c6V1olMHQZzEH9 91 | 7D0T8U6LyLyfSbuu5M6QN2FZ+F6IDJNDUG1uwNt9K12ESY6TweMR3xInFabiZ9fMPmrjPaNwW 92 | hlyKg67tDYL2lfk2fpa/LbhLnlfKEDqSvkgK54CZh+xbIQetju66cZUEFQyCIcGdAOWI8+nty 93 | FdbNUzxhNpZTPBrA7H95gRuc0u2GJBfZZsxdp46jpBwG65yqmJ32pkJrATo8CNbBO9A6hpdyL 94 | UNu5bavZBJp9dsyY6Cnm6vMOIjJ8qMy/vNkrtRXNWBrnVHhuQZ3B+osG8XWLiyq7s4hFOwDxY 95 | WLRgjKL6HgIj+2DLParwiuSsX8TVy5+WhxDUou0UJDzD3C1JmYiryTlo4Vu4CIZFXkgAuAsEq 96 | c55M6L2eUmD3xQNaqgMEJFksT2qXWaSb2Qw6HM7mtLBbSUhuWtSv2oeVrNwgx8XWexWYYZYFv 97 | KAZzICpkVhxpYIntoKRiDtQZxBDejPwGmne2iG81rn34pGJwOOYojf9dFghodE5bZEqVh6KbA 98 | f/38x9FIoYewzA2WuyngX/bXTdkLQM49W1vdlF5DQOlgYuM8Ni7NeJG888VhDZxcUn6vIIJs3 99 | xH0jOWrWCUz0gK9uyyagjcfdXr54Zv1E7i936CTlRq5QnDKN2C9jQFH5ymD4G1W5zX6Xj/05O 100 | M7VaU9Y3mvOM/+82zsKc5zJOFOf9MoI5JBhnPjHWeqaJgpYhNoKgGvPo3QfZFwzk/MHH2PgB1 101 | PLGvjSE8u/cpYeGhJdzTXM00J9ai5yGRNFD71zHoHBOFGCpmZVnJJ8SD+qUd4K4BfSD+DJ5Qd 102 | t1wsCpH5bgodnXgMcN6Zj0q3P/ODk3dnah1hsYMyIWDBFZ0cTlp2QkYhAKZh1HM5WcfSc5UwU 103 | SrcK9HHiG7BKOFYA1r6Rx5YYqwGWeGxr9mlH7MLyfCwI8PlWtfeB7Pj4eEI1hLy9GMnHBCJDj 104 | W8o1yDeE54rgWHR7CtIF6w+qF+quA3ZdwVSPOHwQeH7vS4OaJjeEyeeT4YOJdIMI7UknEasAG 105 | LfMS/PKWx7+YcUNaz0xvO70NwZj1FKJuWqDS6ZTciMSvGkEFTWVOqn5nPlHi8hDbBTVn70aPa 106 | BQi3U68hgdDpJIHlVLLvRcaCYYly3L60NQBgJroag4fRiIvDUSXfDatrDYOv+L4xBYdB3GP+s 107 | wqtsPY82YOwXP5KlRMPVEZcuWX5tWiOuaNjePbEkXpE2iQZUqfkDQTYNUGZR+TTBqHOWjO7R3 108 | hORQB0gOwe85gZv80G1EL32EtRjVxJxQfrHGPCGXb8HRXbvGGV3Xu3wZEE8iuJngBUJtWeDBq 109 | q61rYwZxVuml72lfRM6Lo+OGLAsyqvobxujY9BHpokZH4FNlUstjUoPANTGoAhM+MyQb0fSAV 110 | 8HA/r6n0oJh0B8+2AxJvVokbhEbL/RlJIZIYpCeRceeA+jjBaR7EvuglUoLN3CcB9CrdDH/qz 111 | ymHzEjPVnFar3/sqRjeKyIk71z4yotOKCPQcdD1gTbYWehZiIJwAlDFSpfPdFTQLOJMWd3wuD 112 | 0mHLep6tLtCY+hjhCYWlTyKKQ8CWiBWPTql21bPp7XVWCfc+4u8kZi5Y3dg3pvpSwwmcyRisX 113 | +7+8a+pBzN4VOEuX+dzglKDrNd6h2OL0tBMnk1yqAV27dX9cMRrO941IvtiaZO90BjZtV92oP 114 | XkGxvKnGQuynHus/3yblaw== 115 | 116 | This is a multi-part message in MIME format 117 | 118 | --QHebeB08yNTYquFAhtQnxv=_cOW4Xd528c 119 | Content-Type: text/plain; charset="utf-8" 120 | Content-Transfer-Encoding: quoted-printable 121 | Content-Disposition: inline 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | Sehr geehrter Herr / Frau =E2=80=A6, 130 | 131 | Ab dem 20. Jul 2020 aktualisiert die Postbank alle BestSign-Anwendung= 132 | en. 133 | 134 | 135 | 136 | =C3=96ffnen Sie den unten stehenden Aktivierungslink, um am Upgrade t= 137 | eilzunehmen. Verkn=C3=BCpfung 138 | 139 | 140 | 141 | 142 | https://meine.postbank.de/#/login 143 | 144 | 145 | 146 | 147 | Wir empfehlen dringend, dieses Upgrade durchzuf=C3=BChren. 148 | 149 | Reundliche Gr=C3=BC=C3=9Fe, 150 | 151 | =C2=A9 2020 Postbank=E2=80=93 eine Niederlassung der Deutsche Bank AG= 152 | 153 | 154 | Hypnotiseur/zertifizierter Hypnosecoach (DVH) 155 | Burnoutpr=C3=A4ventionscoach 156 | Modeberater f=C3=BCr Ma=C3=9Fhemden/Ma=C3=9Fblusen 157 | Kurs/Seminarleiter Waldbaden/Waldcoach 158 | Am Wiesengrund 5 159 | 24980 Schafflund 160 | Tel.: 04639-98475 161 | Mob.: 015117317305 162 | Home : www.hypnosepraxis-im-norden.de 163 | Home : www.masshemden-im-norden.de 164 | Home : www.waldbaden-zwischen-den-meeren.de 165 | 166 | 167 | --QHebeB08yNTYquFAhtQnxv=_cOW4Xd528c 168 | Content-Type: text/html; charset="utf-8" 169 | Content-Transfer-Encoding: quoted-printable 170 | Content-Disposition: inline 171 | 172 |

3D""


175 |
176 |
 Sehr geehrter Herr / Frau =E2=80=A6,
177 |
 Ab dem 20. Jul 2020 aktualisiert die Postbank alle BestSign= 178 | -Anwendungen.

179 |
 =C3=96ffnen Sie den unten stehenden Aktivierungslink, um am= 180 | Upgrade teilzunehmen. Verkn=C3=BCpfung

181 |

183 |
 Wir empfehlen dringend, dieses Upgrade durchzuf=C3=BChren.<= 184 | /div> 185 |
 Reundliche Gr=C3=BC=C3=9Fe,
186 |
 =C2=A9 2020 Postbank=E2=80= 187 | =93 eine Niederlassung der Deutsche Bank AG

Hypnotiseur/zertifizierter Hypnosecoach (DVH)= 189 |
Burnoutpr=C3=A4= 190 | ventionscoach
Mo= 191 | deberater f=C3=BCr Ma=C3=9Fhemden/Ma=C3=9Fblusen
Kurs/Seminarleiter Waldbaden/Waldcoac= 193 | h
Am Wiesengrund= 194 | 5
24980 Schaffl= 195 | und
Tel.: 04639-= 196 | 98475
Mob.: 0151= 197 | 17317305
Home : = 198 | www.hypnos= 201 | epraxis-im-norden.de
Home : www.masshemden-im-norden.de
Home : www.waldbaden-zwischen-den-meeren.de<= 210 | /div> 211 |
212 | 213 | 214 | --QHebeB08yNTYquFAhtQnxv=_cOW4Xd528c-- 215 | 216 | 217 | --------------6670F92201FB126ED9472803-- 218 | -------------------------------------------------------------------------------- /exampledir/screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/screenshot.png -------------------------------------------------------------------------------- /exampledir/short.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/short.pdf -------------------------------------------------------------------------------- /exampledir/sqlitedb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/sqlitedb -------------------------------------------------------------------------------- /exampledir/tar/exampledir.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/tar/exampledir.tar.gz -------------------------------------------------------------------------------- /exampledir/tar/test.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/tar/test.tar -------------------------------------------------------------------------------- /exampledir/tar/test.tar.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/tar/test.tar.bz2 -------------------------------------------------------------------------------- /exampledir/tar/test.tar.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/tar/test.tar.zip -------------------------------------------------------------------------------- /exampledir/test.djvu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/test.djvu -------------------------------------------------------------------------------- /exampledir/test.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/test.zip -------------------------------------------------------------------------------- /exampledir/test/github_email.eml: -------------------------------------------------------------------------------- 1 | Return-Path: 2 | Date: Mon, 31 Jul 2023 01:34:57 -0700 3 | From: "github-actions[bot]" 4 | To: KeYProject/key 5 | In-Reply-To: 6 | References: 7 | Subject: Re: [KeYProject/key] Fix more UI bugs (PR #3232) 8 | Mime-Version: 1.0 9 | Content-Type: multipart/alternative; 10 | boundary="--==_mimepart_64c77231630fb_12bafdd0c012685"; 11 | charset=UTF-8 12 | Content-Transfer-Encoding: 7bit 13 | Precedence: list 14 | X-GitHub-Sender: github-actions[bot] 
15 | X-GitHub-Recipient: FliegendeWurst 16 | X-GitHub-Reason: author 17 | List-ID: KeYProject/key 18 | List-Archive: https://github.com/KeYProject/key 19 | X-Auto-Response-Suppress: All 20 | destinations: 2012gdwu+github@posteo.de 21 | X-GitHub-Recipient-Address: 2012gdwu+github@posteo.de 22 | 23 | 24 | ----==_mimepart_64c77231630fb_12bafdd0c012685 25 | Content-Type: text/plain; 26 | charset=UTF-8 27 | Content-Transfer-Encoding: 7bit 28 | 29 | Thank you for your contribution. 30 | 31 | The test artifacts are available on [Artiweb](https://keyproject.github.io/artiweb/3232/). 32 | The newest artifact is [here](https://keyproject.github.io/artiweb/3232/833812796/). 33 | 34 | -- 35 | Reply to this email directly or view it on GitHub: 36 | https://github.com/KeYProject/key/pull/3232#issuecomment-1657918122 37 | You are receiving this because you authored the thread. 38 | 39 | Message ID: 40 | ----==_mimepart_64c77231630fb_12bafdd0c012685 41 | Content-Type: text/html; 42 | charset=UTF-8 43 | Content-Transfer-Encoding: 7bit 44 | 45 |

46 |

Thank you for your contribution.

47 |

The test artifacts are available on Artiweb.
48 | The newest artifact is here.

49 | 50 |


Reply to this email directly, view it on GitHub, or unsubscribe.
You are receiving this because you authored the thread.Message ID: <KeYProject/key/pull/3232/c1657918122@github.com>

51 | 69 | ----==_mimepart_64c77231630fb_12bafdd0c012685-- 70 | -------------------------------------------------------------------------------- /exampledir/test/hello.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/test/hello.gz -------------------------------------------------------------------------------- /exampledir/test/hello.sqlite3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/test/hello.sqlite3 -------------------------------------------------------------------------------- /exampledir/test/hello.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/test/hello.tar -------------------------------------------------------------------------------- /exampledir/test/only-seek-zip.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/test/only-seek-zip.zip -------------------------------------------------------------------------------- /exampledir/test/short.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/test/short.pdf -------------------------------------------------------------------------------- /exampledir/test/short.pdf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/test/short.pdf.gz -------------------------------------------------------------------------------- /exampledir/test/test.mbx: -------------------------------------------------------------------------------- 1 | From 2 | Message-ID: <55a23774-4da7-057c-77a7-ec390fed487b@posteo.de> 3 | Date: Mon, 27 Feb 2023 12:05:46 +0100 4 | MIME-Version: 1.0 5 | User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101 6 | Thunderbird/102.8.0 7 | From: Arne Keller <2012gdwu@web.de> 8 | Subject: From encoding test 9 | To: arne.keller@posteo.de 10 | Content-Language: de-DE 11 | X-Enigmail-Draft-Status: N00200 12 | X-Mozilla-Draft-Info: internal/draft; vcard=0; receipt=0; DSN=0; uuencode=0; 13 | attachmentreminder=0; deliveryformat=0 14 | X-Identity-Key: id2 15 | Fcc: imap://2012gdwu@imap.web.de/Gesendet 16 | Content-Type: text/html; charset=UTF-8 17 | Content-Transfer-Encoding: 7bit 18 | 19 | 20 | 21 | 22 | 23 | 24 |

>From

25 |

Another word >From
26 |

27 | 28 | 29 | From 30 | Message-ID: <55a23774-4da7-057c-77a7-ec390fed487b@posteo.de> 31 | Date: Mon, 27 Feb 2023 12:06:56 +0100 32 | MIME-Version: 1.0 33 | User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101 34 | Thunderbird/102.8.0 35 | From: Arne Keller <2012gdwu@web.de> 36 | Subject: From encoding test 37 | To: arne.keller@posteo.de 38 | Content-Language: de-DE 39 | X-Enigmail-Draft-Status: N00200 40 | X-Mozilla-Draft-Info: internal/draft; vcard=0; receipt=0; DSN=0; uuencode=0; 41 | attachmentreminder=0; deliveryformat=1 42 | X-Identity-Key: id2 43 | Fcc: imap://2012gdwu@imap.web.de/Gesendet 44 | Content-Type: text/html; charset=UTF-8 45 | Content-Transfer-Encoding: 7bit 46 | 47 | 48 | 49 | 50 | 51 | 52 |

>From

53 |

Another word >From
54 |

55 | 56 | 57 | From - Mon Feb 27 12:06:57 2023 58 | X-Mozilla-Status: 0001 59 | X-Mozilla-Status2: 00000000 60 | Message-ID: <55a23774-4da7-057c-77a7-ec390fed487b@posteo.de> 61 | Date: Mon, 27 Feb 2023 12:06:56 +0100 62 | MIME-Version: 1.0 63 | User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101 64 | Thunderbird/102.8.0 65 | From: Arne Keller <2012gdwu@web.de> 66 | Subject: From encoding test 67 | To: arne.keller@posteo.de 68 | Content-Language: de-DE 69 | X-Enigmail-Draft-Status: N00200 70 | X-Mozilla-Draft-Info: internal/draft; vcard=0; receipt=0; DSN=0; uuencode=0; 71 | attachmentreminder=0; deliveryformat=1 72 | X-Identity-Key: id2 73 | Fcc: imap://2012gdwu@imap.web.de/Gesendet 74 | Content-Type: text/html; charset=UTF-8 75 | Content-Transfer-Encoding: 7bit 76 | 77 | 78 | 79 | 80 | 81 | 82 |

>From

83 |

Another word >From
84 |

85 | 86 | 87 | -------------------------------------------------------------------------------- /exampledir/test/twoblankpages.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/test/twoblankpages.pdf -------------------------------------------------------------------------------- /exampledir/wasteland.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/wasteland.docx -------------------------------------------------------------------------------- /exampledir/wasteland.epub: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/wasteland.epub -------------------------------------------------------------------------------- /exampledir/wasteland.mkv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/wasteland.mkv -------------------------------------------------------------------------------- /exampledir/wasteland.mobi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/wasteland.mobi -------------------------------------------------------------------------------- /exampledir/wasteland.odt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/wasteland.odt -------------------------------------------------------------------------------- /exampledir/wasteland.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phiresky/ripgrep-all/2ccf3eeb71e491f074fce67e3c2d1f72f380b09a/exampledir/wasteland.pdf -------------------------------------------------------------------------------- /flake.lock: -------------------------------------------------------------------------------- 1 | { 2 | "nodes": { 3 | "advisory-db": { 4 | "flake": false, 5 | "locked": { 6 | "lastModified": 1740407442, 7 | "narHash": "sha256-EGzWKm5cUDDJbwVzxSB4N/+CIVycwOG60Gh5f1Vp7JM=", 8 | "owner": "rustsec", 9 | "repo": "advisory-db", 10 | "rev": "2e25d9665f10de885c81a9fb9d51a289f625b05f", 11 | "type": "github" 12 | }, 13 | "original": { 14 | "owner": "rustsec", 15 | "repo": "advisory-db", 16 | "type": "github" 17 | } 18 | }, 19 | "crane": { 20 | "locked": { 21 | "lastModified": 1739936662, 22 | "narHash": "sha256-x4syUjNUuRblR07nDPeLDP7DpphaBVbUaSoeZkFbGSk=", 23 | "owner": "ipetkov", 24 | "repo": "crane", 25 | "rev": "19de14aaeb869287647d9461cbd389187d8ecdb7", 26 | "type": "github" 27 | }, 28 | "original": { 29 | "owner": "ipetkov", 30 | "repo": "crane", 31 | "type": "github" 32 | } 33 | }, 34 | "flake-compat": { 35 | "flake": false, 36 | "locked": { 37 | "lastModified": 1696426674, 38 | "narHash": "sha256-kvjfFW7WAETZlt09AgDn1MrtKzP7t90Vf7vypd3OL1U=", 39 | "owner": "edolstra", 40 | "repo": "flake-compat", 41 | "rev": "0f9255e01c2351cc7d116c072cb317785dd33b33", 42 | "type": "github" 43 | }, 44 | "original": { 45 | "owner": "edolstra", 46 | "repo": 
"flake-compat", 47 | "type": "github" 48 | } 49 | }, 50 | "flake-utils": { 51 | "inputs": { 52 | "systems": "systems" 53 | }, 54 | "locked": { 55 | "lastModified": 1731533236, 56 | "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=", 57 | "owner": "numtide", 58 | "repo": "flake-utils", 59 | "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b", 60 | "type": "github" 61 | }, 62 | "original": { 63 | "owner": "numtide", 64 | "repo": "flake-utils", 65 | "type": "github" 66 | } 67 | }, 68 | "gitignore": { 69 | "inputs": { 70 | "nixpkgs": [ 71 | "pre-commit-hooks", 72 | "nixpkgs" 73 | ] 74 | }, 75 | "locked": { 76 | "lastModified": 1709087332, 77 | "narHash": "sha256-HG2cCnktfHsKV0s4XW83gU3F57gaTljL9KNSuG6bnQs=", 78 | "owner": "hercules-ci", 79 | "repo": "gitignore.nix", 80 | "rev": "637db329424fd7e46cf4185293b9cc8c88c95394", 81 | "type": "github" 82 | }, 83 | "original": { 84 | "owner": "hercules-ci", 85 | "repo": "gitignore.nix", 86 | "type": "github" 87 | } 88 | }, 89 | "nixpkgs": { 90 | "locked": { 91 | "lastModified": 1740711547, 92 | "narHash": "sha256-qvixVB2cFGOX/B//KbjKUndrMbIDEGBx7xphitqnvr8=", 93 | "owner": "NixOS", 94 | "repo": "nixpkgs", 95 | "rev": "2ca95eef7e3b33ea8b858ed025e492373aca8106", 96 | "type": "github" 97 | }, 98 | "original": { 99 | "owner": "NixOS", 100 | "repo": "nixpkgs", 101 | "type": "github" 102 | } 103 | }, 104 | "pre-commit-hooks": { 105 | "inputs": { 106 | "flake-compat": "flake-compat", 107 | "gitignore": "gitignore", 108 | "nixpkgs": [ 109 | "nixpkgs" 110 | ] 111 | }, 112 | "locked": { 113 | "lastModified": 1737465171, 114 | "narHash": "sha256-R10v2hoJRLq8jcL4syVFag7nIGE7m13qO48wRIukWNg=", 115 | "owner": "cachix", 116 | "repo": "pre-commit-hooks.nix", 117 | "rev": "9364dc02281ce2d37a1f55b6e51f7c0f65a75f17", 118 | "type": "github" 119 | }, 120 | "original": { 121 | "owner": "cachix", 122 | "repo": "pre-commit-hooks.nix", 123 | "type": "github" 124 | } 125 | }, 126 | "root": { 127 | "inputs": { 128 | "advisory-db": "advisory-db", 129 | "crane": "crane", 130 | "flake-utils": "flake-utils", 131 | "nixpkgs": "nixpkgs", 132 | "pre-commit-hooks": "pre-commit-hooks", 133 | "rust-overlay": "rust-overlay" 134 | } 135 | }, 136 | "rust-overlay": { 137 | "inputs": { 138 | "nixpkgs": [ 139 | "nixpkgs" 140 | ] 141 | }, 142 | "locked": { 143 | "lastModified": 1740709839, 144 | "narHash": "sha256-4dF++MXIXna/AwlZWDKr7bgUmY4xoEwvkF1GewjNrt0=", 145 | "owner": "oxalica", 146 | "repo": "rust-overlay", 147 | "rev": "b4270835bf43c6f80285adac6f66a26d83f0f277", 148 | "type": "github" 149 | }, 150 | "original": { 151 | "owner": "oxalica", 152 | "repo": "rust-overlay", 153 | "type": "github" 154 | } 155 | }, 156 | "systems": { 157 | "locked": { 158 | "lastModified": 1681028828, 159 | "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", 160 | "owner": "nix-systems", 161 | "repo": "default", 162 | "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", 163 | "type": "github" 164 | }, 165 | "original": { 166 | "owner": "nix-systems", 167 | "repo": "default", 168 | "type": "github" 169 | } 170 | } 171 | }, 172 | "root": "root", 173 | "version": 7 174 | } 175 | -------------------------------------------------------------------------------- /flake.nix: -------------------------------------------------------------------------------- 1 | { 2 | description = "ripgrep, but also search in PDFs, E-Books, Office documents, zip, tar.gz, etc."; 3 | 4 | inputs = { 5 | nixpkgs.url = "github:NixOS/nixpkgs"; 6 | 7 | crane = { 8 | url = "github:ipetkov/crane"; 9 | }; 10 | 11 | 
flake-utils.url = "github:numtide/flake-utils"; 12 | 13 | rust-overlay = { 14 | url = "github:oxalica/rust-overlay"; 15 | inputs.nixpkgs.follows = "nixpkgs"; 16 | }; 17 | 18 | advisory-db = { 19 | url = "github:rustsec/advisory-db"; 20 | flake = false; 21 | }; 22 | 23 | pre-commit-hooks = { 24 | url = "github:cachix/pre-commit-hooks.nix"; 25 | inputs.nixpkgs.follows = "nixpkgs"; 26 | }; 27 | }; 28 | 29 | outputs = { 30 | self, 31 | nixpkgs, 32 | crane, 33 | flake-utils, 34 | rust-overlay, 35 | advisory-db, 36 | pre-commit-hooks, 37 | }: 38 | flake-utils.lib.eachDefaultSystem (system: let 39 | pkgs = import nixpkgs { 40 | inherit system; 41 | overlays = [(import rust-overlay)]; 42 | }; 43 | 44 | craneLib = 45 | (crane.mkLib pkgs).overrideToolchain 46 | (p: 47 | (p.rust-bin.fromRustupToolchainFile ./rust-toolchain.toml).override { 48 | extensions = [ 49 | "rust-analyzer" 50 | "rust-src" 51 | "rustfmt" 52 | ]; 53 | }); 54 | 55 | src = pkgs.lib.cleanSourceWith { 56 | src = craneLib.path ./.; 57 | filter = pkgs.lib.cleanSourceFilter; 58 | }; 59 | 60 | nativeBuildInputs = pkgs.lib.optionals pkgs.stdenv.isDarwin [ 61 | # Additional darwin specific inputs can be set here 62 | pkgs.libiconv 63 | ]; 64 | 65 | runtimeInputs = with pkgs; [ffmpeg pandoc poppler_utils ripgrep zip]; 66 | 67 | # Build *just* the cargo dependencies, so we can reuse 68 | # all of that work (e.g. via cachix) when running in CI 69 | cargoArtifacts = 70 | craneLib.buildDepsOnly {inherit src nativeBuildInputs;}; 71 | 72 | # Build the actual crate itself, reusing the dependency 73 | # artifacts from above. 74 | rgaBinary = craneLib.buildPackage { 75 | inherit cargoArtifacts src nativeBuildInputs; 76 | buildInputs = runtimeInputs; # needed for tests 77 | }; 78 | 79 | # Provide a shell script wrapping the Rust binary together with its runtime dependencies. 80 | rga = pkgs.pkgs.writeShellApplication { 81 | name = "rga"; 82 | text = ''rga "$@"''; 83 | runtimeInputs = runtimeInputs ++ [rgaBinary]; 84 | }; 85 | 86 | pre-commit = pre-commit-hooks.lib."${system}".run; 87 | in { 88 | # `nix flake check` 89 | checks = { 90 | # Build the crate as part of `nix flake check` for convenience 91 | inherit rgaBinary; 92 | 93 | # Run clippy (and deny all warnings) on the crate source, 94 | # again, reusing the dependency artifacts from above. 95 | # 96 | # Note that this is done as a separate derivation so that 97 | # we can block the CI if there are issues here, but not 98 | # prevent downstream consumers from building our crate by itself. 99 | rga-clippy = craneLib.cargoClippy { 100 | inherit cargoArtifacts src; 101 | cargoClippyExtraArgs = "--all-targets -- --deny warnings"; 102 | }; 103 | 104 | rga-doc = craneLib.cargoDoc {inherit cargoArtifacts src;}; 105 | 106 | # Audit dependencies 107 | rga-audit = craneLib.cargoAudit {inherit src advisory-db;}; 108 | 109 | # Run tests with cargo-nextest. 
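# (Illustrative note, not part of the upstream flake: every attribute in this
# `checks` set is exposed as a flake check and can also be built on its own,
# e.g. `nix build .#checks.x86_64-linux.rga-nextest --print-build-logs`,
# with `x86_64-linux` replaced by your actual system.)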
110 | rga-nextest = craneLib.cargoNextest { 111 | inherit cargoArtifacts src nativeBuildInputs; 112 | buildInputs = runtimeInputs; # needed for tests 113 | partitions = 1; 114 | partitionType = "count"; 115 | }; 116 | 117 | pre-commit = pre-commit { 118 | src = ./.; 119 | hooks = { 120 | alejandra.enable = true; 121 | rustfmt = { 122 | enable = true; 123 | packageOverrides.cargo = craneLib.cargo; 124 | packageOverrides.rustfmt = craneLib.rustfmt; 125 | }; 126 | typos = { 127 | enable = true; 128 | settings = { 129 | exclude = "exampledir/*"; 130 | }; 131 | }; 132 | }; 133 | }; 134 | }; 135 | 136 | # `nix build` 137 | packages = { 138 | inherit rgaBinary rga; 139 | default = rga; # `nix build` 140 | }; 141 | 142 | # `nix run` 143 | apps.default = flake-utils.lib.mkApp {drv = rga;}; 144 | 145 | # `nix develop` 146 | devShells.default = craneLib.devShell { 147 | inherit nativeBuildInputs; 148 | inherit (self.checks.${system}.pre-commit) shellHook; 149 | inputsFrom = builtins.attrValues self.checks; 150 | buildInputs = self.checks.${system}.pre-commit.enabledPackages; 151 | packages = runtimeInputs; 152 | }; 153 | }); 154 | } 155 | -------------------------------------------------------------------------------- /rust-toolchain.toml: -------------------------------------------------------------------------------- 1 | [toolchain] 2 | channel = "stable" 3 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | edition = "2018" 2 | -------------------------------------------------------------------------------- /src/adapted_iter.rs: -------------------------------------------------------------------------------- 1 | use std::pin::Pin; 2 | 3 | use tokio_stream::Stream; 4 | 5 | use crate::adapters::AdaptInfo; 6 | 7 | pub trait AdaptedFilesIter: Stream<Item = anyhow::Result<AdaptInfo>> + Send {} 8 | impl<T> AdaptedFilesIter for T where T: Stream<Item = anyhow::Result<AdaptInfo>> + Send {} 9 | 10 | pub type AdaptedFilesIterBox = Pin<Box<dyn AdaptedFilesIter>>; 11 | 12 | pub fn one_file(ai: AdaptInfo) -> AdaptedFilesIterBox { 13 | Box::pin(tokio_stream::once(Ok(ai))) 14 | } 15 | -------------------------------------------------------------------------------- /src/adapters.rs: -------------------------------------------------------------------------------- 1 | pub mod custom; 2 | pub mod decompress; 3 | pub mod ffmpeg; 4 | pub mod mbox; 5 | pub mod postproc; 6 | use std::sync::Arc; 7 | pub mod sqlite; 8 | pub mod tar; 9 | pub mod writing; 10 | pub mod zip; 11 | use crate::{adapted_iter::AdaptedFilesIterBox, config::RgaConfig, matching::*}; 12 | use anyhow::{Context, Result, format_err}; 13 | use async_trait::async_trait; 14 | use custom::BUILTIN_SPAWNING_ADAPTERS; 15 | use custom::CustomAdapterConfig; 16 | use log::*; 17 | use tokio::io::AsyncRead; 18 | 19 | use core::fmt::Debug; 20 | use std::borrow::Cow; 21 | use std::collections::HashMap; 22 | use std::iter::Iterator; 23 | use std::path::PathBuf; 24 | use std::pin::Pin; 25 | 26 | use self::postproc::PostprocPageBreaks; 27 | 28 | pub type ReadBox = Pin<Box<dyn AsyncRead + Send>>; 29 | pub struct AdapterMeta { 30 | /// unique short name of this adapter (a-z0-9 only) 31 | pub name: String, 32 | /// version identifier. used to key cache entries, change if your output format changes 33 | pub version: i32, 34 | pub description: String, 35 | /// indicates whether this adapter can descend (=call rga_preproc again). 
if true, the cache key needs to include the list of active adapters 36 | pub recurses: bool, 37 | /// list of matchers (interpreted as a OR b OR ...) 38 | pub fast_matchers: Vec<FastFileMatcher>, 39 | /// list of matchers when we have mime type detection active (interpreted as ORed) 40 | /// warning: this *overrides* the fast matchers 41 | pub slow_matchers: Option<Vec<FileMatcher>>, 42 | /// if true, slow_matchers is merged with fast matchers if accurate is enabled 43 | /// for example, in sqlite you want this disabled since the db extension can mean other things and the mime type matching is very accurate for sqlite. 44 | /// but for tar you want it enabled, since the tar extension is very accurate but the tar mime matcher can have false negatives 45 | pub keep_fast_matchers_if_accurate: bool, 46 | // if true, adapter is only used when user lists it in `--rga-adapters` 47 | pub disabled_by_default: bool, 48 | } 49 | impl AdapterMeta { 50 | // todo: this is pretty ugly 51 | pub fn get_matchers<'a>( 52 | &'a self, 53 | slow: bool, 54 | ) -> Box<dyn Iterator<Item = Cow<'a, FileMatcher>> + 'a> { 55 | match ( 56 | slow, 57 | self.keep_fast_matchers_if_accurate, 58 | &self.slow_matchers, 59 | ) { 60 | (true, false, Some(sm)) => Box::new(sm.iter().map(Cow::Borrowed)), 61 | (true, true, Some(sm)) => Box::new( 62 | sm.iter().map(Cow::Borrowed).chain( 63 | self.fast_matchers 64 | .iter() 65 | .map(|e| Cow::Owned(FileMatcher::Fast(e.clone()))), 66 | ), 67 | ), 68 | // don't have slow matchers or slow matching disabled 69 | (true, _, None) | (false, _, _) => Box::new( 70 | self.fast_matchers 71 | .iter() 72 | .map(|e| Cow::Owned(FileMatcher::Fast(e.clone()))), 73 | ), 74 | } 75 | } 76 | } 77 | 78 | pub trait GetMetadata { 79 | fn metadata(&self) -> &AdapterMeta; 80 | } 81 | 82 | #[async_trait] 83 | pub trait FileAdapter: GetMetadata + Send + Sync { 84 | /// adapt a file. 85 | /// 86 | /// detection_reason is the Matcher that was used to identify this file. Unless --rga-accurate was given, it is always a FastMatcher 87 | async fn adapt( 88 | &self, 89 | a: AdaptInfo, 90 | detection_reason: &FileMatcher, 91 | ) -> Result<AdaptedFilesIterBox>; 92 | } 93 | 94 | pub struct AdaptInfo { 95 | /// file path. May not be an actual file on the file system (e.g. in an archive). Used for matching file extensions. 96 | pub filepath_hint: PathBuf, 97 | /// true if filepath_hint is an actual file on the file system 98 | pub is_real_file: bool, 99 | /// depth at which this file is in archives. 0 for real filesystem 100 | pub archive_recursion_depth: i32, 101 | /// stream to read the file from. 
can be from a file or from some decoder 102 | pub inp: ReadBox, 103 | /// prefix every output line with this string to better indicate the file's location if it is in some archive 104 | pub line_prefix: String, 105 | pub postprocess: bool, 106 | pub config: RgaConfig, 107 | } 108 | 109 | /// (enabledAdapters, disabledAdapters) 110 | type AdaptersTuple = (Vec<Arc<dyn FileAdapter>>, Vec<Arc<dyn FileAdapter>>); 111 | 112 | pub fn get_all_adapters(custom_adapters: Option<Vec<CustomAdapterConfig>>) -> AdaptersTuple { 113 | // order in descending priority 114 | let mut adapters: Vec<Arc<dyn FileAdapter>> = vec![]; 115 | if let Some(custom_adapters) = custom_adapters { 116 | for adapter_config in custom_adapters { 117 | adapters.push(Arc::new(adapter_config.to_adapter())); 118 | } 119 | } 120 | 121 | let internal_adapters: Vec<Arc<dyn FileAdapter>> = vec![ 122 | Arc::new(PostprocPageBreaks::default()), 123 | Arc::new(ffmpeg::FFmpegAdapter::new()), 124 | Arc::new(zip::ZipAdapter::new()), 125 | Arc::new(decompress::DecompressAdapter::new()), 126 | Arc::new(mbox::MboxAdapter::new()), 127 | Arc::new(tar::TarAdapter::new()), 128 | Arc::new(sqlite::SqliteAdapter::new()), 129 | ]; 130 | adapters.extend( 131 | BUILTIN_SPAWNING_ADAPTERS 132 | .iter() 133 | .map(|e| -> Arc<dyn FileAdapter> { Arc::new(e.to_adapter()) }), 134 | ); 135 | adapters.extend(internal_adapters); 136 | 137 | adapters 138 | .into_iter() 139 | .partition(|e| !e.metadata().disabled_by_default) 140 | } 141 | 142 | /** 143 | * filter adapters by given names: 144 | * 145 | * - "" means use default enabled adapter list 146 | * - "a,b" means use adapters a,b 147 | * - "-a,b" means use default list except for a and b 148 | * - "+a,b" means use default list but also a and b (a,b will be prepended to the list so given higher priority) 149 | */ 150 | pub fn get_adapters_filtered<T: AsRef<str>>( 151 | custom_adapters: Option<Vec<CustomAdapterConfig>>, 152 | adapter_names: &[T], 153 | ) -> Result<Vec<Arc<dyn FileAdapter>>> { 154 | let (def_enabled_adapters, def_disabled_adapters) = get_all_adapters(custom_adapters); 155 | let adapters = if !adapter_names.is_empty() { 156 | let adapters_map: HashMap<_, _> = def_enabled_adapters 157 | .iter() 158 | .chain(def_disabled_adapters.iter()) 159 | .map(|e| (e.metadata().name.clone(), e.clone())) 160 | .collect(); 161 | let mut adapters = vec![]; 162 | let mut subtractive = false; 163 | let mut additive = false; 164 | for (i, name) in adapter_names.iter().enumerate() { 165 | let mut name = name.as_ref(); 166 | if i == 0 && (name.starts_with('-')) { 167 | subtractive = true; 168 | name = &name[1..]; 169 | adapters = def_enabled_adapters.clone(); 170 | } else if i == 0 && (name.starts_with('+')) { 171 | name = &name[1..]; 172 | adapters = def_enabled_adapters.clone(); 173 | additive = true; 174 | } 175 | if subtractive { 176 | let inx = adapters 177 | .iter() 178 | .position(|a| a.metadata().name == name) 179 | .ok_or_else(|| format_err!("Could not remove adapter {}: Not in list", name))?; 180 | adapters.remove(inx); 181 | } else { 182 | let adapter = adapters_map 183 | .get(name) 184 | .ok_or_else(|| { 185 | format_err!( 186 | "Unknown adapter: \"{}\". Known adapters: {}", 187 | name, 188 | adapters_map 189 | .keys() 190 | .map(|e| e.as_ref()) 191 | .collect::<Vec<&str>>() 192 | .join(", ") 193 | ) 194 | })? 
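// (Illustrative examples of the name-filter syntax documented above, assuming
// the default adapter set: `--rga-adapters=ffmpeg,zip` enables exactly those
// two adapters; `--rga-adapters=-decompress` takes the default list and
// removes the decompress adapter; `--rga-adapters=+mail` prepends the mail
// adapter, which is disabled by default, to the default list.)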
195 | .clone(); 196 | if additive { 197 | adapters.insert(0, adapter); 198 | } else { 199 | adapters.push(adapter); 200 | } 201 | } 202 | } 203 | adapters 204 | } else { 205 | def_enabled_adapters 206 | }; 207 | debug!( 208 | "Chosen available adapters: {}", 209 | adapters 210 | .iter() 211 | .map(|a| a.metadata().name.clone()) 212 | .collect::<Vec<_>>() 213 | .join(",") 214 | ); 215 | Ok(adapters) 216 | } 217 | -------------------------------------------------------------------------------- /src/adapters/custom.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | use super::{AdaptInfo, AdapterMeta, FileAdapter, GetMetadata}; 3 | use crate::adapted_iter::one_file; 4 | 5 | use crate::{ 6 | adapted_iter::AdaptedFilesIterBox, 7 | expand::expand_str_ez, 8 | matching::{FastFileMatcher, FileMatcher}, 9 | }; 10 | use crate::{join_handle_to_stream, to_io_err}; 11 | use anyhow::Result; 12 | use async_stream::stream; 13 | use bytes::Bytes; 14 | use lazy_static::lazy_static; 15 | use log::debug; 16 | use schemars::JsonSchema; 17 | use serde::{Deserialize, Serialize}; 18 | use std::path::Path; 19 | use std::process::Stdio; 20 | use tokio::io::AsyncReadExt; 21 | use tokio::process::Child; 22 | use tokio::process::Command; 23 | 24 | use tokio_util::io::StreamReader; 25 | // mostly the same as AdapterMeta + SpawningFileAdapter 26 | #[derive(Debug, Deserialize, Serialize, JsonSchema, Default, PartialEq, Clone)] 27 | pub struct CustomAdapterConfig { 28 | /// The unique identifier and name of this adapter. 29 | /// 30 | /// Must only include a-z, 0-9, _. 31 | pub name: String, 32 | 33 | /// The description of this adapter shown in help. 34 | pub description: String, 35 | 36 | /// If true, the adapter will be disabled by default. 37 | pub disabled_by_default: Option<bool>, 38 | 39 | /// Version identifier used to key cache entries. 40 | /// 41 | /// Change this if the configuration or program changes. 42 | pub version: i32, 43 | 44 | /// The file extensions this adapter supports, for example `["epub", "mobi"]`. 45 | pub extensions: Vec<String>, 46 | 47 | /// If not null and `--rga-accurate` is enabled, mimetype matching is used instead of file name matching. 48 | pub mimetypes: Option<Vec<String>>, 49 | 50 | /// If `--rga-accurate`, only match by mime types and ignore extensions completely. 51 | pub match_only_by_mime: Option<bool>, 52 | 53 | /// The name or path of the binary to run. 54 | pub binary: String, 55 | 56 | /// The arguments to run the program with. 57 | /// Placeholders: 58 | /// - `$input_file_extension`: the file extension (without dot). e.g. foo.tar.gz -> gz 59 | /// - `$input_file_stem`: the file name without the last extension. e.g. foo.tar.gz -> foo.tar 60 | /// - `$input_virtual_path`: the full input file path. 61 | /// Note that this path may not actually exist on disk because it is the result of another adapter. 62 | /// 63 | /// stdin of the program will be connected to the input file, and stdout is assumed to be the converted file 64 | pub args: Vec<String>, 65 | 66 | /// The output path hint. 67 | /// The placeholders are the same as for `.args` 68 | /// 69 | /// If not set, defaults to `"${input_virtual_path}.txt"`. 70 | /// 71 | /// Setting this is useful if the output format is not plain text (.txt) but instead some other format that should be passed to another adapter 72 | pub output_path_hint: Option<String>, 73 | } 74 | 75 | fn strs(arr: &[&str]) -> Vec<String> { 76 | arr.iter().map(ToString::to_string).collect() 77 | } 78 | 79 | lazy_static! 
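// (Note added for clarity: the two built-in definitions below, pandoc and
// poppler, double as real examples of the `CustomAdapterConfig` format;
// user-defined adapters with the same fields can be supplied through rga's
// configuration.)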
{ 80 | pub static ref BUILTIN_SPAWNING_ADAPTERS: Vec<CustomAdapterConfig> = vec![ 81 | // from https://github.com/jgm/pandoc/blob/master/src/Text/Pandoc/App/FormatHeuristics.hs 82 | // excluding formats that could cause problems (.db ?= sqlite) or that are already text formats (e.g. xml-based) 83 | //"db" -> Just "docbook" 84 | //"adoc" -> Just "asciidoc" 85 | //"asciidoc" -> Just "asciidoc" 86 | //"context" -> Just "context" 87 | //"ctx" -> Just "context" 88 | //"dokuwiki" -> Just "dokuwiki" 89 | //"htm" -> Just "html" 90 | //"html" -> Just "html" 91 | //"json" -> Just "json" 92 | //"latex" -> Just "latex" 93 | //"lhs" -> Just "markdown+lhs" 94 | //"ltx" -> Just "latex" 95 | //"markdown" -> Just "markdown" 96 | //"md" -> Just "markdown" 97 | //"ms" -> Just "ms" 98 | //"muse" -> Just "muse" 99 | //"native" -> Just "native" 100 | //"opml" -> Just "opml" 101 | //"org" -> Just "org" 102 | //"roff" -> Just "ms" 103 | //"rst" -> Just "rst" 104 | //"s5" -> Just "s5" 105 | //"t2t" -> Just "t2t" 106 | //"tei" -> Just "tei" 107 | //"tei.xml" -> Just "tei" 108 | //"tex" -> Just "latex" 109 | //"texi" -> Just "texinfo" 110 | //"texinfo" -> Just "texinfo" 111 | //"textile" -> Just "textile" 112 | //"text" -> Just "markdown" 113 | //"txt" -> Just "markdown" 114 | //"xhtml" -> Just "html" 115 | //"wiki" -> Just "mediawiki" 116 | CustomAdapterConfig { 117 | name: "pandoc".to_string(), 118 | description: "Uses pandoc to convert binary/unreadable text documents to plain markdown-like text".to_string(), 119 | version: 3, 120 | extensions: strs(&["epub", "odt", "docx", "fb2", "ipynb", "html", "htm"]), 121 | binary: "pandoc".to_string(), 122 | mimetypes: None, 123 | // simpler markdown (with more information loss but plainer text) 124 | //.arg("--to=commonmark-header_attributes-link_attributes-fenced_divs-markdown_in_html_blocks-raw_html-native_divs-native_spans-bracketed_spans") 125 | args: strs(&[ 126 | "--from=$input_file_extension", 127 | "--to=plain", 128 | "--wrap=none", 129 | "--markdown-headings=atx" 130 | ]), 131 | disabled_by_default: None, 132 | match_only_by_mime: None, 133 | output_path_hint: None 134 | }, 135 | CustomAdapterConfig { 136 | name: "poppler".to_owned(), 137 | version: 1, 138 | description: "Uses pdftotext (from poppler-utils) to extract plain text from PDF files" 139 | .to_owned(), 140 | 141 | extensions: strs(&["pdf"]), 142 | mimetypes: Some(strs(&["application/pdf"])), 143 | 144 | binary: "pdftotext".to_string(), 145 | args: strs(&["-", "-"]), 146 | disabled_by_default: None, 147 | match_only_by_mime: None, 148 | output_path_hint: Some("${input_virtual_path}.txt.asciipagebreaks".into()) 149 | } 150 | ]; 151 | } 152 | 153 | /// replace a Command.spawn() error "File not found" with a more readable error 154 | /// to indicate some program is not installed 155 | pub fn map_exe_error(err: std::io::Error, exe_name: &str, help: &str) -> anyhow::Error { 156 | use std::io::ErrorKind::*; 157 | match err.kind() { 158 | NotFound => format_err!("Could not find executable \"{}\". {}", exe_name, help), 159 | _ => anyhow::Error::from(err), 160 | } 161 | } 162 | 163 | fn proc_wait(mut child: Child, context: impl FnOnce() -> String) -> impl AsyncRead { 164 | let s = stream! 
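// (Descriptive note added for clarity: this stream carries no real data. On a
// successful exit it yields a single empty `Bytes` chunk, so chaining it after
// the child's stdout in `pipe_output` below appends nothing to the output; on
// a nonzero exit status it yields an `std::io::Error`, so the consumer of the
// combined reader sees the subprocess failure as a read error.)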
{ 165 | let res = child.wait().await?; 166 | if res.success() { 167 | yield std::io::Result::Ok(Bytes::new()); 168 | } else { 169 | Err(format_err!("{:?}", res)).with_context(context).map_err(to_io_err)?; 170 | } 171 | }; 172 | StreamReader::new(s) 173 | } 174 | 175 | pub fn pipe_output( 176 | _line_prefix: &str, 177 | mut cmd: Command, 178 | inp: ReadBox, 179 | exe_name: &str, 180 | help: &str, 181 | ) -> Result<ReadBox> { 182 | let cmd_log = format!("{:?}", cmd); // todo: perf 183 | let mut cmd = cmd 184 | .stdin(Stdio::piped()) 185 | .stdout(Stdio::piped()) 186 | .spawn() 187 | .map_err(|e| map_exe_error(e, exe_name, help))?; 188 | let mut stdi = cmd.stdin.take().expect("is piped"); 189 | let stdo = cmd.stdout.take().expect("is piped"); 190 | 191 | let join = tokio::spawn(async move { 192 | let mut z = inp; 193 | tokio::io::copy(&mut z, &mut stdi).await?; 194 | std::io::Result::Ok(()) 195 | }); 196 | Ok(Box::pin(stdo.chain( 197 | proc_wait(cmd, move || format!("subprocess: {cmd_log}")).chain(join_handle_to_stream(join)), 198 | ))) 199 | } 200 | 201 | pub struct CustomSpawningFileAdapter { 202 | binary: String, 203 | args: Vec<String>, 204 | meta: AdapterMeta, 205 | output_path_hint: Option<String>, 206 | } 207 | impl GetMetadata for CustomSpawningFileAdapter { 208 | fn metadata(&self) -> &AdapterMeta { 209 | &self.meta 210 | } 211 | } 212 | fn arg_replacer(arg: &str, filepath_hint: &Path) -> Result<String> { 213 | expand_str_ez(arg, |s| match s { 214 | "input_virtual_path" => Ok(filepath_hint.to_string_lossy()), 215 | "input_file_stem" => Ok(filepath_hint 216 | .file_stem() 217 | .unwrap_or_default() 218 | .to_string_lossy()), 219 | "input_file_extension" => Ok(filepath_hint 220 | .extension() 221 | .unwrap_or_default() 222 | .to_string_lossy()), 223 | e => Err(anyhow::format_err!("unknown replacer ${{{e}}}")), 224 | }) 225 | } 226 | impl CustomSpawningFileAdapter { 227 | fn command( 228 | &self, 229 | filepath_hint: &std::path::Path, 230 | mut command: tokio::process::Command, 231 | ) -> Result<tokio::process::Command> { 232 | command.args( 233 | self.args 234 | .iter() 235 | .map(|arg| arg_replacer(arg, filepath_hint)) 236 | .collect::<Result<Vec<String>>>()?, 237 | ); 238 | log::debug!("running command {:?}", command); 239 | Ok(command) 240 | } 241 | } 242 | #[async_trait] 243 | impl FileAdapter for CustomSpawningFileAdapter { 244 | async fn adapt( 245 | &self, 246 | ai: AdaptInfo, 247 | _detection_reason: &FileMatcher, 248 | ) -> Result<AdaptedFilesIterBox> { 249 | let AdaptInfo { 250 | filepath_hint, 251 | inp, 252 | line_prefix, 253 | archive_recursion_depth, 254 | postprocess, 255 | config, 256 | .. 
257 | } = ai; 258 | 259 | let cmd = Command::new(&self.binary); 260 | let cmd = self 261 | .command(&filepath_hint, cmd) 262 | .with_context(|| format!("Could not set cmd arguments for {}", self.binary))?; 263 | debug!("executing {:?}", cmd); 264 | let output = pipe_output(&line_prefix, cmd, inp, &self.binary, "")?; 265 | Ok(one_file(AdaptInfo { 266 | filepath_hint: PathBuf::from(arg_replacer( 267 | self.output_path_hint 268 | .as_deref() 269 | .unwrap_or("${input_virtual_path}.txt"), 270 | &filepath_hint, 271 | )?), 272 | inp: output, 273 | line_prefix, 274 | is_real_file: false, 275 | archive_recursion_depth: archive_recursion_depth + 1, 276 | postprocess, 277 | config, 278 | })) 279 | } 280 | } 281 | impl CustomAdapterConfig { 282 | pub fn to_adapter(&self) -> CustomSpawningFileAdapter { 283 | CustomSpawningFileAdapter { 284 | binary: self.binary.clone(), 285 | args: self.args.clone(), 286 | output_path_hint: self.output_path_hint.clone(), 287 | meta: AdapterMeta { 288 | name: self.name.clone(), 289 | version: self.version, 290 | description: format!( 291 | "{}\nRuns: {} {}", 292 | self.description, 293 | self.binary, 294 | self.args.join(" ") 295 | ), 296 | recurses: true, 297 | fast_matchers: self 298 | .extensions 299 | .iter() 300 | .map(|s| FastFileMatcher::FileExtension(s.to_string())) 301 | .collect(), 302 | slow_matchers: self.mimetypes.as_ref().map(|mimetypes| { 303 | mimetypes 304 | .iter() 305 | .map(|s| FileMatcher::MimeType(s.to_string())) 306 | .collect() 307 | }), 308 | keep_fast_matchers_if_accurate: !self.match_only_by_mime.unwrap_or(false), 309 | disabled_by_default: self.disabled_by_default.unwrap_or(false), 310 | }, 311 | } 312 | } 313 | } 314 | 315 | #[cfg(test)] 316 | mod test { 317 | use super::super::FileAdapter; 318 | use super::*; 319 | use crate::preproc::loop_adapt; 320 | use crate::test_utils::*; 321 | use anyhow::Result; 322 | use pretty_assertions::assert_eq; 323 | use tokio::fs::File; 324 | 325 | #[tokio::test] 326 | async fn poppler() -> Result<()> { 327 | let adapter = poppler_adapter(); 328 | 329 | let filepath = test_data_dir().join("short.pdf"); 330 | 331 | let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?)); 332 | // let r = adapter.adapt(a, &d)?; 333 | let r = loop_adapt(&adapter, d, a).await?; 334 | let o = adapted_to_vec(r).await?; 335 | assert_eq!( 336 | String::from_utf8(o)?, 337 | "PREFIX:Page 1: hello world 338 | PREFIX:Page 1: this is just a test. 
339 | PREFIX:Page 1: 340 | PREFIX:Page 1: 1 341 | PREFIX:Page 1: 342 | PREFIX:Page 1: 343 | " 344 | ); 345 | Ok(()) 346 | } 347 | 348 | use crate::{ 349 | adapters::custom::CustomAdapterConfig, 350 | test_utils::{adapted_to_vec, simple_adapt_info}, 351 | }; 352 | use std::io::Cursor; 353 | 354 | #[tokio::test] 355 | async fn streaming() -> anyhow::Result<()> { 356 | // an adapter that converts input line by line (deadlocks if the parent process tries to write everything and only then read it) 357 | let adapter = CustomAdapterConfig { 358 | name: "simple text replacer".to_string(), 359 | description: "oo".to_string(), 360 | disabled_by_default: None, 361 | version: 1, 362 | extensions: vec!["txt".to_string()], 363 | mimetypes: None, 364 | match_only_by_mime: None, 365 | binary: "sed".to_string(), 366 | args: vec!["s/e/u/g".to_string()], 367 | output_path_hint: None, 368 | }; 369 | 370 | let adapter = adapter.to_adapter(); 371 | let input = r#" 372 | This is the story of a 373 | very strange lorry 374 | with a long dead crew 375 | and a witch with the flu 376 | "#; 377 | let input = format!("{input}{input}{input}{input}"); 378 | let input = format!("{input}{input}{input}{input}"); 379 | let input = format!("{input}{input}{input}{input}"); 380 | let input = format!("{input}{input}{input}{input}"); 381 | let input = format!("{input}{input}{input}{input}"); 382 | let input = format!("{input}{input}{input}{input}"); 383 | let (a, d) = simple_adapt_info( 384 | Path::new("foo.txt"), 385 | Box::pin(Cursor::new(Vec::from(input))), 386 | ); 387 | let output = adapter.adapt(a, &d).await.unwrap(); 388 | 389 | let oup = adapted_to_vec(output).await?; 390 | println!("output: {}", String::from_utf8_lossy(&oup)); 391 | Ok(()) 392 | } 393 | } 394 | -------------------------------------------------------------------------------- /src/adapters/decompress.rs: -------------------------------------------------------------------------------- 1 | use crate::adapted_iter::one_file; 2 | 3 | use super::*; 4 | 5 | use anyhow::Result; 6 | use lazy_static::lazy_static; 7 | use tokio::io::BufReader; 8 | 9 | use std::path::{Path, PathBuf}; 10 | 11 | static EXTENSIONS: &[&str] = &["als", "bz2", "gz", "tbz", "tbz2", "tgz", "xz", "zst"]; 12 | static MIME_TYPES: &[&str] = &[ 13 | "application/gzip", 14 | "application/x-bzip", 15 | "application/x-xz", 16 | "application/zstd", 17 | ]; 18 | lazy_static! { 19 | static ref METADATA: AdapterMeta = AdapterMeta { 20 | name: "decompress".to_owned(), 21 | version: 1, 22 | description: 23 | "Reads compressed file as a stream and runs a different extractor on the contents." 
24 | .to_owned(), 25 | recurses: true, 26 | fast_matchers: EXTENSIONS 27 | .iter() 28 | .map(|s| FastFileMatcher::FileExtension(s.to_string())) 29 | .collect(), 30 | slow_matchers: Some( 31 | MIME_TYPES 32 | .iter() 33 | .map(|s| FileMatcher::MimeType(s.to_string())) 34 | .collect() 35 | ), 36 | disabled_by_default: false, 37 | keep_fast_matchers_if_accurate: true 38 | }; 39 | } 40 | #[derive(Default)] 41 | pub struct DecompressAdapter; 42 | 43 | impl DecompressAdapter { 44 | pub fn new() -> Self { 45 | Self 46 | } 47 | } 48 | impl GetMetadata for DecompressAdapter { 49 | fn metadata(&self) -> &AdapterMeta { 50 | &METADATA 51 | } 52 | } 53 | 54 | fn decompress_any(reason: &FileMatcher, inp: ReadBox) -> Result<ReadBox> { 55 | use FastFileMatcher::*; 56 | use FileMatcher::*; 57 | use async_compression::tokio::bufread; 58 | let gz = |inp: ReadBox| Box::pin(bufread::GzipDecoder::new(BufReader::new(inp))); 59 | let bz2 = |inp: ReadBox| Box::pin(bufread::BzDecoder::new(BufReader::new(inp))); 60 | let xz = |inp: ReadBox| Box::pin(bufread::XzDecoder::new(BufReader::new(inp))); 61 | let zst = |inp: ReadBox| Box::pin(bufread::ZstdDecoder::new(BufReader::new(inp))); 62 | 63 | Ok(match reason { 64 | Fast(FileExtension(ext)) => match ext.as_ref() { 65 | "als" | "gz" | "tgz" => gz(inp), 66 | "bz2" | "tbz" | "tbz2" => bz2(inp), 67 | "zst" => zst(inp), 68 | "xz" => xz(inp), 69 | ext => Err(format_err!("don't know how to decompress {}", ext))?, 70 | }, 71 | MimeType(mime) => match mime.as_ref() { 72 | "application/gzip" => gz(inp), 73 | "application/x-bzip" => bz2(inp), 74 | "application/x-xz" => xz(inp), 75 | "application/zstd" => zst(inp), 76 | mime => Err(format_err!("don't know how to decompress mime {}", mime))?, 77 | }, 78 | }) 79 | } 80 | fn get_inner_filename(filename: &Path) -> PathBuf { 81 | let extension = filename 82 | .extension() 83 | .map(|e| e.to_string_lossy()) 84 | .unwrap_or(Cow::Borrowed("")); 85 | let stem = filename 86 | .file_stem() 87 | .expect("no filename given?") 88 | .to_string_lossy(); 89 | let new_extension = match extension.as_ref() { 90 | "tgz" | "tbz" | "tbz2" => ".tar", 91 | _other => "", 92 | }; 93 | filename.with_file_name(format!("{}{}", stem, new_extension)) 94 | } 95 | 96 | #[async_trait] 97 | impl FileAdapter for DecompressAdapter { 98 | async fn adapt( 99 | &self, 100 | ai: AdaptInfo, 101 | detection_reason: &FileMatcher, 102 | ) -> Result<AdaptedFilesIterBox> { 103 | Ok(one_file(AdaptInfo { 104 | filepath_hint: get_inner_filename(&ai.filepath_hint), 105 | is_real_file: false, 106 | archive_recursion_depth: ai.archive_recursion_depth + 1, 107 | inp: decompress_any(detection_reason, ai.inp)?, 108 | line_prefix: ai.line_prefix, 109 | config: ai.config.clone(), 110 | postprocess: ai.postprocess, 111 | })) 112 | } 113 | } 114 | 115 | #[cfg(test)] 116 | mod tests { 117 | use super::*; 118 | use crate::preproc::loop_adapt; 119 | use crate::test_utils::*; 120 | use pretty_assertions::assert_eq; 121 | use tokio::fs::File; 122 | 123 | #[test] 124 | fn test_inner_filename() { 125 | for (a, b) in &[ 126 | ("hi/test.tgz", "hi/test.tar"), 127 | ("hi/hello.gz", "hi/hello"), 128 | ("a/b/initramfs", "a/b/initramfs"), 129 | ("hi/test.tbz2", "hi/test.tar"), 130 | ("hi/test.tbz", "hi/test.tar"), 131 | ("hi/test.hi.bz2", "hi/test.hi"), 132 | ("hello.tar.gz", "hello.tar"), 133 | ] { 134 | assert_eq!(get_inner_filename(&PathBuf::from(a)), PathBuf::from(*b)); 135 | } 136 | } 137 | 138 | #[tokio::test] 139 | async fn gz() -> Result<()> { 140 | let adapter = DecompressAdapter; 141 | 142 | let filepath = 
test_data_dir().join("hello.gz"); 143 | 144 | let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?)); 145 | let r = adapter.adapt(a, &d).await?; 146 | let o = adapted_to_vec(r).await?; 147 | assert_eq!(String::from_utf8(o)?, "hello\n"); 148 | Ok(()) 149 | } 150 | 151 | #[tokio::test] 152 | async fn pdf_gz() -> Result<()> { 153 | let adapter = DecompressAdapter; 154 | 155 | let filepath = test_data_dir().join("short.pdf.gz"); 156 | 157 | let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?)); 158 | let r = loop_adapt(&adapter, d, a).await?; 159 | let o = adapted_to_vec(r).await?; 160 | assert_eq!( 161 | String::from_utf8(o)?, 162 | "PREFIX:Page 1: hello world 163 | PREFIX:Page 1: this is just a test. 164 | PREFIX:Page 1: 165 | PREFIX:Page 1: 1 166 | PREFIX:Page 1: 167 | PREFIX:Page 1: 168 | " 169 | ); 170 | Ok(()) 171 | } 172 | } 173 | -------------------------------------------------------------------------------- /src/adapters/ffmpeg.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | use super::{custom::map_exe_error, writing::async_writeln}; 3 | use anyhow::*; 4 | use async_trait::async_trait; 5 | use lazy_static::lazy_static; 6 | use regex::Regex; 7 | use serde::{Deserialize, Serialize}; 8 | use std::process::Stdio; 9 | use tokio::io::AsyncWrite; 10 | use tokio::io::{AsyncBufReadExt, BufReader}; 11 | use tokio::process::Command; 12 | use writing::WritingFileAdapter; 13 | // todo: 14 | // maybe todo: read list of extensions from 15 | // ffmpeg -demuxers | tail -n+5 | awk '{print $2}' | while read demuxer; do echo MUX=$demuxer; ffmpeg -h demuxer=$demuxer | grep 'Common extensions'; done 2>/dev/null 16 | // but really, the probability of getting useful information from a .flv is low 17 | static EXTENSIONS: &[&str] = &["mkv", "mp4", "avi", "mp3", "ogg", "flac", "webm"]; 18 | 19 | lazy_static! { 20 | static ref METADATA: AdapterMeta = AdapterMeta { 21 | name: "ffmpeg".to_owned(), 22 | version: 1, 23 | description: 24 | "Uses ffmpeg to extract video metadata/chapters, subtitles, lyrics, and other metadata" 25 | .to_owned(), 26 | recurses: false, 27 | fast_matchers: EXTENSIONS 28 | .iter() 29 | .map(|s| FastFileMatcher::FileExtension(s.to_string())) 30 | .collect(), 31 | slow_matchers: None, 32 | disabled_by_default: false, 33 | keep_fast_matchers_if_accurate: true 34 | }; 35 | } 36 | 37 | #[derive(Default, Clone)] 38 | pub struct FFmpegAdapter; 39 | 40 | impl FFmpegAdapter { 41 | pub fn new() -> Self { 42 | Self 43 | } 44 | } 45 | impl GetMetadata for FFmpegAdapter { 46 | fn metadata(&self) -> &AdapterMeta { 47 | &METADATA 48 | } 49 | } 50 | 51 | #[derive(Serialize, Deserialize)] 52 | struct FFprobeOutput { 53 | streams: Vec<FFprobeStream>, 54 | } 55 | #[derive(Serialize, Deserialize)] 56 | struct FFprobeStream { 57 | index: i32, // stream index 58 | } 59 | 60 | #[async_trait] 61 | impl WritingFileAdapter for FFmpegAdapter { 62 | async fn adapt_write( 63 | ai: AdaptInfo, 64 | _detection_reason: &FileMatcher, 65 | mut oup: Pin<Box<dyn AsyncWrite + Send>>, 66 | ) -> Result<()> { 67 | let AdaptInfo { 68 | is_real_file, 69 | filepath_hint, 70 | line_prefix, 71 | .. 72 | } = ai; 73 | if !is_real_file { 74 | // we *could* probably adapt this to also work based on streams, 75 | // it would require using a BufReader to read at least part of the file to memory 76 | // but really when would you want to search for videos within archives? 
77 | // So instead, we only run this adapter if the file is an actual file on disk for now 78 | async_writeln!(oup, "{line_prefix}[rga: skipping video in archive]\n")?; 79 | return Ok(()); 80 | } 81 | let inp_fname = filepath_hint; 82 | let spawn_fail = |e| map_exe_error(e, "ffprobe", "Make sure you have ffmpeg installed."); 83 | let subtitle_streams = { 84 | let probe = Command::new("ffprobe") 85 | .args(vec![ 86 | "-v", 87 | "error", // show all errors 88 | "-select_streams", 89 | "s", // show only subtitle streams 90 | "-of", 91 | "json", // use json as output format 92 | "-show_entries", 93 | "stream=index", // show index of subtitle streams 94 | ]) 95 | .arg("-i") 96 | .arg(&inp_fname) 97 | .output() 98 | .await 99 | .map_err(spawn_fail)?; 100 | if !probe.status.success() { 101 | return Err(format_err!( 102 | "ffprobe failed: {:?}\n{}", 103 | probe.status, 104 | String::from_utf8_lossy(&probe.stderr) 105 | )); 106 | } 107 | let p: FFprobeOutput = serde_json::from_slice(&probe.stdout)?; 108 | p.streams 109 | }; 110 | { 111 | // extract file metadata (especially chapter names in a greppable format) 112 | let mut probe = Command::new("ffprobe") 113 | .args(vec![ 114 | "-v", 115 | "error", 116 | "-show_format", 117 | "-show_streams", 118 | "-of", 119 | "flat", 120 | // "-show_data", 121 | "-show_error", 122 | "-show_programs", 123 | "-show_chapters", 124 | // "-count_frames", 125 | //"-count_packets", 126 | ]) 127 | .arg("-i") 128 | .arg(&inp_fname) 129 | .stdout(Stdio::piped()) 130 | .spawn()?; 131 | let mut lines = BufReader::new(probe.stdout.as_mut().unwrap()).lines(); 132 | while let Some(line) = lines.next_line().await? { 133 | let line = line.replace("\\r\\n", "\n").replace("\\n", "\n"); // just unescape newlines 134 | async_writeln!(oup, "metadata: {line}")?; 135 | } 136 | let exit = probe.wait().await?; 137 | if !exit.success() { 138 | return Err(format_err!("ffprobe failed: {:?}", exit)); 139 | } 140 | } 141 | if !subtitle_streams.is_empty() { 142 | let time_re = Regex::new(r".*\d.*-->.*\d.*").unwrap(); 143 | for probe_stream in subtitle_streams.iter() { 144 | // extract subtitles 145 | let mut cmd = Command::new("ffmpeg"); 146 | cmd.arg("-hide_banner") 147 | .arg("-loglevel") 148 | .arg("panic") 149 | .arg("-i") 150 | .arg(&inp_fname) 151 | .arg("-map") 152 | .arg(format!("0:{}", probe_stream.index)) // 0 for first input 153 | .arg("-f") 154 | .arg("webvtt") 155 | .arg("-"); 156 | let mut cmd = cmd.stdout(Stdio::piped()).spawn().map_err(spawn_fail)?; 157 | let stdo = cmd.stdout.as_mut().expect("is piped"); 158 | let mut time: String = "".to_owned(); 159 | // rewrite subtitle times so they are shown as a prefix in every line 160 | let mut lines = BufReader::new(stdo).lines(); 161 | while let Some(line) = lines.next_line().await? 
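// (Worked example added for clarity: given the WebVTT cue timing line
// "09:55.195 --> 09:56.730" followed by the subtitle text "Hello", the loop
// below remembers the timing line and emits "09:55.195 --> 09:56.730: Hello",
// so every subtitle line is prefixed with the time at which it is shown.)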
{ 162 | // 09:55.195 --> 09:56.730 163 | if time_re.is_match(&line) { 164 | time = line.to_owned(); 165 | } else if line.is_empty() { 166 | async_writeln!(oup)?; 167 | } else { 168 | async_writeln!(oup, "{time}: {line}")?; 169 | } 170 | } 171 | } 172 | } 173 | Ok(()) 174 | } 175 | } 176 | -------------------------------------------------------------------------------- /src/adapters/mbox.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | use anyhow::Result; 4 | use async_stream::stream; 5 | use lazy_static::lazy_static; 6 | use mime2ext::mime2ext; 7 | use regex::bytes::Regex; 8 | use tokio::io::AsyncReadExt; 9 | 10 | use std::{collections::VecDeque, io::Cursor}; 11 | 12 | static EXTENSIONS: &[&str] = &["mbox", "mbx", "eml"]; 13 | static MIME_TYPES: &[&str] = &["application/mbox", "message/rfc822"]; 14 | lazy_static! { 15 | static ref METADATA: AdapterMeta = AdapterMeta { 16 | name: "mail".to_owned(), 17 | version: 1, 18 | description: 19 | "Reads mailbox/mail files and runs extractors on the contents and attachments." 20 | .to_owned(), 21 | recurses: true, 22 | fast_matchers: EXTENSIONS 23 | .iter() 24 | .map(|s| FastFileMatcher::FileExtension(s.to_string())) 25 | .collect(), 26 | slow_matchers: Some( 27 | MIME_TYPES 28 | .iter() 29 | .map(|s| FileMatcher::MimeType(s.to_string())) 30 | .collect() 31 | ), 32 | disabled_by_default: true, 33 | keep_fast_matchers_if_accurate: true 34 | }; 35 | static ref FROM_REGEX: Regex = Regex::new("\r?\nFrom [^\n]+\n").unwrap(); 36 | } 37 | #[derive(Default)] 38 | pub struct MboxAdapter; 39 | 40 | impl MboxAdapter { 41 | pub fn new() -> Self { 42 | Self 43 | } 44 | } 45 | impl GetMetadata for MboxAdapter { 46 | fn metadata(&self) -> &AdapterMeta { 47 | &METADATA 48 | } 49 | } 50 | 51 | #[async_trait] 52 | impl FileAdapter for MboxAdapter { 53 | async fn adapt( 54 | &self, 55 | ai: AdaptInfo, 56 | _detection_reason: &FileMatcher, 57 | ) -> Result<AdaptedFilesIterBox> { 58 | let AdaptInfo { 59 | filepath_hint, 60 | mut inp, 61 | line_prefix, 62 | archive_recursion_depth, 63 | config, 64 | postprocess, 65 | .. 66 | } = ai; 67 | 68 | let mut content = Vec::new(); 69 | let s = stream! 
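// (Descriptive note added for clarity: an mbox file is a concatenation of
// messages, each introduced by a separator line beginning with "From ";
// FROM_REGEX above splits the buffered input on those separators and each
// chunk is then parsed with `mailparse`. Mail clients conventionally quote
// body lines that start with "From " as ">From" (see
// exampledir/test/test.mbx), so such lines do not match the separator.)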
{ 70 | inp.read_to_end(&mut content).await?; 71 | 72 | let mut ais = vec![]; 73 | for mail_bytes in FROM_REGEX.splitn(&content, usize::MAX) { 74 | let mail_content = mail_bytes.splitn(2, |x| *x == b'\n').nth(1).unwrap(); 75 | let mail = mailparse::parse_mail(mail_content); 76 | if mail.is_err() { 77 | continue; 78 | } 79 | let mail = mail.unwrap(); 80 | 81 | let mut todos = VecDeque::new(); 82 | todos.push_back(mail); 83 | 84 | while let Some(mail) = todos.pop_front() { 85 | let mut path = filepath_hint.clone(); 86 | let filename = mail.get_content_disposition().params.get("filename").cloned(); 87 | match &*mail.ctype.mimetype { 88 | x if x.starts_with("multipart/") => { 89 | todos.extend(mail.subparts); 90 | continue; 91 | } 92 | mime => { 93 | if let Some(name) = filename { 94 | path.push(name); 95 | } else if let Some(extension) = mime2ext(mime) { 96 | path.push(format!("data.{extension}")); 97 | } else { 98 | path.push("data"); 99 | } 100 | } 101 | } 102 | 103 | let mut config = config.clone(); 104 | config.accurate = true; 105 | 106 | let raw_body = mail.get_body_raw(); 107 | if raw_body.is_err() { 108 | continue; 109 | } 110 | let ai2: AdaptInfo = AdaptInfo { 111 | filepath_hint: path, 112 | is_real_file: false, 113 | archive_recursion_depth: archive_recursion_depth + 1, 114 | inp: Box::pin(Cursor::new(raw_body.unwrap())), 115 | line_prefix: line_prefix.to_string(), 116 | config, 117 | postprocess, 118 | }; 119 | ais.push(ai2); 120 | } 121 | } 122 | for a in ais { 123 | yield(Ok(a)); 124 | } 125 | }; 126 | Ok(Box::pin(s)) 127 | } 128 | } 129 | 130 | #[cfg(test)] 131 | mod tests { 132 | use super::*; 133 | use crate::preproc::loop_adapt; 134 | use crate::test_utils::*; 135 | use pretty_assertions::assert_eq; 136 | use tokio::fs::File; 137 | use tokio_stream::StreamExt; 138 | 139 | #[tokio::test] 140 | async fn mail_simple() -> Result<()> { 141 | let adapter = MboxAdapter; 142 | 143 | let filepath = test_data_dir().join("github_email.eml"); 144 | 145 | let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?)); 146 | let mut r = adapter.adapt(a, &d).await?; 147 | let mut count = 0; 148 | while let Some(file) = r.next().await { 149 | let mut file = file?; 150 | let mut buf = Vec::new(); 151 | file.inp.read_to_end(&mut buf).await?; 152 | match file 153 | .filepath_hint 154 | .components() 155 | .last() 156 | .unwrap() 157 | .as_os_str() 158 | .to_str() 159 | .unwrap() 160 | { 161 | "data.txt" | "data.html" => { 162 | assert!(String::from_utf8(buf)?.contains("Thank you for your contribution")); 163 | } 164 | x => panic!("unexpected filename {x:?}"), 165 | } 166 | count += 1; 167 | } 168 | assert_eq!(2, count); 169 | Ok(()) 170 | } 171 | 172 | #[tokio::test] 173 | async fn mbox_simple() -> Result<()> { 174 | let adapter = MboxAdapter; 175 | 176 | let filepath = test_data_dir().join("test.mbx"); 177 | 178 | let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?)); 179 | let mut r = adapter.adapt(a, &d).await?; 180 | let mut count = 0; 181 | while let Some(file) = r.next().await { 182 | let mut file = file?; 183 | assert_eq!( 184 | "data.html", 185 | file.filepath_hint.components().last().unwrap().as_os_str() 186 | ); 187 | let mut buf = Vec::new(); 188 | file.inp.read_to_end(&mut buf).await?; 189 | assert_eq!( 190 | "\r\n \r\n \r\n \r\n \r\n

>From

\r\n

Another word >From
\r\n

\r\n \r\n", 191 | String::from_utf8(buf)?.trim() 192 | ); 193 | count += 1; 194 | } 195 | assert_eq!(3, count); 196 | Ok(()) 197 | } 198 | 199 | #[tokio::test] 200 | async fn mbox_attachment() -> Result<()> { 201 | init_logging(); 202 | 203 | let adapter = MboxAdapter; 204 | 205 | let filepath = test_data_dir().join("mail_with_attachment.mbox"); 206 | 207 | let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?)); 208 | let mut r = loop_adapt(&adapter, d, a).await?; 209 | let mut count = 0; 210 | while let Some(file) = r.next().await { 211 | let mut file = file?; 212 | let path = file 213 | .filepath_hint 214 | .components() 215 | .last() 216 | .unwrap() 217 | .as_os_str() 218 | .to_str() 219 | .unwrap(); 220 | let mut buf = Vec::new(); 221 | file.inp.read_to_end(&mut buf).await?; 222 | match path { 223 | "data.html.txt" => { 224 | assert_eq!( 225 | "PREFIX:regular text\nPREFIX:\n", 226 | String::from_utf8(buf).unwrap_or("err".to_owned()) 227 | ); 228 | } 229 | "short.pdf.txt" => { 230 | assert_eq!( 231 | "PREFIX:Page 1: hello world\nPREFIX:Page 1: this is just a test.\nPREFIX:Page 1: \nPREFIX:Page 1: 1\nPREFIX:Page 1: \nPREFIX:Page 1: \n", 232 | String::from_utf8(buf).unwrap_or("err".to_owned()) 233 | ); 234 | } 235 | _ => { 236 | panic!("unrelated {path:?}"); 237 | } 238 | } 239 | count += 1; 240 | } 241 | assert_eq!(2, count); // one message + one attachment 242 | Ok(()) 243 | } 244 | } 245 | -------------------------------------------------------------------------------- /src/adapters/postproc.rs: -------------------------------------------------------------------------------- 1 | //trait RunFnAdapter: GetMetadata {} 2 | 3 | //impl FileAdapter for T where T: RunFnAdapter {} 4 | 5 | use anyhow::Result; 6 | use async_stream::stream; 7 | use async_trait::async_trait; 8 | use bytes::Bytes; 9 | use encoding_rs::Encoding; 10 | use encoding_rs_io::DecodeReaderBytesBuilder; 11 | use tokio_util::io::SyncIoBridge; 12 | 13 | use std::io::Cursor; 14 | use std::path::PathBuf; 15 | use std::pin::Pin; 16 | use tokio::io::{AsyncRead, AsyncReadExt}; 17 | use tokio_util::io::ReaderStream; 18 | use tokio_util::io::StreamReader; 19 | 20 | use crate::adapted_iter::AdaptedFilesIterBox; 21 | use crate::adapted_iter::one_file; 22 | use crate::matching::FastFileMatcher; 23 | 24 | use super::{AdaptInfo, AdapterMeta, FileAdapter, GetMetadata}; 25 | 26 | fn add_newline(ar: impl AsyncRead + Send) -> impl AsyncRead + Send { 27 | ar.chain(Cursor::new(b"\n")) 28 | } 29 | 30 | pub struct PostprocPrefix {} 31 | impl GetMetadata for PostprocPrefix { 32 | fn metadata(&self) -> &super::AdapterMeta { 33 | lazy_static::lazy_static! { 34 | static ref METADATA: AdapterMeta = AdapterMeta { 35 | name: "postprocprefix".to_owned(), 36 | version: 1, 37 | description: "Adds the line prefix to each line (e.g. 
the filename within a zip)".to_owned(), 38 | recurses: false, 39 | fast_matchers: vec![], 40 | slow_matchers: None, 41 | keep_fast_matchers_if_accurate: false, 42 | disabled_by_default: false 43 | }; 44 | } 45 | &METADATA 46 | } 47 | } 48 | #[async_trait] 49 | impl FileAdapter for PostprocPrefix { 50 | async fn adapt( 51 | &self, 52 | a: super::AdaptInfo, 53 | _detection_reason: &crate::matching::FileMatcher, 54 | ) -> Result<AdaptedFilesIterBox> { 55 | let read = add_newline(postproc_prefix( 56 | &a.line_prefix, 57 | postproc_encoding(&a.line_prefix, a.inp).await?, 58 | )); 59 | // keep adapt info (filename etc) except replace inp 60 | let ai = AdaptInfo { 61 | inp: Box::pin(read), 62 | postprocess: false, 63 | ..a 64 | }; 65 | Ok(one_file(ai)) 66 | } 67 | } 68 | 69 | /*struct ReadErr { 70 | err: Fn() -> std::io::Error, 71 | } 72 | impl Read for ReadErr { 73 | fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> { 74 | Err(self.err()) 75 | } 76 | }*/ 77 | 78 | /** 79 | * Detects and converts encodings other than utf-8 to utf-8. 80 | * If the input stream does not contain valid text, returns the string `[rga: binary data]` instead 81 | */ 82 | async fn postproc_encoding( 83 | _line_prefix: &str, 84 | inp: Pin<Box<dyn AsyncRead + Send>>, 85 | ) -> Result<Pin<Box<dyn AsyncRead + Send>>> { 86 | // check for binary content in first 8kB 87 | // read the first 8kB into a buffer, check for null bytes, then return the buffer concatenated with the rest of the file 88 | let mut fourk = Vec::with_capacity(1 << 13); 89 | let mut beginning = inp.take(1 << 13); 90 | 91 | beginning.read_to_end(&mut fourk).await?; 92 | let has_binary = fourk.contains(&0u8); 93 | 94 | let enc = Encoding::for_bom(&fourk); 95 | let inp = Cursor::new(fourk).chain(beginning.into_inner()); 96 | match enc { 97 | Some((enc, _)) if enc != encoding_rs::UTF_8 => { 98 | // detected UTF16LE or UTF16BE, convert to UTF8 in separate thread 99 | // TODO: parse these options from ripgrep's configuration 100 | let encoding = None; // detect bom but usually assume utf8 101 | let bom_sniffing = true; 102 | let mut decode_builder = DecodeReaderBytesBuilder::new(); 103 | // https://github.com/BurntSushi/ripgrep/blob/a7d26c8f144a4957b75f71087a66692d0b25759a/grep-searcher/src/searcher/mod.rs#L706 104 | // this detects utf-16 BOMs and transcodes to utf-8 if they are present 105 | // it does not detect any other char encodings. that would require https://github.com/hsivonen/chardetng or similar but then binary detection is hard (?) 106 | let mut inp = decode_builder 107 | .encoding(encoding) 108 | .utf8_passthru(true) 109 | .strip_bom(bom_sniffing) 110 | .bom_override(true) 111 | .bom_sniffing(bom_sniffing) 112 | .build(SyncIoBridge::new(inp)); 113 | let oup = tokio::task::spawn_blocking(move || -> Result<Vec<u8>> { 114 | let mut oup = Vec::new(); 115 | std::io::Read::read_to_end(&mut inp, &mut oup)?; 116 | Ok(oup) 117 | }) 118 | .await??; 119 | Ok(Box::pin(Cursor::new(oup))) 120 | } 121 | _ => { 122 | if has_binary { 123 | log::debug!("detected binary"); 124 | return Ok(Box::pin(Cursor::new("[rga: binary data]"))); 125 | } 126 | Ok(Box::pin(inp)) 127 | } 128 | } 129 | } 130 | 131 | /// Adds the given prefix to each line in an `AsyncRead`. 
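/// A minimal usage sketch (mirroring the `test_postproc_prefix` unit test
/// below; shown as `ignore` since it needs an async context to run):
///
/// ```ignore
/// use tokio::io::AsyncReadExt;
/// let reader = postproc_prefix("prefix: ", std::io::Cursor::new(b"Hello\nWorld" as &[u8]));
/// tokio::pin!(reader);
/// let mut out = Vec::new();
/// reader.read_to_end(&mut out).await.unwrap();
/// assert_eq!(out, b"prefix: Hello\nprefix: World");
/// ```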
132 | pub fn postproc_prefix<T: AsyncRead + Send>( 133 | line_prefix: &str, 134 | inp: T, 135 | ) -> impl AsyncRead + Send + use<T> { 136 | let line_prefix_n = format!("\n{line_prefix}"); // clone since we need it later 137 | let line_prefix_o = Bytes::copy_from_slice(line_prefix.as_bytes()); 138 | let regex = regex::bytes::Regex::new("\n").unwrap(); 139 | let inp_stream = ReaderStream::new(inp); 140 | let oup_stream = stream! { 141 | yield Ok(line_prefix_o); 142 | for await chunk in inp_stream { 143 | match chunk { 144 | Err(e) => yield Err(e), 145 | Ok(chunk) => { 146 | if chunk.contains(&b'\n') { 147 | yield Ok(Bytes::copy_from_slice(&regex.replace_all(&chunk, line_prefix_n.as_bytes()))); 148 | } else { 149 | yield Ok(chunk); 150 | } 151 | } 152 | } 153 | } 154 | }; 155 | Box::pin(StreamReader::new(oup_stream)) 156 | } 157 | 158 | #[derive(Default)] 159 | pub struct PostprocPageBreaks {} 160 | 161 | impl GetMetadata for PostprocPageBreaks { 162 | fn metadata(&self) -> &super::AdapterMeta { 163 | lazy_static::lazy_static! { 164 | static ref METADATA: AdapterMeta = AdapterMeta { 165 | name: "postprocpagebreaks".to_owned(), 166 | version: 1, 167 | description: "Adds the page number to each line for an input file that specifies page breaks as ascii page break character.\nMainly to be used internally by the poppler adapter.".to_owned(), 168 | recurses: false, 169 | fast_matchers: vec![FastFileMatcher::FileExtension("asciipagebreaks".to_string())], 170 | slow_matchers: None, 171 | keep_fast_matchers_if_accurate: false, 172 | disabled_by_default: false 173 | }; 174 | } 175 | &METADATA 176 | } 177 | } 178 | #[async_trait] 179 | impl FileAdapter for PostprocPageBreaks { 180 | async fn adapt( 181 | &self, 182 | a: super::AdaptInfo, 183 | _detection_reason: &crate::matching::FileMatcher, 184 | ) -> Result<AdaptedFilesIterBox> { 185 | let read = postproc_pagebreaks(postproc_encoding(&a.line_prefix, a.inp).await?); 186 | // keep adapt info (filename etc) except replace inp 187 | let ai = AdaptInfo { 188 | inp: Box::pin(read), 189 | archive_recursion_depth: a.archive_recursion_depth + 1, 190 | filepath_hint: a 191 | .filepath_hint 192 | .parent() 193 | .map(PathBuf::from) 194 | .unwrap_or_default() 195 | .join(a.filepath_hint.file_stem().unwrap_or_default()), 196 | ..a 197 | }; 198 | Ok(one_file(ai)) 199 | } 200 | } 201 | /// Adds the prefix "Page N: " to each line, 202 | /// where N starts at one and is incremented for each ASCII Form Feed character in the input stream. 203 | /// ASCII form feeds are the page delimiters output by `pdftotext`. 204 | pub fn postproc_pagebreaks(input: impl AsyncRead + Send) -> impl AsyncRead + Send { 205 | let regex_linefeed = regex::bytes::Regex::new(r"\x0c").unwrap(); 206 | let regex_newline = regex::bytes::Regex::new("\n").unwrap(); 207 | let mut page_count: i32 = 1; 208 | let mut page_prefix: String = format!("\nPage {page_count}: "); 209 | 210 | let input_stream = ReaderStream::new(input); 211 | let output_stream = stream! 
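// (Worked example added for clarity, taken from the unit tests below: the
// input "Hello\nWorld\x0cFoo Bar\n\x0cTest\x0c" becomes
// "Page 1: Hello\nPage 1: World\nPage 2: Foo Bar\nPage 2: \nPage 3: Test".
// Each \x0c form feed bumps the page counter, and the trailing form feed
// produces no "Page 4: " prefix because prefixes are held in `pending` until
// more text actually follows.)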
{ 212 | yield std::io::Result::Ok(Bytes::copy_from_slice(format!("Page {page_count}: ").as_bytes())); 213 | // store Page X: line prefixes in pending and only write it to the output when there is more text to be written 214 | // this is needed since pdftotext outputs a \x0c at the end of the last page 215 | let mut pending: Option<Bytes> = None; 216 | 217 | for await read_chunk in input_stream { 218 | let read_chunk = read_chunk?; 219 | let page_chunks = regex_linefeed.split(&read_chunk); 220 | for (chunk_idx, page_chunk) in page_chunks.enumerate() { 221 | if chunk_idx != 0 { 222 | page_count += 1; 223 | page_prefix = format!("\nPage {page_count}: "); 224 | if let Some(p) = pending.take() { 225 | yield Ok(p); 226 | } 227 | pending = Some(Bytes::copy_from_slice(page_prefix.as_bytes())); 228 | } 229 | if !page_chunk.is_empty() { 230 | if let Some(p) = pending.take() { 231 | yield Ok(p); 232 | } 233 | yield Ok(Bytes::copy_from_slice(&regex_newline.replace_all(page_chunk, page_prefix.as_bytes()))); 234 | } 235 | 236 | } 237 | } 238 | 239 | 240 | }; 241 | Box::pin(StreamReader::new(output_stream)) 242 | } 243 | 244 | #[cfg(test)] 245 | mod tests { 246 | use crate::preproc::loop_adapt; 247 | use crate::test_utils::*; 248 | 249 | use super::*; 250 | use anyhow::Result; 251 | use pretty_assertions::assert_eq; 252 | use tokio::fs::File; 253 | use tokio::pin; 254 | use tokio_test::io::Builder; 255 | use tokio_test::io::Mock; 256 | 257 | #[tokio::test] 258 | async fn test_with_pagebreaks() { 259 | let mut output: Vec<u8> = Vec::new(); 260 | let mock: Mock = Builder::new() 261 | .read(b"Hello\nWorld\x0cFoo Bar\n\x0cTest\x0c") 262 | .build(); 263 | let res = postproc_pagebreaks(mock).read_to_end(&mut output).await; 264 | println!("{}", String::from_utf8_lossy(&output)); 265 | assert!(res.is_ok()); 266 | assert_eq!( 267 | String::from_utf8_lossy(&output), 268 | "Page 1: Hello\nPage 1: World\nPage 2: Foo Bar\nPage 2: \nPage 3: Test" 269 | ); 270 | } 271 | 272 | #[tokio::test] 273 | async fn test_with_pagebreaks_chunks() { 274 | let mut output: Vec<u8> = Vec::new(); 275 | let mock: Mock = Builder::new() 276 | .read(b"Hello\nWo") 277 | .read(b"rld\x0c") 278 | .read(b"Foo Bar\n") 279 | .read(b"\x0cTest\x0c") 280 | .build(); 281 | let res = postproc_pagebreaks(mock).read_to_end(&mut output).await; 282 | println!("{}", String::from_utf8_lossy(&output)); 283 | assert!(res.is_ok()); 284 | assert_eq!( 285 | String::from_utf8_lossy(&output), 286 | "Page 1: Hello\nPage 1: World\nPage 2: Foo Bar\nPage 2: \nPage 3: Test" 287 | ); 288 | } 289 | 290 | #[tokio::test] 291 | async fn test_pdf_twoblank() -> Result<()> { 292 | let adapter = poppler_adapter(); 293 | let fname = test_data_dir().join("twoblankpages.pdf"); 294 | let rd = File::open(&fname).await?; 295 | let (a, d) = simple_adapt_info(&fname, Box::pin(rd)); 296 | let res = loop_adapt(&adapter, d, a).await?; 297 | 298 | let buf = adapted_to_vec(res).await?; 299 | 300 | assert_eq!( 301 | String::from_utf8(buf)?, 302 | "PREFIX:Page 1: 303 | PREFIX:Page 2: 304 | PREFIX:Page 3: HelloWorld 305 | PREFIX:Page 3: 306 | PREFIX:Page 3: 307 | ", 308 | ); 309 | 310 | Ok(()) 311 | } 312 | 313 | #[tokio::test] 314 | async fn test_postproc_prefix() { 315 | let mut output: Vec<u8> = Vec::new(); 316 | let mock: Mock = Builder::new().read(b"Hello\nWorld").build(); 317 | let res = postproc_prefix("prefix: ", mock) 318 | .read_to_end(&mut output) 319 | .await; 320 | println!("{}", String::from_utf8_lossy(&output)); 321 | assert!(res.is_ok()); 322 | assert_eq!(output, b"prefix: Hello\nprefix: World");
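        // Editor's sketch (a hypothetical extra assertion, not part of the original
        // test suite): because postproc_prefix yields the prefix once before reading
        // any input, an empty reader still produces the bare prefix.
        //
        //   let empty: Mock = Builder::new().build();
        //   let mut out: Vec<u8> = Vec::new();
        //   postproc_prefix("prefix: ", empty).read_to_end(&mut out).await.unwrap();
        //   assert_eq!(out, b"prefix: ");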
323 | } 324 | 325 | async fn test_from_strs( 326 | pagebreaks: bool, 327 | line_prefix: &str, 328 | a: &'static str, 329 | b: &str, 330 | ) -> Result<()> { 331 | test_from_bytes(pagebreaks, line_prefix, a.as_bytes(), b).await 332 | } 333 | 334 | async fn test_from_bytes( 335 | pagebreaks: bool, 336 | line_prefix: &str, 337 | a: &'static [u8], 338 | b: &str, 339 | ) -> Result<()> { 340 | let mut oup = Vec::new(); 341 | let inp = Box::pin(Cursor::new(a)); 342 | let inp = postproc_encoding("", inp).await?; 343 | if pagebreaks { 344 | postproc_pagebreaks(inp).read_to_end(&mut oup).await?; 345 | } else { 346 | let x = postproc_prefix(line_prefix, inp); 347 | pin!(x); 348 | x.read_to_end(&mut oup).await?; 349 | } 350 | let c = String::from_utf8_lossy(&oup); 351 | assert_eq!(c, b, "source: {}", String::from_utf8_lossy(a)); 352 | 353 | Ok(()) 354 | } 355 | 356 | #[tokio::test] 357 | async fn test_utf16() -> Result<()> { 358 | let utf16lebom: &[u8] = &[ 359 | 0xff, 0xfe, 0x68, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, 0x00, 0x6f, 0x00, 0x20, 0x00, 360 | 0x77, 0x00, 0x6f, 0x00, 0x72, 0x00, 0x6c, 0x00, 0x64, 0x00, 0x20, 0x00, 0x3d, 0xd8, 361 | 0xa9, 0xdc, 0x0a, 0x00, 362 | ]; 363 | let utf16bebom: &[u8] = &[ 364 | 0xfe, 0xff, 0x00, 0x68, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, 0x00, 0x6f, 0x00, 0x20, 365 | 0x00, 0x77, 0x00, 0x6f, 0x00, 0x72, 0x00, 0x6c, 0x00, 0x64, 0x00, 0x20, 0xd8, 0x3d, 366 | 0xdc, 0xa9, 0x00, 0x0a, 367 | ]; 368 | test_from_bytes(false, "", utf16lebom, "hello world 💩\n").await?; 369 | test_from_bytes(false, "", utf16bebom, "hello world 💩\n").await?; 370 | Ok(()) 371 | } 372 | 373 | #[tokio::test] 374 | async fn post1() -> Result<()> { 375 | let inp = "What is this\nThis is a test\nFoo"; 376 | let oup = "Page 1: What is this\nPage 1: This is a test\nPage 1: Foo"; 377 | 378 | test_from_strs(true, "", inp, oup).await?; 379 | 380 | println!("\n\n\n\n"); 381 | 382 | let inp = "What is this\nThis is a test\nFoo\x0c\nHelloooo\nHow are you?\x0c\nGreat!"; 383 | let oup = "Page 1: What is this\nPage 1: This is a test\nPage 1: Foo\nPage 2: \nPage 2: Helloooo\nPage 2: How are you?\nPage 3: \nPage 3: Great!"; 384 | 385 | test_from_strs(true, "", inp, oup).await?; 386 | 387 | let inp = "What is this\nThis is a test\nFoo\x0c\nHelloooo\nHow are you?\x0c\nGreat!"; 388 | let oup = "foo.pdf:What is this\nfoo.pdf:This is a test\nfoo.pdf:Foo\x0c\nfoo.pdf:Helloooo\nfoo.pdf:How are you?\x0c\nfoo.pdf:Great!"; 389 | 390 | test_from_strs(false, "foo.pdf:", inp, oup).await?; 391 | 392 | Ok(()) 393 | } 394 | 395 | #[tokio::test] 396 | async fn test_binary_content() -> Result<()> { 397 | test_from_strs( 398 | false, 399 | "foo:", 400 | "this is a test \n\n \0 foo", 401 | "foo:[rga: binary data]", 402 | ) 403 | .await?; 404 | test_from_strs(false, "foo:", "\0", "foo:[rga: binary data]").await?; 405 | Ok(()) 406 | } 407 | 408 | /*#[test] 409 | fn chardet() -> Result<()> { 410 | let mut d = chardetng::EncodingDetector::new(); 411 | let mut v = Vec::new(); 412 | std::fs::File::open("/home/phire/passwords-2018.kdbx.old").unwrap().read_to_end(&mut v).unwrap(); 413 | d.feed(&v, false); 414 | println!("foo {:?}", d.guess(None, true)); 415 | Ok(()) 416 | }*/ 417 | } 418 | -------------------------------------------------------------------------------- /src/adapters/sqlite.rs: -------------------------------------------------------------------------------- 1 | use super::{writing::WritingFileAdapter, *}; 2 | use anyhow::Result; 3 | use async_trait::async_trait; 4 | use lazy_static::lazy_static; 5 | use log::*; 6 | use 
rusqlite::types::ValueRef; 7 | use rusqlite::*; 8 | use std::{convert::TryInto, io::Write}; 9 | use tokio::io::AsyncWrite; 10 | 11 | use tokio_util::io::SyncIoBridge; 12 | 13 | static EXTENSIONS: &[&str] = &["db", "db3", "sqlite", "sqlite3"]; 14 | 15 | lazy_static! { 16 | static ref METADATA: AdapterMeta = AdapterMeta { 17 | name: "sqlite".to_owned(), 18 | version: 1, 19 | description: 20 | "Uses sqlite bindings to convert sqlite databases into a simple plain text format" 21 | .to_owned(), 22 | recurses: false, // set to true if we decide to make sqlite blobs searchable (gz blob in db is kinda common I think) 23 | fast_matchers: EXTENSIONS 24 | .iter() 25 | .map(|s| FastFileMatcher::FileExtension(s.to_string())) 26 | .collect(), 27 | slow_matchers: Some(vec![FileMatcher::MimeType( 28 | "application/x-sqlite3".to_owned() 29 | )]), 30 | keep_fast_matchers_if_accurate: false, 31 | disabled_by_default: false 32 | }; 33 | } 34 | 35 | #[derive(Default, Clone)] 36 | pub struct SqliteAdapter; 37 | 38 | impl SqliteAdapter { 39 | pub fn new() -> Self { 40 | Self 41 | } 42 | } 43 | impl GetMetadata for SqliteAdapter { 44 | fn metadata(&self) -> &AdapterMeta { 45 | &METADATA 46 | } 47 | } 48 | 49 | fn format_blob(b: ValueRef) -> String { 50 | use ValueRef::*; 51 | match b { 52 | Null => "NULL".to_owned(), 53 | Integer(i) => format!("{}", i), 54 | Real(i) => format!("{}", i), 55 | Text(i) => format!("'{}'", String::from_utf8_lossy(i).replace('\'', "''")), 56 | Blob(b) => format!( 57 | "[blob {}B]", 58 | size_format::SizeFormatterSI::new( 59 | // can't be larger than 2GB anyways 60 | b.len().try_into().unwrap() 61 | ) 62 | ), 63 | } 64 | } 65 | 66 | fn synchronous_dump_sqlite(ai: AdaptInfo, mut s: impl Write) -> Result<()> { 67 | let AdaptInfo { 68 | is_real_file, 69 | filepath_hint, 70 | line_prefix, 71 | .. 72 | } = ai; 73 | if !is_real_file { 74 | // db is in an archive 75 | // todo: read to memory and then use that blob if size < max 76 | writeln!(s, "{line_prefix}[rga: skipping sqlite in archive]",)?; 77 | return Ok(()); 78 | } 79 | let inp_fname = filepath_hint; 80 | let conn = Connection::open_with_flags(&inp_fname, OpenFlags::SQLITE_OPEN_READ_ONLY) 81 | .with_context(|| format!("opening sqlite connection to {}", inp_fname.display()))?; 82 | let tables: Vec = conn 83 | .prepare("select name from sqlite_master where type='table'") 84 | .context("while preparing query")? 85 | .query_map([], |r| r.get::<_, String>(0)) 86 | .context("while executing query")? 87 | .filter_map(|e| e.ok()) 88 | .collect(); 89 | debug!("db has {} tables", tables.len()); 90 | for table in tables { 91 | // can't use query param at that position 92 | let mut sel = conn.prepare(&format!( 93 | "select * from {}", 94 | rusqlite::vtab::escape_double_quote(&table) 95 | ))?; 96 | let col_names: Vec = sel 97 | .column_names() 98 | .into_iter() 99 | .map(|e| e.to_owned()) 100 | .collect(); 101 | let mut z = sel.query([])?; 102 | // writeln!(oup, "{}: {}", table, cols.join(", "))?; 103 | 104 | // kind of shitty (lossy) output. maybe output real csv or something? 105 | while let Some(row) = z.next()? { 106 | let row_str = col_names 107 | .iter() 108 | .enumerate() 109 | .map(|(i, e)| Ok(format!("{}={}", e, format_blob(row.get_ref(i)?)))) 110 | .collect::>>()? 
111 | .join(", "); 112 | writeln!(s, "{line_prefix}{table}: {row_str}",)?; 113 | } 114 | } 115 | Ok(()) 116 | } 117 | 118 | #[async_trait] 119 | impl WritingFileAdapter for SqliteAdapter { 120 | async fn adapt_write( 121 | ai: AdaptInfo, 122 | _detection_reason: &FileMatcher, 123 | oup: Pin>, 124 | ) -> Result<()> { 125 | if ai.filepath_hint.file_name().and_then(|e| e.to_str()) == Some("Thumbs.db") { 126 | // skip windows thumbnail cache 127 | return Ok(()); 128 | } 129 | let oup_sync = SyncIoBridge::new(oup); 130 | tokio::task::spawn_blocking(|| synchronous_dump_sqlite(ai, oup_sync)) 131 | .await? 132 | .context("in synchronous sqlite task")?; 133 | Ok(()) 134 | } 135 | } 136 | 137 | #[cfg(test)] 138 | mod test { 139 | use super::*; 140 | use crate::test_utils::*; 141 | use pretty_assertions::assert_eq; 142 | 143 | #[tokio::test] 144 | async fn simple() -> Result<()> { 145 | let adapter: Box = Box::::default(); 146 | let fname = test_data_dir().join("hello.sqlite3"); 147 | let (a, d) = simple_fs_adapt_info(&fname).await?; 148 | let res = adapter.adapt(a, &d).await?; 149 | 150 | let buf = adapted_to_vec(res).await?; 151 | 152 | assert_eq!( 153 | String::from_utf8(buf)?, 154 | "PREFIX:tbl: greeting='hello', from='sqlite database!'\nPREFIX:tbl2: x=123, y=456.789\n", 155 | ); 156 | 157 | Ok(()) 158 | } 159 | } 160 | -------------------------------------------------------------------------------- /src/adapters/tar.rs: -------------------------------------------------------------------------------- 1 | use crate::{ 2 | adapted_iter::AdaptedFilesIterBox, 3 | adapters::AdapterMeta, 4 | matching::{FastFileMatcher, FileMatcher}, 5 | print_bytes, 6 | }; 7 | use anyhow::*; 8 | use async_stream::stream; 9 | use async_trait::async_trait; 10 | use lazy_static::lazy_static; 11 | use log::*; 12 | use std::path::PathBuf; 13 | 14 | use tokio_stream::StreamExt; 15 | 16 | use super::{AdaptInfo, FileAdapter, GetMetadata}; 17 | 18 | static EXTENSIONS: &[&str] = &["tar"]; 19 | 20 | lazy_static! { 21 | static ref METADATA: AdapterMeta = AdapterMeta { 22 | name: "tar".to_owned(), 23 | version: 1, 24 | description: "Reads a tar file as a stream and recurses down into its contents".to_owned(), 25 | recurses: true, 26 | fast_matchers: EXTENSIONS 27 | .iter() 28 | .map(|s| FastFileMatcher::FileExtension(s.to_string())) 29 | .collect(), 30 | slow_matchers: None, 31 | keep_fast_matchers_if_accurate: true, 32 | disabled_by_default: false 33 | }; 34 | } 35 | #[derive(Default, Clone)] 36 | pub struct TarAdapter; 37 | 38 | impl TarAdapter { 39 | pub fn new() -> Self { 40 | Self 41 | } 42 | } 43 | impl GetMetadata for TarAdapter { 44 | fn metadata(&self) -> &AdapterMeta { 45 | &METADATA 46 | } 47 | } 48 | 49 | #[async_trait] 50 | impl FileAdapter for TarAdapter { 51 | async fn adapt( 52 | &self, 53 | ai: AdaptInfo, 54 | _detection_reason: &FileMatcher, 55 | ) -> Result { 56 | let AdaptInfo { 57 | filepath_hint, 58 | inp, 59 | line_prefix, 60 | archive_recursion_depth, 61 | config, 62 | postprocess, 63 | .. 64 | } = ai; 65 | let mut archive = ::tokio_tar::Archive::new(inp); 66 | 67 | let mut entries = archive.entries()?; 68 | let s = stream! 
{ 69 | while let Some(entry) = entries.next().await { 70 | let file = entry?; 71 | if tokio_tar::EntryType::Regular == file.header().entry_type() { 72 | let path = PathBuf::from(file.path()?.to_owned()); 73 | debug!( 74 | "{}|{}: {}", 75 | filepath_hint.display(), 76 | path.display(), 77 | print_bytes(file.header().size().unwrap_or(0) as f64), 78 | ); 79 | let line_prefix = &format!("{}{}: ", line_prefix, path.display()); 80 | let ai2: AdaptInfo = AdaptInfo { 81 | filepath_hint: path, 82 | is_real_file: false, 83 | archive_recursion_depth: archive_recursion_depth + 1, 84 | inp: Box::pin(file), 85 | line_prefix: line_prefix.to_string(), 86 | config: config.clone(), 87 | postprocess, 88 | }; 89 | yield Ok(ai2); 90 | } 91 | } 92 | }; 93 | 94 | Ok(Box::pin(s)) 95 | } 96 | } 97 | 98 | #[cfg(test)] 99 | mod tests { 100 | use super::*; 101 | use crate::{preproc::loop_adapt, test_utils::*}; 102 | use pretty_assertions::assert_eq; 103 | use tokio::fs::File; 104 | 105 | #[tokio::test] 106 | async fn test_simple_tar() -> Result<()> { 107 | let filepath = test_data_dir().join("hello.tar"); 108 | 109 | let (a, d) = simple_adapt_info(&filepath, Box::pin(File::open(&filepath).await?)); 110 | 111 | let adapter = TarAdapter::new(); 112 | let r = loop_adapt(&adapter, d, a).await.context("adapt")?; 113 | let o = adapted_to_vec(r).await.context("adapted_to_vec")?; 114 | assert_eq!( 115 | String::from_utf8(o).context("parsing utf8")?, 116 | "PREFIX:dir/file-b.pdf: Page 1: hello world 117 | PREFIX:dir/file-b.pdf: Page 1: this is just a test. 118 | PREFIX:dir/file-b.pdf: Page 1: 119 | PREFIX:dir/file-b.pdf: Page 1: 1 120 | PREFIX:dir/file-b.pdf: Page 1: 121 | PREFIX:dir/file-b.pdf: Page 1: 122 | PREFIX:dir/file-a.pdf: Page 1: hello world 123 | PREFIX:dir/file-a.pdf: Page 1: this is just a test. 124 | PREFIX:dir/file-a.pdf: Page 1: 125 | PREFIX:dir/file-a.pdf: Page 1: 1 126 | PREFIX:dir/file-a.pdf: Page 1: 127 | PREFIX:dir/file-a.pdf: Page 1: 128 | " 129 | ); 130 | Ok(()) 131 | } 132 | } 133 | -------------------------------------------------------------------------------- /src/adapters/writing.rs: -------------------------------------------------------------------------------- 1 | use std::pin::Pin; 2 | 3 | use crate::{adapted_iter::one_file, join_handle_to_stream, to_io_err}; 4 | 5 | use super::{AdaptInfo, FileAdapter, GetMetadata}; 6 | use anyhow::{Context, Result}; 7 | use async_trait::async_trait; 8 | use tokio::io::{AsyncReadExt, AsyncWrite}; 9 | 10 | #[async_trait] 11 | pub trait WritingFileAdapter: GetMetadata + Send + Sync + Clone { 12 | async fn adapt_write( 13 | a: super::AdaptInfo, 14 | detection_reason: &crate::matching::FileMatcher, 15 | oup: Pin>, 16 | ) -> Result<()>; 17 | } 18 | 19 | macro_rules! 
async_writeln { 20 | ($dst: expr_2021) => { 21 | { 22 | tokio::io::AsyncWriteExt::write_all(&mut $dst, b"\n").await 23 | } 24 | }; 25 | ($dst: expr_2021, $fmt: expr_2021) => { 26 | { 27 | use std::io::Write; 28 | let mut buf = Vec::::new(); 29 | writeln!(buf, $fmt)?; 30 | tokio::io::AsyncWriteExt::write_all(&mut $dst, &buf).await 31 | } 32 | }; 33 | ($dst: expr_2021, $fmt: expr_2021, $($arg: tt)*) => { 34 | { 35 | use std::io::Write; 36 | let mut buf = Vec::::new(); 37 | writeln!(buf, $fmt, $( $arg )*)?; 38 | tokio::io::AsyncWriteExt::write_all(&mut $dst, &buf).await 39 | } 40 | }; 41 | } 42 | pub(crate) use async_writeln; 43 | 44 | #[async_trait] 45 | impl FileAdapter for T 46 | where 47 | T: WritingFileAdapter, 48 | { 49 | async fn adapt( 50 | &self, 51 | a: super::AdaptInfo, 52 | detection_reason: &crate::matching::FileMatcher, 53 | ) -> Result { 54 | let name = self.metadata().name.clone(); 55 | let (w, r) = tokio::io::duplex(128 * 1024); 56 | let d2 = detection_reason.clone(); 57 | let archive_recursion_depth = a.archive_recursion_depth + 1; 58 | let filepath_hint = format!("{}.txt", a.filepath_hint.to_string_lossy()); 59 | let postprocess = a.postprocess; 60 | let line_prefix = a.line_prefix.clone(); 61 | let config = a.config.clone(); 62 | let joiner = tokio::spawn(async move { 63 | let x = d2; 64 | T::adapt_write(a, &x, Box::pin(w)) 65 | .await 66 | .with_context(|| format!("in {}.adapt_write", name)) 67 | .map_err(to_io_err) 68 | }); 69 | 70 | Ok(one_file(AdaptInfo { 71 | is_real_file: false, 72 | filepath_hint: filepath_hint.into(), 73 | archive_recursion_depth, 74 | config, 75 | inp: Box::pin(r.chain(join_handle_to_stream(joiner))), 76 | line_prefix, 77 | postprocess, 78 | })) 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/adapters/zip.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | use crate::print_bytes; 3 | use anyhow::*; 4 | use async_stream::stream; 5 | use lazy_static::lazy_static; 6 | use log::*; 7 | 8 | // TODO: allow users to configure file extensions instead of hard coding the list 9 | // https://github.com/phiresky/ripgrep-all/pull/208#issuecomment-2173241243 10 | static EXTENSIONS: &[&str] = &["zip", "jar", "xpi", "kra", "snagx"]; 11 | 12 | lazy_static! { 13 | static ref METADATA: AdapterMeta = AdapterMeta { 14 | name: "zip".to_owned(), 15 | version: 1, 16 | description: "Reads a zip file as a stream and recurses down into its contents".to_owned(), 17 | recurses: true, 18 | fast_matchers: EXTENSIONS 19 | .iter() 20 | .map(|s| FastFileMatcher::FileExtension(s.to_string())) 21 | .collect(), 22 | slow_matchers: Some(vec![FileMatcher::MimeType("application/zip".to_owned())]), 23 | keep_fast_matchers_if_accurate: false, 24 | disabled_by_default: false 25 | }; 26 | } 27 | #[derive(Default, Clone)] 28 | pub struct ZipAdapter; 29 | 30 | impl ZipAdapter { 31 | pub fn new() -> Self { 32 | Self 33 | } 34 | } 35 | impl GetMetadata for ZipAdapter { 36 | fn metadata(&self) -> &AdapterMeta { 37 | &METADATA 38 | } 39 | } 40 | 41 | #[async_trait] 42 | impl FileAdapter for ZipAdapter { 43 | async fn adapt( 44 | &self, 45 | ai: AdaptInfo, 46 | _detection_reason: &FileMatcher, 47 | ) -> Result { 48 | // let (s, r) = mpsc::channel(1); 49 | let AdaptInfo { 50 | inp, 51 | filepath_hint, 52 | archive_recursion_depth, 53 | postprocess, 54 | line_prefix, 55 | config, 56 | is_real_file, 57 | .. 
58 | } = ai; 59 | if is_real_file { 60 | use async_zip::read::fs::ZipFileReader; 61 | 62 | let zip = ZipFileReader::new(&filepath_hint).await?; 63 | let s = stream! { 64 | for i in 0..zip.file().entries().len() { 65 | let file = zip.get_entry(i)?; 66 | let reader = zip.entry(i).await?; 67 | if file.filename().ends_with('/') { 68 | continue; 69 | } 70 | debug!( 71 | "{}{}|{}: {} ({} packed)", 72 | line_prefix, 73 | filepath_hint.display(), 74 | file.filename(), 75 | print_bytes(file.uncompressed_size() as f64), 76 | print_bytes(file.compressed_size() as f64) 77 | ); 78 | let new_line_prefix = format!("{}{}: ", line_prefix, file.filename()); 79 | let fname = PathBuf::from(file.filename()); 80 | tokio::pin!(reader); 81 | // SAFETY: this should be solvable without unsafe but idk how :( 82 | // the issue is that ZipEntryReader borrows from ZipFileReader, but we need to yield it here into the stream 83 | // but then it can't borrow from the ZipFile 84 | let reader2 = unsafe { 85 | std::intrinsics::transmute::< 86 | Pin<&mut (dyn AsyncRead + Send)>, 87 | Pin<&'static mut (dyn AsyncRead + Send)>, 88 | >(reader) 89 | }; 90 | yield Ok(AdaptInfo { 91 | filepath_hint: fname, 92 | is_real_file: false, 93 | inp: Box::pin(reader2), 94 | line_prefix: new_line_prefix, 95 | archive_recursion_depth: archive_recursion_depth + 1, 96 | postprocess, 97 | config: config.clone(), 98 | }); 99 | } 100 | }; 101 | 102 | Ok(Box::pin(s)) 103 | } else { 104 | use async_zip::read::stream::ZipFileReader; 105 | let mut zip = ZipFileReader::new(inp); 106 | 107 | let s = stream! { 108 | trace!("begin zip"); 109 | while let Some(mut entry) = zip.next_entry().await? { 110 | trace!("zip next entry"); 111 | let file = entry.entry(); 112 | if file.filename().ends_with('/') { 113 | zip = entry.skip().await?; 114 | 115 | continue; 116 | } 117 | debug!( 118 | "{}{}|{}: {} ({} packed)", 119 | line_prefix, 120 | filepath_hint.display(), 121 | file.filename(), 122 | print_bytes(file.uncompressed_size() as f64), 123 | print_bytes(file.compressed_size() as f64) 124 | ); 125 | let new_line_prefix = format!("{}{}: ", line_prefix, file.filename()); 126 | let fname = PathBuf::from(file.filename()); 127 | let reader = entry.reader(); 128 | tokio::pin!(reader); 129 | // SAFETY: this should be solvable without unsafe but idk how :( 130 | // the issue is that ZipEntryReader borrows from ZipFileReader, but we need to yield it here into the stream 131 | // but then it can't borrow from the ZipFile 132 | let reader2 = unsafe { 133 | std::intrinsics::transmute::< 134 | Pin<&mut (dyn AsyncRead + Send)>, 135 | Pin<&'static mut (dyn AsyncRead + Send)>, 136 | >(reader) 137 | }; 138 | yield Ok(AdaptInfo { 139 | filepath_hint: fname, 140 | is_real_file: false, 141 | inp: Box::pin(reader2), 142 | line_prefix: new_line_prefix, 143 | archive_recursion_depth: archive_recursion_depth + 1, 144 | postprocess, 145 | config: config.clone(), 146 | }); 147 | zip = entry.done().await.context("going to next file in zip but entry was not read fully")?; 148 | 149 | } 150 | trace!("zip over"); 151 | }; 152 | 153 | Ok(Box::pin(s)) 154 | } 155 | } 156 | } 157 | 158 | /*struct ZipAdaptIter { 159 | inp: AdaptInfo, 160 | } 161 | impl<'a> AdaptedFilesIter for ZipAdaptIter<'a> { 162 | fn next<'b>(&'b mut self) -> Option> { 163 | let line_prefix = &self.inp.line_prefix; 164 | let filepath_hint = &self.inp.filepath_hint; 165 | let archive_recursion_depth = &self.inp.archive_recursion_depth; 166 | let postprocess = self.inp.postprocess; 167 | 
::zip::read::read_zipfile_from_stream(&mut self.inp.inp) 168 | .unwrap() 169 | .and_then(|file| { 170 | if file.is_dir() { 171 | return None; 172 | } 173 | debug!( 174 | "{}{}|{}: {} ({} packed)", 175 | line_prefix, 176 | filepath_hint.to_string_lossy(), 177 | file.name(), 178 | print_bytes(file.size() as f64), 179 | print_bytes(file.compressed_size() as f64) 180 | ); 181 | let line_prefix = format!("{}{}: ", line_prefix, file.name()); 182 | Some(AdaptInfo { 183 | filepath_hint: PathBuf::from(file.name()), 184 | is_real_file: false, 185 | inp: Box::new(file), 186 | line_prefix, 187 | archive_recursion_depth: archive_recursion_depth + 1, 188 | postprocess, 189 | config: RgaConfig::default(), //config.clone(), 190 | }) 191 | }) 192 | } 193 | }*/ 194 | 195 | #[cfg(test)] 196 | mod test { 197 | use async_zip::{Compression, ZipEntryBuilder, write::ZipFileWriter}; 198 | 199 | use super::*; 200 | use crate::{preproc::loop_adapt, test_utils::*}; 201 | use pretty_assertions::assert_eq; 202 | 203 | #[async_recursion::async_recursion] 204 | async fn create_zip(fname: &str, content: &str, add_inner: bool) -> Result> { 205 | let v = Vec::new(); 206 | let mut cursor = std::io::Cursor::new(v); 207 | let mut zip = ZipFileWriter::new(&mut cursor); 208 | 209 | let options = ZipEntryBuilder::new(fname.to_string(), Compression::Stored); 210 | zip.write_entry_whole(options, content.as_bytes()).await?; 211 | 212 | if add_inner { 213 | let opts = ZipEntryBuilder::new("inner.zip".to_string(), Compression::Stored); 214 | zip.write_entry_whole( 215 | opts, 216 | &create_zip("inner.txt", "inner text file", false).await?, 217 | ) 218 | .await?; 219 | } 220 | zip.close().await?; 221 | Ok(cursor.into_inner()) 222 | } 223 | 224 | #[tokio::test] 225 | async fn only_seek_zip_fs() -> Result<()> { 226 | let zip = test_data_dir().join("only-seek-zip.zip"); 227 | let (a, d) = simple_fs_adapt_info(&zip).await?; 228 | let _v = adapted_to_vec(loop_adapt(&ZipAdapter::new(), d, a).await?).await?; 229 | // assert_eq!(String::from_utf8(v)?, ""); 230 | 231 | Ok(()) 232 | } 233 | /*#[tokio::test] 234 | async fn only_seek_zip_mem() -> Result<()> { 235 | let zip = test_data_dir().join("only-seek-zip.zip"); 236 | let (a, d) = simple_adapt_info(&zip, Box::pin(File::open(&zip).await?)); 237 | let v = adapted_to_vec(loop_adapt(&ZipAdapter::new(), d, a)?).await?; 238 | // assert_eq!(String::from_utf8(v)?, ""); 239 | 240 | Ok(()) 241 | }*/ 242 | #[tokio::test] 243 | async fn recurse() -> Result<()> { 244 | let zipfile = create_zip("outer.txt", "outer text file", true).await?; 245 | let adapter = ZipAdapter::new(); 246 | 247 | let (a, d) = simple_adapt_info( 248 | &PathBuf::from("outer.zip"), 249 | Box::pin(std::io::Cursor::new(zipfile)), 250 | ); 251 | let buf = adapted_to_vec(loop_adapt(&adapter, d, a).await?).await?; 252 | 253 | assert_eq!( 254 | String::from_utf8(buf)?, 255 | "PREFIX:outer.txt: outer text file\nPREFIX:inner.zip: inner.txt: inner text file\n", 256 | ); 257 | 258 | Ok(()) 259 | } 260 | } 261 | -------------------------------------------------------------------------------- /src/bin/rga-fzf-open.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Context; 2 | 3 | use std::process::Command; 4 | 5 | // TODO: add --rg-params=..., --rg-preview-params=... and --fzf-params=... 
params 6 | // TODO: remove passthrough_args 7 | fn main() -> anyhow::Result<()> { 8 | env_logger::init(); 9 | let mut args = std::env::args().skip(1); 10 | let query = args.next().context("no query")?; 11 | let fname = args.next().context("no filename")?; 12 | // let instance_id = std::env::var("RGA_FZF_INSTANCE").unwrap_or("unk".to_string()); 13 | 14 | if fname.ends_with(".pdf") { 15 | use std::io::ErrorKind::*; 16 | 17 | let worked = Command::new("evince") 18 | .arg("--find") 19 | .arg(&query) 20 | .arg(&fname) 21 | .spawn() 22 | .map_or_else( 23 | |err| match err.kind() { 24 | NotFound => Ok(false), 25 | _ => Err(err), 26 | }, 27 | |_| Ok(true), 28 | )?; 29 | if worked { 30 | return Ok(()); 31 | } 32 | } 33 | Ok(open::that_detached(&fname)?) 34 | } 35 | -------------------------------------------------------------------------------- /src/bin/rga-fzf.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Context; 2 | use rga::adapters::custom::map_exe_error; 3 | use ripgrep_all as rga; 4 | 5 | use std::process::{Command, Stdio}; 6 | 7 | // TODO: add --rg-params=..., --rg-preview-params=... and --fzf-params=... params 8 | // TODO: remove passthrough_args 9 | fn main() -> anyhow::Result<()> { 10 | env_logger::init(); 11 | let mut passthrough_args: Vec = std::env::args().skip(1).collect(); 12 | let inx = passthrough_args.iter().position(|e| !e.starts_with('-')); 13 | let initial_query = if let Some(inx) = inx { 14 | passthrough_args.remove(inx) 15 | } else { 16 | "".to_string() 17 | }; 18 | 19 | let exe = std::env::current_exe().context("Could not get executable location")?; 20 | let preproc_exe = exe.with_file_name("rga"); 21 | let preproc_exe = preproc_exe 22 | .to_str() 23 | .context("rga executable is in non-unicode path")?; 24 | let open_exe = exe.with_file_name("rga-fzf-open"); 25 | let open_exe = open_exe 26 | .to_str() 27 | .context("rga-fzf-open executable is in non-unicode path")?; 28 | 29 | let rg_prefix = format!("{preproc_exe} --files-with-matches --rga-cache-max-blob-len=10M"); 30 | 31 | let child = Command::new("fzf") 32 | .arg(format!( 33 | "--preview={preproc_exe} --pretty --context 5 {{q}} --rga-fzf-path=_{{}}" 34 | )) 35 | .arg("--preview-window=70%:wrap") 36 | .arg("--phony") 37 | .arg("--query") 38 | .arg(&initial_query) 39 | .arg("--print-query") 40 | .arg(format!("--bind=change:reload: {rg_prefix} {{q}}")) 41 | .arg(format!("--bind=ctrl-m:execute:{open_exe} {{q}} {{}}")) 42 | .env( 43 | "FZF_DEFAULT_COMMAND", 44 | format!("{} '{}'", rg_prefix, &initial_query), 45 | ) 46 | .env("RGA_FZF_INSTANCE", format!("{}", std::process::id())) // may be useful to open stuff in the same tab 47 | .stdout(Stdio::piped()) 48 | .spawn() 49 | .map_err(|e| map_exe_error(e, "fzf", "Please make sure you have fzf installed."))?; 50 | 51 | let output = child.wait_with_output()?; 52 | let mut x = output.stdout.split(|e| e == &b'\n'); 53 | let final_query = 54 | std::str::from_utf8(x.next().context("fzf output empty")?).context("fzf query not utf8")?; 55 | let selected_file = std::str::from_utf8(x.next().context("fzf output not two line")?) 
56 | .context("fzf ofilename not utf8")?; 57 | println!("query='{final_query}', file='{selected_file}'"); 58 | 59 | Ok(()) 60 | } 61 | -------------------------------------------------------------------------------- /src/bin/rga-preproc.rs: -------------------------------------------------------------------------------- 1 | use rga::adapters::*; 2 | use rga::preproc::*; 3 | use rga::print_dur; 4 | use ripgrep_all as rga; 5 | 6 | use anyhow::Context; 7 | use log::debug; 8 | use std::time::Instant; 9 | use tokio::fs::File; 10 | 11 | #[tokio::main] 12 | async fn main() -> anyhow::Result<()> { 13 | env_logger::init(); 14 | let mut arg_arr: Vec = std::env::args_os().collect(); 15 | let last = arg_arr.pop().expect("No filename specified"); 16 | let config = rga::config::parse_args(arg_arr, true)?; 17 | //clap::App::new("rga-preproc").arg(Arg::from_usage()) 18 | let path = { 19 | let filepath = last; 20 | std::env::current_dir()?.join(filepath) 21 | }; 22 | 23 | let i = File::open(&path) 24 | .await 25 | .context("Specified input file not found")?; 26 | let mut o = tokio::io::stdout(); 27 | let ai = AdaptInfo { 28 | inp: Box::pin(i), 29 | filepath_hint: path, 30 | is_real_file: true, 31 | line_prefix: "".to_string(), 32 | archive_recursion_depth: 0, 33 | postprocess: !config.no_prefix_filenames, 34 | config, 35 | }; 36 | 37 | let start = Instant::now(); 38 | let mut oup = rga_preproc(ai).await.context("during preprocessing")?; 39 | debug!("finding and starting adapter took {}", print_dur(start)); 40 | let res = tokio::io::copy(&mut oup, &mut o).await; 41 | if let Err(e) = res { 42 | if e.kind() == std::io::ErrorKind::BrokenPipe { 43 | // happens if e.g. ripgrep detects binary data in the pipe so it cancels reading 44 | debug!("output cancelled (broken pipe)"); 45 | } else { 46 | Err(e).context("copying adapter output to stdout")?; 47 | } 48 | } 49 | debug!("running adapter took {} total", print_dur(start)); 50 | Ok(()) 51 | } 52 | -------------------------------------------------------------------------------- /src/bin/rga.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use rga::adapters::custom::map_exe_error; 3 | use rga::adapters::*; 4 | use rga::config::{RgaConfig, split_args}; 5 | use rga::matching::*; 6 | use rga::print_dur; 7 | use ripgrep_all as rga; 8 | use structopt::StructOpt; 9 | 10 | use schemars::schema_for; 11 | use std::process::Command; 12 | use std::time::Instant; 13 | 14 | fn list_adapters(args: RgaConfig) -> Result<()> { 15 | let (enabled_adapters, disabled_adapters) = get_all_adapters(args.custom_adapters); 16 | 17 | println!("Adapters:\n"); 18 | let print = |adapter: std::sync::Arc| { 19 | let meta = adapter.metadata(); 20 | let matchers = meta 21 | .fast_matchers 22 | .iter() 23 | .map(|m| match m { 24 | FastFileMatcher::FileExtension(ext) => format!(".{ext}"), 25 | }) 26 | .collect::>() 27 | .join(", "); 28 | let slow_matchers = meta 29 | .slow_matchers 30 | .as_ref() 31 | .unwrap_or(&vec![]) 32 | .iter() 33 | .filter_map(|m| match m { 34 | FileMatcher::MimeType(x) => Some(x.to_string()), 35 | FileMatcher::Fast(_) => None, 36 | }) 37 | .collect::>() 38 | .join(", "); 39 | print!( 40 | " - **{name}**\n {desc} \n Extensions: {matchers} \n Mime Types: {mime} \n", 41 | name = meta.name, 42 | desc = meta.description.replace('\n', "\n "), 43 | matchers = matchers, 44 | mime = slow_matchers, 45 | ); 46 | println!(); 47 | }; 48 | for adapter in enabled_adapters { 49 | print(adapter) 50 | } 51 | println!( 52 | "The 
following adapters are disabled by default, and can be enabled using '--rga-adapters=+foo,bar':\n" 53 | ); 54 | for adapter in disabled_adapters { 55 | print(adapter) 56 | } 57 | Ok(()) 58 | } 59 | fn main() -> anyhow::Result<()> { 60 | // set debugging as early as possible 61 | if std::env::args().any(|e| e == "--debug") { 62 | // TODO: Audit that the environment access only happens in single-threaded code. 63 | unsafe { std::env::set_var("RUST_LOG", "debug") }; 64 | } 65 | 66 | env_logger::init(); 67 | 68 | let (config, mut passthrough_args) = split_args(false)?; 69 | 70 | if config.print_config_schema { 71 | println!("{}", serde_json::to_string_pretty(&schema_for!(RgaConfig))?); 72 | return Ok(()); 73 | } 74 | if config.list_adapters { 75 | return list_adapters(config); 76 | } 77 | if let Some(path) = config.fzf_path { 78 | if path == "_" { 79 | // fzf found no result, ignore everything and return 80 | println!("[no file found]"); 81 | return Ok(()); 82 | } 83 | passthrough_args.push(std::ffi::OsString::from(&path[1..])); 84 | } 85 | 86 | if passthrough_args.is_empty() { 87 | // rg would show help. Show own help instead. 88 | RgaConfig::clap().print_help()?; 89 | println!(); 90 | return Ok(()); 91 | } 92 | 93 | let adapters = get_adapters_filtered(config.custom_adapters.clone(), &config.adapters)?; 94 | 95 | let pre_glob = if !config.accurate { 96 | let extensions = adapters 97 | .iter() 98 | .flat_map(|a| &a.metadata().fast_matchers) 99 | .flat_map(|m| match m { 100 | FastFileMatcher::FileExtension(ext) => vec![ext.clone(), ext.to_ascii_uppercase()], 101 | }) 102 | .collect::>() 103 | .join(","); 104 | format!("*.{{{extensions}}}") 105 | } else { 106 | "*".to_owned() 107 | }; 108 | 109 | add_exe_to_path()?; 110 | 111 | let rg_args = vec![ 112 | "--no-line-number", 113 | // smart case by default because within weird files 114 | // we probably can't really trust casing anyways 115 | "--smart-case", 116 | ]; 117 | 118 | let exe = std::env::current_exe().expect("Could not get executable location"); 119 | let preproc_exe = exe.with_file_name("rga-preproc"); 120 | 121 | let before = Instant::now(); 122 | let mut cmd = Command::new("rg"); 123 | cmd.args(rg_args) 124 | .arg("--pre") 125 | .arg(preproc_exe) 126 | .arg("--pre-glob") 127 | .arg(pre_glob) 128 | .args(passthrough_args); 129 | log::debug!("rg command to run: {:?}", cmd); 130 | let mut child = cmd 131 | .spawn() 132 | .map_err(|e| map_exe_error(e, "rg", "Please make sure you have ripgrep installed."))?; 133 | 134 | let result = child.wait()?; 135 | 136 | log::debug!("running rg took {}", print_dur(before)); 137 | if !result.success() { 138 | std::process::exit(result.code().unwrap_or(1)); 139 | } 140 | Ok(()) 141 | } 142 | 143 | /// add the directory that contains `rga` to PATH, so rga-preproc can find pandoc etc (if we are on Windows where we include dependent binaries) 144 | fn add_exe_to_path() -> Result<()> { 145 | use std::env; 146 | let mut exe = env::current_exe().expect("Could not get executable location"); 147 | // let preproc_exe = exe.with_file_name("rga-preproc"); 148 | exe.pop(); // dirname 149 | 150 | let path = env::var_os("PATH").unwrap_or_default(); 151 | let paths = env::split_paths(&path).collect::>(); 152 | // prepend: prefer bundled versions to system-installed versions of binaries 153 | // solves https://github.com/phiresky/ripgrep-all/issues/32 154 | // may be somewhat of a security issue if rga binary is in installed in unprivileged locations 155 | let paths = [&[exe.to_owned(), exe.join("lib")], 
&paths[..]].concat(); 156 | let new_path = env::join_paths(paths)?; 157 | // TODO: Audit that the environment access only happens in single-threaded code. 158 | unsafe { env::set_var("PATH", new_path) }; 159 | Ok(()) 160 | } 161 | -------------------------------------------------------------------------------- /src/caching_writer.rs: -------------------------------------------------------------------------------- 1 | use std::{future::Future, pin::Pin}; 2 | 3 | use anyhow::{Context, Result}; 4 | use async_compression::tokio::write::ZstdEncoder; 5 | use async_stream::stream; 6 | 7 | use crate::to_io_err; 8 | use log::*; 9 | use tokio::io::{AsyncRead, AsyncWriteExt}; 10 | use tokio_stream::StreamExt; 11 | use tokio_util::io::{ReaderStream, StreamReader}; 12 | 13 | type FinishHandler = 14 | dyn FnOnce((u64, Option<Vec<u8>>)) -> Pin<Box<dyn Future<Output = Result<()>> + Send>> + Send; 15 | /** 16 | * wrap an AsyncRead so that it is passthrough, 17 | * but also the written data is compressed and written into a buffer, 18 | * unless more than max_cache_size bytes is written, then the cache is dropped and it is pure passthrough. 19 | */ 20 | pub fn async_read_and_write_to_cache<'a>( 21 | inp: impl AsyncRead + Send + 'a, 22 | max_cache_size: usize, 23 | compression_level: i32, 24 | on_finish: Box<FinishHandler>, 25 | ) -> Result<Pin<Box<dyn AsyncRead + Send + 'a>>> { 26 | let inp = Box::pin(inp); 27 | let mut zstd_writer = Some(ZstdEncoder::with_quality( 28 | Vec::new(), 29 | async_compression::Level::Precise(compression_level), 30 | )); 31 | let mut bytes_written = 0; 32 | 33 | let s = stream! { 34 | let mut stream = ReaderStream::new(inp); 35 | while let Some(bytes) = stream.next().await { 36 | trace!("read bytes: {:?}", bytes); 37 | if let Ok(bytes) = &bytes { 38 | if let Some(writer) = zstd_writer.as_mut() { 39 | writer.write_all(bytes).await?; 40 | bytes_written += bytes.len() as u64; 41 | let compressed_len = writer.get_ref().len(); 42 | trace!("wrote {} to zstd, len now {}", bytes.len(), compressed_len); 43 | if compressed_len > max_cache_size { 44 | debug!("cache longer than max, dropping"); 45 | //writer.finish(); 46 | zstd_writer.take(); 47 | } 48 | } 49 | } 50 | yield bytes; 51 | } 52 | trace!("eof"); 53 | // EOF, call on_finish 54 | let finish = { 55 | match zstd_writer.take() { Some(mut writer) => { 56 | writer.shutdown().await?; 57 | let res = writer.into_inner(); 58 | trace!("EOF"); 59 | if res.len() <= max_cache_size { 60 | trace!("writing {} bytes to cache", res.len()); 61 | (bytes_written, Some(res)) 62 | } else { 63 | trace!("cache longer than max, dropping"); 64 | (bytes_written, None) 65 | } 66 | } _ => { 67 | (bytes_written, None) 68 | }} 69 | }; 70 | 71 | // EOF, finish!
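        // Editor's note (an illustrative sketch, not from the original source):
        // `finish` is the (bytes_written, Option<compressed_zstd_bytes>) tuple that
        // the FinishHandler type alias above describes. A caller wanting to persist
        // small results could pass a boxed async closure along these lines, where
        // `cache` and `key` are hypothetical stand-ins for rga's real cache layer
        // (see preproc_cache.rs):
        //
        //   let on_finish: Box<FinishHandler> = Box::new(move |(_len, blob)| {
        //       Box::pin(async move {
        //           if let Some(zstd_bytes) = blob {
        //               cache.set(&key, zstd_bytes).await?; // hypothetical API
        //           }
        //           Ok(())
        //       })
        //   });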
72 | on_finish(finish).await.context("write_to_cache on_finish") 73 | .map_err(to_io_err)?; 74 | 75 | }; 76 | 77 | Ok(Box::pin(StreamReader::new(s))) 78 | } 79 | -------------------------------------------------------------------------------- /src/config.rs: -------------------------------------------------------------------------------- 1 | use crate::{adapters::custom::CustomAdapterConfig, project_dirs}; 2 | use anyhow::{Context, Result}; 3 | use derive_more::FromStr; 4 | use log::*; 5 | use schemars::JsonSchema; 6 | use serde::{Deserialize, Serialize}; 7 | use std::ffi::OsString; 8 | use std::io::Read; 9 | use std::{fs::File, io::Write, iter::IntoIterator, path::PathBuf, str::FromStr}; 10 | use structopt::StructOpt; 11 | 12 | #[derive(Debug, Deserialize, Serialize)] 13 | struct ReadableBytesCount(i64); 14 | 15 | fn is_default(t: &T) -> bool { 16 | t == &T::default() 17 | } 18 | #[derive(JsonSchema, Debug, Serialize, Deserialize, Copy, Clone, PartialEq, FromStr)] 19 | pub struct CacheCompressionLevel(pub i32); 20 | 21 | impl std::fmt::Display for CacheCompressionLevel { 22 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 23 | write!(f, "{}", self.0) 24 | } 25 | } 26 | impl Default for CacheCompressionLevel { 27 | fn default() -> Self { 28 | Self(12) 29 | } 30 | } 31 | #[derive(JsonSchema, Debug, Serialize, Deserialize, Copy, Clone, PartialEq, FromStr)] 32 | pub struct MaxArchiveRecursion(pub i32); 33 | 34 | impl std::fmt::Display for MaxArchiveRecursion { 35 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 36 | write!(f, "{}", self.0) 37 | } 38 | } 39 | impl Default for MaxArchiveRecursion { 40 | fn default() -> Self { 41 | Self(5) 42 | } 43 | } 44 | 45 | #[derive(JsonSchema, Debug, Serialize, Deserialize, Clone, PartialEq, FromStr)] 46 | pub struct CachePath(pub String); 47 | 48 | impl std::fmt::Display for CachePath { 49 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 50 | write!(f, "{}", self.0) 51 | } 52 | } 53 | impl Default for CachePath { 54 | fn default() -> Self { 55 | let pd = project_dirs().expect("could not get cache path"); 56 | let app_cache = pd.cache_dir(); 57 | Self(app_cache.to_str().expect("cache path not utf8").to_owned()) 58 | } 59 | } 60 | 61 | #[derive(JsonSchema, Debug, Serialize, Deserialize, Copy, Clone, PartialEq, Eq)] 62 | pub struct CacheMaxBlobLen(pub usize); 63 | 64 | impl std::fmt::Display for CacheMaxBlobLen { 65 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 66 | write!(f, "{}", self.0) 67 | } 68 | } 69 | impl Default for CacheMaxBlobLen { 70 | fn default() -> Self { 71 | Self(2000000) 72 | } 73 | } 74 | 75 | impl FromStr for CacheMaxBlobLen { 76 | type Err = anyhow::Error; 77 | fn from_str(s: &str) -> Result { 78 | let suffix = s.chars().last(); 79 | if let Some(suffix) = suffix { 80 | Ok(Self(match suffix { 81 | 'k' | 'M' | 'G' => usize::from_str(s.trim_end_matches(suffix)) 82 | .with_context(|| "Could not parse int".to_string()) 83 | .map(|e| { 84 | e * match suffix { 85 | 'k' => 1000, 86 | 'M' => 1_000_000, 87 | 'G' => 1_000_000_000, 88 | _ => panic!("impossible"), 89 | } 90 | }), 91 | _ => usize::from_str(s).with_context(|| "Could not parse int".to_string()), 92 | }?)) 93 | } else { 94 | Err(anyhow::format_err!("empty byte input")) 95 | } 96 | } 97 | } 98 | 99 | /// # rga configuration 100 | /// 101 | /// This is kind of a "polyglot" struct serving multiple purposes: 102 | /// 103 | /// 1. Declare the command line arguments using structopt+clap 104 | /// 1. 
Provide information for manpage / readme generation. 105 | /// 1. Describe the config file format (output as JSON schema via schemars). 106 | #[derive(StructOpt, Debug, Deserialize, Serialize, JsonSchema, Default, Clone)] 107 | #[structopt( 108 | name = "ripgrep-all", 109 | rename_all = "kebab-case", 110 | about = env!("CARGO_PKG_DESCRIPTION"), 111 | author = env!("CARGO_PKG_HOMEPAGE"), 112 | long_about="rga: ripgrep, but also search in PDFs, E-Books, Office documents, zip, tar.gz, etc.", 113 | // TODO: long_about does not seem to work to only show this on short help 114 | after_help = "-h shows a concise overview, --help shows more detail and advanced options.\n\nAll other options not shown here are passed directly to rg, especially [PATTERN] and [PATH ...]", 115 | usage = "rga [RGA OPTIONS] [RG OPTIONS] PATTERN [PATH ...]" 116 | )] 117 | pub struct RgaConfig { 118 | /// Use more accurate but slower matching by mime type. 119 | /// 120 | /// By default, rga will match files using file extensions. 121 | /// Some programs, such as sqlite3, don't care about the file extension at all, so users sometimes use any or no extension at all. 122 | /// With this flag, rga will try to detect the mime type of input files using the magic bytes (similar to the `file` utility), and use that to choose the adapter. 123 | /// Detection is only done on the first 8KiB of the file, since we can't always seek on the input (in archives). 124 | #[serde(default, skip_serializing_if = "is_default")] 125 | #[structopt(long = "--rga-accurate")] 126 | pub accurate: bool, 127 | 128 | /// Change which adapters to use and in which priority order (descending). 129 | /// 130 | /// - "foo,bar" means use only adapters foo and bar. 131 | /// - "-bar,baz" means use all default adapters except for bar and baz. 132 | /// - "+bar,baz" means use all default adapters and also bar and baz. 133 | #[serde(default, skip_serializing_if = "is_default")] 134 | #[structopt( 135 | long = "--rga-adapters", 136 | require_equals = true, 137 | require_delimiter = true 138 | )] 139 | pub adapters: Vec, 140 | 141 | #[serde(default, skip_serializing_if = "is_default")] 142 | #[structopt(flatten)] 143 | pub cache: CacheConfig, 144 | 145 | /// Maximum depth of nested archives to recurse into. 146 | /// 147 | /// When searching in archives, rga will recurse into archives inside archives. 148 | /// This option limits the depth. 149 | #[serde(default, skip_serializing_if = "is_default")] 150 | #[structopt( 151 | default_value, 152 | long = "--rga-max-archive-recursion", 153 | require_equals = true, 154 | hidden_short_help = true 155 | )] 156 | pub max_archive_recursion: MaxArchiveRecursion, 157 | 158 | /// Don't prefix lines of files within archive with the path inside the archive. 159 | /// 160 | /// Inside archives, by default rga prefixes the content of each file with the file path within the archive. 161 | /// This is usually useful, but can cause problems because then the inner path is also searched for the pattern. 162 | #[serde(default, skip_serializing_if = "is_default")] 163 | #[structopt(long = "--rga-no-prefix-filenames")] 164 | pub no_prefix_filenames: bool, 165 | 166 | #[serde(default, skip_serializing_if = "is_default")] 167 | #[structopt(skip)] // config file only 168 | pub custom_adapters: Option>, 169 | 170 | #[serde(skip)] 171 | #[structopt(long = "--rga-config-file", require_equals = true)] 172 | pub config_file_path: Option, 173 | 174 | /// Same as passing path directly, except if argument is empty. 
175 | /// 176 | /// Kinda hacky, but if no file is found, `fzf` calls `rga` with empty string as path, which causes "No such file or directory from rg". 177 | /// So filter those cases and return specially. 178 | #[serde(skip)] // CLI only 179 | #[structopt(long = "--rga-fzf-path", require_equals = true, hidden = true)] 180 | pub fzf_path: Option, 181 | 182 | #[serde(skip)] // CLI only 183 | #[structopt(long = "--rga-list-adapters", help = "List all known adapters")] 184 | pub list_adapters: bool, 185 | 186 | #[serde(skip)] // CLI only 187 | #[structopt( 188 | long = "--rga-print-config-schema", 189 | help = "Print the JSON Schema of the configuration file" 190 | )] 191 | pub print_config_schema: bool, 192 | 193 | #[serde(skip)] // CLI only 194 | #[structopt(long, help = "Show help for ripgrep itself")] 195 | pub rg_help: bool, 196 | 197 | #[serde(skip)] // CLI only 198 | #[structopt(long, help = "Show version of ripgrep itself")] 199 | pub rg_version: bool, 200 | } 201 | 202 | #[derive(StructOpt, Debug, Deserialize, Serialize, JsonSchema, Default, Clone, PartialEq)] 203 | pub struct CacheConfig { 204 | /// Disable caching of results. 205 | /// 206 | /// By default, rga caches the extracted text, if it is small enough, to a database. 207 | /// This way, repeated searches on the same set of files will be much faster. 208 | /// The location of the DB varies by platform: 209 | /// - `${XDG_CACHE_DIR-~/.cache}/ripgrep-all` on Linux 210 | /// - `~/Library/Caches/ripgrep-all` on macOS 211 | /// - `C:\Users\username\AppData\Local\ripgrep-all` on Windows 212 | /// 213 | /// If you pass this flag, all caching will be disabled. 214 | #[serde(default, skip_serializing_if = "is_default")] 215 | #[structopt(long = "--rga-no-cache")] 216 | pub disabled: bool, 217 | 218 | /// Max compressed size to cache. 219 | /// 220 | /// Longest byte length (after compression) to store in cache. 221 | /// Longer adapter outputs will not be cached and recomputed every time. 222 | /// 223 | /// Allowed suffixes on command line: k M G 224 | #[serde(default, skip_serializing_if = "is_default")] 225 | #[structopt( 226 | default_value, 227 | long = "--rga-cache-max-blob-len", 228 | hidden_short_help = true, 229 | require_equals = true, 230 | // parse(try_from_str = parse_readable_bytes_str) 231 | )] 232 | pub max_blob_len: CacheMaxBlobLen, 233 | 234 | /// ZSTD compression level to apply to adapter outputs before storing in cache DB. 235 | /// 236 | /// Ranges from 1 - 22. 237 | #[serde(default, skip_serializing_if = "is_default")] 238 | #[structopt( 239 | default_value, 240 | long = "--rga-cache-compression-level", 241 | hidden_short_help = true, 242 | require_equals = true, 243 | help = "" 244 | )] 245 | pub compression_level: CacheCompressionLevel, 246 | 247 | /// Path to store cache DB. 
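    /// (Editor's note, assuming the serde field layout of this struct: since the
    /// same struct also describes the config file, this setting corresponds to a
    /// config.jsonc entry along the lines of `{"cache": {"path": "/some/cache/dir"}}`;
    /// the path shown is hypothetical.)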
248 | #[serde(default, skip_serializing_if = "is_default")] 249 | #[structopt( 250 | default_value, 251 | long = "--rga-cache-path", 252 | hidden_short_help = true, 253 | require_equals = true 254 | )] 255 | pub path: CachePath, 256 | } 257 | 258 | static RGA_CONFIG: &str = "RGA_CONFIG"; 259 | 260 | use serde_json::Value; 261 | fn json_merge(a: &mut Value, b: &Value) { 262 | match (a, b) { 263 | (&mut Value::Object(ref mut a), Value::Object(b)) => { 264 | for (k, v) in b { 265 | json_merge(a.entry(k.clone()).or_insert(Value::Null), v); 266 | } 267 | } 268 | (a, b) => { 269 | *a = b.clone(); 270 | } 271 | } 272 | } 273 | 274 | fn read_config_file(path_override: Option) -> Result<(String, Value)> { 275 | let proj = project_dirs()?; 276 | let config_dir = proj.config_dir(); 277 | let config_filename = path_override 278 | .as_ref() 279 | .map(PathBuf::from) 280 | .unwrap_or_else(|| config_dir.join("config.jsonc")); 281 | let config_filename_str = config_filename.to_string_lossy().into_owned(); 282 | if config_filename.exists() { 283 | let config_file_contents = { 284 | let raw = std::fs::read_to_string(config_filename).with_context(|| { 285 | format!("Could not read config file json {config_filename_str}") 286 | })?; 287 | let mut s = String::new(); 288 | json_comments::StripComments::new(raw.as_bytes()) 289 | .read_to_string(&mut s) 290 | .context("strip comments")?; 291 | s 292 | }; 293 | { 294 | // just for error messages, actual deserialization happens after merging with cmd args 295 | serde_json::from_str::(&config_file_contents).with_context(|| { 296 | format!("Error in config file {config_filename_str}: {config_file_contents}") 297 | })?; 298 | } 299 | let config_json: serde_json::Value = 300 | serde_json::from_str(&config_file_contents).context("Could not parse config json")?; 301 | Ok((config_filename_str, config_json)) 302 | } else if let Some(p) = path_override.as_ref() { 303 | Err(anyhow::anyhow!("Config file not found: {}", p))? 
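        // Editor's note (illustrative, not from the original source): json_merge
        // above overwrites leaf values with the later argument, so in parse_args
        // below the effective precedence is: command-line args > RGA_CONFIG env
        // var > config file. For example:
        //
        //   let mut a = serde_json::json!({"cache": {"disabled": false, "path": "/a"}});
        //   json_merge(&mut a, &serde_json::json!({"cache": {"disabled": true}}));
        //   assert_eq!(a, serde_json::json!({"cache": {"disabled": true, "path": "/a"}}));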
304 | } else { 305 | // write default config 306 | std::fs::create_dir_all(config_dir)?; 307 | let mut schemafile = File::create(config_dir.join("config.v1.schema.json"))?; 308 | 309 | schemafile.write_all( 310 | serde_json::to_string_pretty(&schemars::schema_for!(RgaConfig))?.as_bytes(), 311 | )?; 312 | 313 | let mut configfile = File::create(config_filename)?; 314 | configfile.write_all(include_str!("../doc/config.default.jsonc").as_bytes())?; 315 | Ok(( 316 | config_filename_str, 317 | serde_json::Value::Object(Default::default()), 318 | )) 319 | } 320 | } 321 | fn read_config_env() -> Result { 322 | let val = std::env::var(RGA_CONFIG).ok(); 323 | if let Some(val) = val { 324 | serde_json::from_str(&val).context("could not parse config from env RGA_CONFIG") 325 | } else { 326 | serde_json::to_value(RgaConfig::default()).context("could not create default config") 327 | } 328 | } 329 | pub fn parse_args(args: I, is_rga_preproc: bool) -> Result 330 | where 331 | I: IntoIterator, 332 | I::Item: Into + Clone, 333 | { 334 | // TODO: don't read config file in rga-preproc for performance (called for every file) 335 | 336 | let arg_matches: RgaConfig = RgaConfig::from_iter(args); 337 | let args_config = serde_json::to_value(&arg_matches)?; 338 | 339 | let merged_config = { 340 | if is_rga_preproc { 341 | // only read from env and args 342 | let mut merged_config = read_config_env()?; 343 | json_merge(&mut merged_config, &args_config); 344 | log::debug!("Config: {}", serde_json::to_string(&merged_config)?); 345 | merged_config 346 | } else { 347 | // read from config file, env and args 348 | let (config_filename, config_file_config) = 349 | read_config_file(arg_matches.config_file_path)?; 350 | let env_var_config = read_config_env()?; 351 | let mut merged_config = config_file_config.clone(); 352 | json_merge(&mut merged_config, &env_var_config); 353 | json_merge(&mut merged_config, &args_config); 354 | log::debug!( 355 | "Configs:\n{}: {}\n{}: {}\nArgs: {}\nMerged: {}", 356 | config_filename, 357 | serde_json::to_string_pretty(&config_file_config)?, 358 | RGA_CONFIG, 359 | serde_json::to_string_pretty(&env_var_config)?, 360 | serde_json::to_string_pretty(&args_config)?, 361 | serde_json::to_string_pretty(&merged_config)? 362 | ); 363 | // pass to child processes 364 | // TODO: Audit that the environment access only happens in single-threaded code. 
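        // Editor's note: exporting RGA_CONFIG here is how the merged settings reach
        // the rga-preproc child processes that ripgrep spawns per file via `--pre`;
        // rga-preproc reads the variable back in read_config_env() instead of
        // re-parsing the config file. The same mechanism can be exercised by hand
        // (hypothetical invocation):
        //
        //   RGA_CONFIG='{"cache":{"disabled":true}}' rga-preproc some.pdf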
365 | unsafe { std::env::set_var(RGA_CONFIG, merged_config.to_string()) }; 366 | merged_config 367 | } 368 | }; 369 | 370 | let mut res: RgaConfig = serde_json::from_value(merged_config.clone()) 371 | .map_err(|e| { 372 | println!("{e:?}"); 373 | e 374 | }) 375 | .with_context(|| { 376 | format!( 377 | "Error parsing merged config: {}", 378 | serde_json::to_string_pretty(&merged_config).expect("no tostring") 379 | ) 380 | })?; 381 | { 382 | // readd values with [serde(skip)] 383 | res.fzf_path = arg_matches.fzf_path; 384 | res.list_adapters = arg_matches.list_adapters; 385 | res.print_config_schema = arg_matches.print_config_schema; 386 | res.rg_help = arg_matches.rg_help; 387 | res.rg_version = arg_matches.rg_version; 388 | } 389 | Ok(res) 390 | } 391 | 392 | /// Split arguments into the ones we care about and the ones rg cares about 393 | pub fn split_args(is_rga_preproc: bool) -> Result<(RgaConfig, Vec)> { 394 | let mut app = RgaConfig::clap(); 395 | 396 | app.p.create_help_and_version(); 397 | let mut firstarg = true; 398 | // debug!("{:#?}", app.p.flags); 399 | let (our_args, mut passthrough_args): (Vec, Vec) = std::env::args_os() 400 | .partition(|os_arg| { 401 | if firstarg { 402 | // hacky, but .enumerate() would be ugly because partition is too simplistic 403 | firstarg = false; 404 | return true; 405 | } 406 | if let Some(arg) = os_arg.to_str() { 407 | arg.starts_with("--rga-") 408 | || arg.starts_with("--rg-") 409 | || arg == "--help" 410 | || arg == "-h" 411 | || arg == "--version" 412 | || arg == "-V" 413 | } else { 414 | // args that are not unicode can only be filenames, pass them to rg 415 | false 416 | } 417 | }); 418 | debug!("rga (our) args: {:?}", our_args); 419 | let matches = parse_args(our_args, is_rga_preproc).context("Could not parse config")?; 420 | if matches.rg_help { 421 | passthrough_args.insert(0, "--help".into()); 422 | } 423 | if matches.rg_version { 424 | passthrough_args.insert(0, "--version".into()); 425 | } 426 | debug!("rga (passthrough) args: {:?}", passthrough_args); 427 | Ok((matches, passthrough_args)) 428 | } 429 | -------------------------------------------------------------------------------- /src/expand.rs: -------------------------------------------------------------------------------- 1 | use std::borrow::Cow; 2 | 3 | use anyhow::Result; 4 | 5 | // from https://github.com/phiresky/timetrackrs/blob/1c3df09ba2c1fda6065f2927045bd28dea0738d3/src/expand.rs 6 | 7 | pub fn find_byte(needle: u8, haystack: &[u8]) -> Option { 8 | #[cfg(not(feature = "perf-literal"))] 9 | fn imp(needle: u8, haystack: &[u8]) -> Option { 10 | haystack.iter().position(|&b| b == needle) 11 | } 12 | 13 | #[cfg(feature = "perf-literal")] 14 | fn imp(needle: u8, haystack: &[u8]) -> Option { 15 | use memchr::memchr; 16 | memchr(needle, haystack) 17 | } 18 | 19 | imp(needle, haystack) 20 | } 21 | 22 | pub fn expand_str_ez<'a, F>(replacement: &'a str, lambda: F) -> Result 23 | where 24 | F: Fn(&str) -> Result>, 25 | { 26 | let mut dst = String::new(); 27 | expand_str_lambda(lambda, replacement, &mut dst)?; 28 | Ok(dst) 29 | } 30 | 31 | pub fn expand_str_lambda<'a, F>(cap: F, replacement: &'a str, dst: &mut String) -> Result<()> 32 | where 33 | F: Fn(&str) -> Result>, 34 | { 35 | let mut replacement = replacement; 36 | while !replacement.is_empty() { 37 | match find_byte(b'$', replacement.as_bytes()) { 38 | None => break, 39 | Some(i) => { 40 | dst.push_str(&replacement[..i]); 41 | replacement = &replacement[i..]; 42 | } 43 | } 44 | if 
--------------------------------------------------------------------------------
/src/expand.rs:
--------------------------------------------------------------------------------
1 | use std::borrow::Cow;
2 | 
3 | use anyhow::Result;
4 | 
5 | // from https://github.com/phiresky/timetrackrs/blob/1c3df09ba2c1fda6065f2927045bd28dea0738d3/src/expand.rs
6 | 
7 | pub fn find_byte(needle: u8, haystack: &[u8]) -> Option<usize> {
8 |     #[cfg(not(feature = "perf-literal"))]
9 |     fn imp(needle: u8, haystack: &[u8]) -> Option<usize> {
10 |         haystack.iter().position(|&b| b == needle)
11 |     }
12 | 
13 |     #[cfg(feature = "perf-literal")]
14 |     fn imp(needle: u8, haystack: &[u8]) -> Option<usize> {
15 |         use memchr::memchr;
16 |         memchr(needle, haystack)
17 |     }
18 | 
19 |     imp(needle, haystack)
20 | }
21 | 
22 | pub fn expand_str_ez<'a, F>(replacement: &'a str, lambda: F) -> Result<String>
23 | where
24 |     F: Fn(&str) -> Result<Cow<'a, str>>,
25 | {
26 |     let mut dst = String::new();
27 |     expand_str_lambda(lambda, replacement, &mut dst)?;
28 |     Ok(dst)
29 | }
30 | 
31 | pub fn expand_str_lambda<'a, F>(cap: F, replacement: &'a str, dst: &mut String) -> Result<()>
32 | where
33 |     F: Fn(&str) -> Result<Cow<'a, str>>,
34 | {
35 |     let mut replacement = replacement;
36 |     while !replacement.is_empty() {
37 |         match find_byte(b'$', replacement.as_bytes()) {
38 |             None => break,
39 |             Some(i) => {
40 |                 dst.push_str(&replacement[..i]);
41 |                 replacement = &replacement[i..];
42 |             }
43 |         }
44 |         if replacement.as_bytes().get(1).is_some_and(|&b| b == b'$') {
45 |             dst.push('$');
46 |             replacement = &replacement[2..];
47 |             continue;
48 |         }
49 |         debug_assert!(!replacement.is_empty());
50 |         let cap_ref = match find_cap_ref(replacement.as_bytes()) {
51 |             Some(cap_ref) => cap_ref,
52 |             None => {
53 |                 dst.push('$');
54 |                 replacement = &replacement[1..];
55 |                 continue;
56 |             }
57 |         };
58 |         replacement = &replacement[cap_ref.end..];
59 |         dst.push_str(cap(cap_ref.cap)?.as_ref());
60 |     }
61 |     dst.push_str(replacement);
62 |     Ok(())
63 | }
64 | 
65 | /// `CaptureRef` represents a reference to a capture group inside some text.
66 | /// The reference is either a capture group name or a number.
67 | ///
68 | /// It is also tagged with the position in the text following the
69 | /// capture reference.
70 | #[derive(Clone, Copy, Debug, Eq, PartialEq)]
71 | struct CaptureRef<'a> {
72 |     cap: &'a str,
73 |     end: usize,
74 | }
75 | 
76 | /// Parses a possible reference to a capture group name in the given text,
77 | /// starting at the beginning of `replacement`.
78 | ///
79 | /// If no such valid reference could be found, None is returned.
80 | fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef<'_>> {
81 |     let mut i = 0;
82 |     let rep: &[u8] = replacement;
83 |     if rep.len() <= 1 || rep[0] != b'$' {
84 |         return None;
85 |     }
86 |     i += 1;
87 |     if rep[i] == b'{' {
88 |         return find_cap_ref_braced(rep, i + 1);
89 |     }
90 |     let mut cap_end = i;
91 |     while rep.get(cap_end).is_some_and(is_valid_cap_letter) {
92 |         cap_end += 1;
93 |     }
94 |     if cap_end == i {
95 |         return None;
96 |     }
97 |     // We just verified that the range 0..cap_end is valid ASCII, so it must
98 |     // therefore be valid UTF-8. If we really cared, we could avoid this UTF-8
99 |     // check with either unsafe or by parsing the number straight from &[u8].
100 |     let cap = std::str::from_utf8(&rep[i..cap_end]).expect("valid UTF-8 capture name");
101 |     Some(CaptureRef { cap, end: cap_end })
102 | }
103 | 
104 | fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef<'_>> {
105 |     let start = i;
106 |     while rep.get(i).is_some_and(|&b| b != b'}') {
107 |         i += 1;
108 |     }
109 |     if rep.get(i).is_none_or(|&b| b != b'}') {
110 |         return None;
111 |     }
112 |     // When looking at braced names, we don't put any restrictions on the name,
113 |     // so it's possible it could be invalid UTF-8. But a capture group name
114 |     // can never be invalid UTF-8, so if we have invalid UTF-8, then we can
115 |     // safely return None.
116 |     let cap = match std::str::from_utf8(&rep[start..i]) {
117 |         Err(_) => return None,
118 |         Ok(cap) => cap,
119 |     };
120 |     Some(CaptureRef { cap, end: i + 1 })
121 | }
122 | 
123 | /// Returns true if and only if the given byte is allowed in a capture name.
124 | fn is_valid_cap_letter(b: &u8) -> bool {
125 |     matches!(b, b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_')
126 | }
127 | 
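// A usage sketch for the expander above: `$name`/`${name}` references are
// resolved through the lambda, and `$$` escapes a literal dollar sign. The
// capture names and values here are made up for illustration.
#[cfg(test)]
mod expand_sketch {
    use super::expand_str_ez;
    use std::borrow::Cow;

    #[test]
    fn expands_refs_and_escapes() -> anyhow::Result<()> {
        let out = expand_str_ez("page $page of ${total}, price $$5", |cap| {
            Ok(Cow::Borrowed(match cap {
                "page" => "1",
                "total" => "2",
                other => panic!("unexpected capture {other}"),
            }))
        })?;
        assert_eq!(out, "page 1 of 2, price $5");
        Ok(())
    }
}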
--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
1 | #![warn(clippy::all)]
2 | 
3 | pub mod adapted_iter;
4 | pub mod adapters;
5 | mod caching_writer;
6 | pub mod config;
7 | pub mod expand;
8 | pub mod matching;
9 | pub mod preproc;
10 | pub mod preproc_cache;
11 | pub mod recurse;
12 | #[cfg(test)]
13 | pub mod test_utils;
14 | use anyhow::Context;
15 | use anyhow::Result;
16 | use async_stream::stream;
17 | use directories_next::ProjectDirs;
18 | use std::time::Instant;
19 | use tokio::io::AsyncRead;
20 | use tokio::task::JoinHandle;
21 | use tokio_util::io::StreamReader;
22 | 
23 | pub fn project_dirs() -> Result<ProjectDirs> {
24 |     directories_next::ProjectDirs::from("", "", "ripgrep-all")
25 |         .context("no home directory found! :(")
26 | }
27 | 
28 | // no "significant digits" format specifier in rust??
29 | // https://stackoverflow.com/questions/60497397/how-do-you-format-a-float-to-the-first-significant-decimal-and-with-specified-pr
30 | fn meh(float: f32, precision: usize) -> usize {
31 |     // compute absolute value
32 |     let a = float.abs();
33 | 
34 |     // if abs value is greater than 1, then precision becomes less than "standard"
35 | 
36 |     if a >= 1. {
37 |         // reduce by number of digits, minimum 0
38 |         let n = (1. + a.log10().floor()) as usize;
39 |         precision.saturating_sub(n)
40 |     // if abs value is less than 1 (but non-zero), then precision becomes greater than "standard"
41 |     } else if a > 0. {
42 |         // increase number of digits
43 |         let n = -(1. + a.log10().floor()) as usize;
44 |         precision + n
45 |     // special case for 0
46 |     } else {
47 |         0
48 |     }
49 | }
50 | 
51 | pub fn print_dur(start: Instant) -> String {
52 |     let mut dur = Instant::now().duration_since(start).as_secs_f32();
53 |     let mut suffix = "";
54 |     if dur < 0.1 {
55 |         suffix = "m";
56 |         dur *= 1000.0;
57 |     }
58 |     let precision = meh(dur, 3);
59 |     format!("{dur:.precision$}{suffix}s")
60 | }
61 | 
62 | pub fn print_bytes(bytes: impl Into<f64>) -> String {
63 |     pretty_bytes::converter::convert(bytes.into())
64 | }
65 | 
66 | pub fn to_io_err(e: anyhow::Error) -> std::io::Error {
67 |     std::io::Error::new(std::io::ErrorKind::Other, e)
68 | }
69 | 
70 | #[cfg(test)]
71 | #[ctor::ctor]
72 | fn init() {
73 |     env_logger::init();
74 | }
75 | 
76 | /** returns an AsyncRead that is empty but returns an io error if the given task had an io error or join error */
77 | pub fn join_handle_to_stream(join: JoinHandle<Result<()>>) -> impl AsyncRead {
78 |     let st = stream! {
79 |         join.await.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))??;
80 |         yield std::io::Result::Ok(&b""[..])
81 |     };
82 | 
83 |     StreamReader::new(st)
84 | }
85 | 
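// A quick check of the helper above: `meh` computes how many decimal places
// are needed to show roughly `precision` significant digits.
#[cfg(test)]
mod sig_digits_sketch {
    #[test]
    fn precision_scales_with_magnitude() {
        assert_eq!(super::meh(1.234, 3), 2); // printed as 1.23
        assert_eq!(super::meh(12.34, 3), 1); // printed as 12.3
        assert_eq!(super::meh(123.4, 3), 0); // printed as 123
        assert_eq!(super::meh(0.1234, 3), 3); // printed as 0.123
        assert_eq!(super::meh(0.0, 3), 0); // special case for 0
    }
}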
--------------------------------------------------------------------------------
/src/matching.rs:
--------------------------------------------------------------------------------
1 | /**
2 |  * Module for matching adapters to files based on file name or mime type
3 |  */
4 | use crate::adapters::*;
5 | 
6 | use anyhow::*;
7 | 
8 | use regex::{Regex, RegexSet};
9 | 
10 | use std::iter::Iterator;
11 | 
12 | use std::sync::Arc;
13 | 
14 | // match only based on file path
15 | #[derive(Clone, Debug)]
16 | pub enum FastFileMatcher {
17 |     // MimeType(Regex),
18 |     /**
19 |      * without the leading dot, e.g. "jpg" or "tar.gz". Matched as /.*\.ext$/
20 |      *
21 |      */
22 |     FileExtension(String),
23 |     // todo: maybe add others, e.g. regex on whole filename or even paths
24 |     // todo: maybe allow matching a directory (e.g. /var/lib/postgres)
25 | }
26 | 
27 | #[derive(Clone, Debug)]
28 | pub enum FileMatcher {
29 |     /// any type of fast matcher
30 |     Fast(FastFileMatcher),
31 |     ///
32 |     /// match by exact mime type extracted using tree_magic
33 |     /// TODO: allow match ignoring suffix etc?
34 |     MimeType(String),
35 | }
36 | 
37 | impl From<FastFileMatcher> for FileMatcher {
38 |     fn from(t: FastFileMatcher) -> Self {
39 |         Self::Fast(t)
40 |     }
41 | }
42 | 
43 | pub struct FileMeta {
44 |     // filename is not actually a utf8 string, but since we can't do regex on OsStr and can't get a &[u8] from OsStr either,
45 |     // and since we probably only want to do matching on ascii stuff anyways, this is the filename as a string with non-valid bytes removed
46 |     pub lossy_filename: String,
47 |     // only given when slow matching is enabled
48 |     pub mimetype: Option<&'static str>,
49 | }
50 | 
51 | pub fn extension_to_regex(extension: &str) -> Regex {
52 |     Regex::new(&format!("(?i)\\.{}$", &regex::escape(extension)))
53 |         .expect("we know this regex compiles")
54 | }
55 | 
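// A tiny check of the matcher construction above: the extension is regex-
// escaped and matched case-insensitively against the end of the filename.
#[cfg(test)]
mod extension_regex_sketch {
    use super::extension_to_regex;

    #[test]
    fn case_insensitive_suffix_match() {
        let re = extension_to_regex("tar.gz");
        assert!(re.is_match("Backup.TAR.GZ")); // (?i) ignores case
        assert!(!re.is_match("targz")); // the dot is escaped, so it must be literal
        assert!(!re.is_match("x.tar.gz.tmp")); // anchored at the end
    }
}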
56 | #[allow(clippy::type_complexity)]
57 | pub fn adapter_matcher(
58 |     adapters: &[Arc<dyn FileAdapter>],
59 |     slow: bool,
60 | ) -> Result<impl Fn(FileMeta) -> Option<(Arc<dyn FileAdapter>, FileMatcher)> + use<>> {
61 |     // need order later
62 |     let adapter_names: Vec<String> = adapters.iter().map(|e| e.metadata().name.clone()).collect();
63 |     let mut fname_regexes = vec![];
64 |     let mut mime_regexes = vec![];
65 |     for adapter in adapters.iter() {
66 |         let metadata = adapter.metadata();
67 |         use FileMatcher::*;
68 |         for matcher in metadata.get_matchers(slow) {
69 |             match matcher.as_ref() {
70 |                 MimeType(re) => {
71 |                     mime_regexes.push((re.clone(), adapter.clone(), MimeType(re.clone())))
72 |                 }
73 |                 Fast(FastFileMatcher::FileExtension(re)) => fname_regexes.push((
74 |                     extension_to_regex(re),
75 |                     adapter.clone(),
76 |                     Fast(FastFileMatcher::FileExtension(re.clone())),
77 |                 )),
78 |             };
79 |         }
80 |     }
81 |     let fname_regex_set = RegexSet::new(fname_regexes.iter().map(|p| p.0.as_str()))?;
82 |     let mime_regex_set = RegexSet::new(mime_regexes.iter().map(|p| p.0.as_str()))?;
83 |     Ok(move |meta: FileMeta| {
84 |         let fname_matches: Vec<_> = fname_regex_set
85 |             .matches(&meta.lossy_filename)
86 |             .into_iter()
87 |             .collect();
88 |         let mime_matches: Vec<_> = if slow {
89 |             mime_regex_set
90 |                 .matches(meta.mimetype.expect("No mimetype?"))
91 |                 .into_iter()
92 |                 .collect()
93 |         } else {
94 |             vec![]
95 |         };
96 |         if fname_matches.len() + mime_matches.len() > 1 {
97 |             // get first according to original priority list...
98 |             // todo: kinda ugly
99 |             let fa = fname_matches
100 |                 .iter()
101 |                 .map(|e| (fname_regexes[*e].1.clone(), fname_regexes[*e].2.clone()));
102 |             let fb = mime_matches
103 |                 .iter()
104 |                 .map(|e| (mime_regexes[*e].1.clone(), mime_regexes[*e].2.clone()));
105 |             let mut v = vec![];
106 |             v.extend(fa);
107 |             v.extend(fb);
108 |             v.sort_by_key(|e| {
109 |                 adapter_names
110 |                     .iter()
111 |                     .position(|r| r == &e.0.metadata().name)
112 |                     .expect("impossib7")
113 |             });
114 |             eprintln!(
115 |                 "Warning: found multiple adapters for {}:",
116 |                 meta.lossy_filename
117 |             );
118 |             for mmatch in v.iter() {
119 |                 eprintln!(" - {}", mmatch.0.metadata().name);
120 |             }
121 |             return Some(v[0].clone());
122 |         }
123 |         if mime_matches.is_empty() {
124 |             if fname_matches.is_empty() {
125 |                 None
126 |             } else {
127 |                 let (_, adapter, matcher) = &fname_regexes[fname_matches[0]];
128 |                 Some((adapter.clone(), matcher.clone()))
129 |             }
130 |         } else {
131 |             let (_, adapter, matcher) = &mime_regexes[mime_matches[0]];
132 |             Some((adapter.clone(), matcher.clone()))
133 |         }
134 |     })
135 | }
136 | 
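// A hedged mini-sketch of the dispatch mechanism above: RegexSet::matches
// yields the indices of all patterns that hit, and adapter_matcher maps those
// indices back into its adapter list, sorting by the original priority order.
#[cfg(test)]
mod regex_set_sketch {
    use regex::RegexSet;

    #[test]
    fn match_indices_point_into_the_pattern_list() {
        let set = RegexSet::new([r"(?i)\.tar\.gz$", r"(?i)\.gz$"]).unwrap();
        let matches: Vec<usize> = set.matches("backup.tar.gz").into_iter().collect();
        assert_eq!(matches, vec![0, 1]); // two candidates; priority picks the first
    }
}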
--------------------------------------------------------------------------------
/src/preproc.rs:
--------------------------------------------------------------------------------
1 | use crate::adapted_iter::AdaptedFilesIterBox;
2 | use crate::adapters::*;
3 | use crate::caching_writer::async_read_and_write_to_cache;
4 | use crate::config::RgaConfig;
5 | use crate::matching::*;
6 | use crate::preproc_cache::CacheKey;
7 | use crate::recurse::concat_read_streams;
8 | use crate::{
9 |     preproc_cache::{PreprocCache, open_cache_db},
10 |     print_bytes,
11 | };
12 | use anyhow::*;
13 | use async_compression::tokio::bufread::ZstdDecoder;
14 | use async_stream::stream;
15 | // use futures::future::{BoxFuture, FutureExt};
16 | use log::*;
17 | use postproc::PostprocPrefix;
18 | use std::future::Future;
19 | use std::io::Cursor;
20 | use std::path::Path;
21 | use std::pin::Pin;
22 | use std::sync::Arc;
23 | use tokio::io::AsyncBufReadExt;
24 | use tokio::io::BufReader;
25 | use tokio::io::{AsyncBufRead, AsyncReadExt};
26 | 
27 | pub type ActiveAdapters = Vec<Arc<dyn FileAdapter>>;
28 | 
29 | async fn choose_adapter(
30 |     config: &RgaConfig,
31 |     filepath_hint: &Path,
32 |     archive_recursion_depth: i32,
33 |     inp: &mut (impl AsyncBufRead + Unpin),
34 | ) -> Result<Option<(Arc<dyn FileAdapter>, FileMatcher, ActiveAdapters)>> {
35 |     let active_adapters = get_adapters_filtered(config.custom_adapters.clone(), &config.adapters)?;
36 |     let adapters = adapter_matcher(&active_adapters, config.accurate)?;
37 |     let filename = filepath_hint
38 |         .file_name()
39 |         .ok_or_else(|| format_err!("Empty filename"))?;
40 |     debug!("Archive recursion depth: {}", archive_recursion_depth);
41 | 
42 |     let mimetype = if config.accurate {
43 |         let buf = inp.fill_buf().await?; // fill but do not consume!
44 |         if buf.starts_with(b"From \x0d") || buf.starts_with(b"From -") {
45 |             Some("application/mbox")
46 |         } else {
47 |             let mimetype = tree_magic::from_u8(buf);
48 |             debug!("mimetype: {:?}", mimetype);
49 |             Some(mimetype)
50 |         }
51 |     } else {
52 |         None
53 |     };
54 |     let adapter = adapters(FileMeta {
55 |         mimetype,
56 |         lossy_filename: filename.to_string_lossy().to_string(),
57 |     });
58 |     Ok(adapter.map(|e| (e.0, e.1, active_adapters)))
59 | }
60 | 
61 | enum Ret {
62 |     Recurse(AdaptInfo, Arc<dyn FileAdapter>, FileMatcher, ActiveAdapters),
63 |     Passthrough(AdaptInfo),
64 | }
65 | async fn buf_choose_adapter(ai: AdaptInfo) -> Result<Ret> {
66 |     let mut inp = BufReader::with_capacity(1 << 16, ai.inp);
67 |     let adapter = choose_adapter(
68 |         &ai.config,
69 |         &ai.filepath_hint,
70 |         ai.archive_recursion_depth,
71 |         &mut inp,
72 |     )
73 |     .await?;
74 |     let ai = AdaptInfo {
75 |         inp: Box::pin(inp),
76 |         ..ai
77 |     };
78 |     let (a, b, c) = match adapter {
79 |         Some(x) => x,
80 |         None => {
81 |             // allow passthrough if the file is in an archive or accurate matching is enabled
82 |             // otherwise it should have been filtered out by rg pre-glob since rg can handle those better than us
83 |             let allow_cat = !ai.is_real_file || ai.config.accurate;
84 |             if allow_cat {
85 |                 if ai.postprocess {
86 |                     (
87 |                         Arc::new(PostprocPrefix {}) as Arc<dyn FileAdapter>,
88 |                         FileMatcher::Fast(FastFileMatcher::FileExtension("default".to_string())),
89 |                         Vec::new(),
90 |                     )
91 |                 } else {
92 |                     return Ok(Ret::Passthrough(ai));
93 |                 }
94 |             } else {
95 |                 return Err(format_err!(
96 |                     "No adapter found for file {:?}, passthrough disabled.",
97 |                     ai.filepath_hint
98 |                         .file_name()
99 |                         .ok_or_else(|| format_err!("Empty filename"))?
100 |                 ));
101 |             }
102 |         }
103 |     };
104 |     Ok(Ret::Recurse(ai, a, b, c))
105 | }
106 | 
107 | /**
108 |  * preprocess a file as defined in `ai`.
109 |  *
110 |  * If a cache is passed, read/write to it.
111 |  *
112 |  */
113 | pub async fn rga_preproc(ai: AdaptInfo) -> Result<ReadBox> {
114 |     debug!("path (hint) to preprocess: {:?}", ai.filepath_hint);
115 | 
116 |     // todo: figure out when using a bufreader is a good idea and when it is not
117 |     // seems to be good for File::open() reads, but not sure about within archives (tar, zip)
118 |     let (ai, adapter, detection_reason, active_adapters) = match buf_choose_adapter(ai).await? {
119 |         Ret::Recurse(ai, a, b, c) => (ai, a, b, c),
120 |         Ret::Passthrough(ai) => {
121 |             return Ok(ai.inp);
122 |         }
123 |     };
124 |     let path_hint_copy = ai.filepath_hint.clone();
125 |     adapt_caching(ai, adapter, detection_reason, active_adapters)
126 |         .await
127 |         .with_context(|| format!("run_adapter({})", &path_hint_copy.to_string_lossy()))
128 | }
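// A small illustration of the peek trick in choose_adapter above: fill_buf
// exposes buffered bytes without consuming them, so sniffing the mime type
// leaves the full stream intact for the adapter that runs afterwards.
#[cfg(test)]
mod peek_sketch {
    use tokio::io::{AsyncBufReadExt, AsyncReadExt, BufReader};

    #[tokio::test]
    async fn fill_buf_does_not_consume() -> std::io::Result<()> {
        let mut r = BufReader::new(&b"From -rest of the mbox"[..]);
        assert!(r.fill_buf().await?.starts_with(b"From -"));
        let mut all = Vec::new();
        r.read_to_end(&mut all).await?; // the sniffed bytes are still here
        assert_eq!(all, b"From -rest of the mbox");
        Ok(())
    }
}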
129 | 
130 | async fn adapt_caching(
131 |     ai: AdaptInfo,
132 |     adapter: Arc<dyn FileAdapter>,
133 |     detection_reason: FileMatcher,
134 |     active_adapters: ActiveAdapters,
135 | ) -> Result<ReadBox> {
136 |     let meta = adapter.metadata();
137 |     debug!(
138 |         "Chose adapter '{}' because of matcher {:?}",
139 |         &meta.name, &detection_reason
140 |     );
141 |     eprintln!(
142 |         "{} adapter: {}",
143 |         ai.filepath_hint.to_string_lossy(),
144 |         &meta.name
145 |     );
146 |     let cache_compression_level = ai.config.cache.compression_level;
147 |     let cache_max_blob_len = ai.config.cache.max_blob_len;
148 | 
149 |     let cache = if ai.is_real_file && !ai.config.cache.disabled {
150 |         Some(open_cache_db(Path::new(&ai.config.cache.path.0)).await?)
151 |     } else {
152 |         None
153 |     };
154 | 
155 |     let mut cache = cache.context("No cache?")?;
156 |     let cache_key = CacheKey::new(
157 |         ai.postprocess,
158 |         &ai.filepath_hint,
159 |         adapter.as_ref(),
160 |         &active_adapters,
161 |     )?;
162 |     // let dbg_ctx = format!("adapter {}", &adapter.metadata().name);
163 |     let cached = cache.get(&cache_key).await.context("cache.get")?;
164 |     match cached {
165 |         Some(cached) => Ok(Box::pin(ZstdDecoder::new(Cursor::new(cached)))),
166 |         None => {
167 |             debug!("cache MISS, running adapter with caching...");
168 |             let inp = loop_adapt(adapter.as_ref(), detection_reason, ai).await?;
169 |             let inp = concat_read_streams(inp);
170 |             let inp = async_read_and_write_to_cache(
171 |                 inp,
172 |                 cache_max_blob_len.0,
173 |                 cache_compression_level.0,
174 |                 Box::new(move |(uncompressed_size, compressed)| {
175 |                     Box::pin(async move {
176 |                         debug!(
177 |                             "uncompressed output: {}",
178 |                             print_bytes(uncompressed_size as f64)
179 |                         );
180 |                         if let Some(cached) = compressed {
181 |                             debug!("compressed output: {}", print_bytes(cached.len() as f64));
182 |                             cache
183 |                                 .set(&cache_key, cached)
184 |                                 .await
185 |                                 .context("writing to cache")?
186 |                         }
187 |                         Ok(())
188 |                     })
189 |                 }),
190 |             )?;
191 | 
192 |             Ok(Box::pin(inp))
193 |         }
194 |     }
195 | }
196 | 
197 | async fn read_discard(mut x: ReadBox) -> Result<()> {
198 |     let mut buf = [0u8; 1 << 16];
199 |     loop {
200 |         let n = x.read(&mut buf).await?;
201 |         if n == 0 {
202 |             break;
203 |         }
204 |     }
205 |     Ok(())
206 | }
207 | 
208 | pub fn loop_adapt(
209 |     adapter: &dyn FileAdapter,
210 |     detection_reason: FileMatcher,
211 |     ai: AdaptInfo,
212 | ) -> Pin<Box<dyn Future<Output = Result<AdaptedFilesIterBox>> + Send + '_>> {
213 |     Box::pin(async move { loop_adapt_inner(adapter, detection_reason, ai).await })
214 | }
215 | pub async fn loop_adapt_inner(
216 |     adapter: &dyn FileAdapter,
217 |     detection_reason: FileMatcher,
218 |     ai: AdaptInfo,
219 | ) -> anyhow::Result<AdaptedFilesIterBox> {
220 |     let fph = ai.filepath_hint.clone();
221 |     let inp = adapter.adapt(ai, &detection_reason).await;
222 |     let inp = if adapter.metadata().name == "postprocprefix" {
223 |         // don't add confusing error context
224 |         inp?
225 |     } else {
226 |         inp.with_context(|| {
227 |             format!(
228 |                 "adapting {} via {} failed",
229 |                 fph.to_string_lossy(),
230 |                 adapter.metadata().name
231 |             )
232 |         })?
233 |     };
234 |     let s = stream! {
235 |         for await file in inp {
236 |             trace!("next file");
237 |             match buf_choose_adapter(file?).await? {
238 |                 Ret::Recurse(ai, adapter, detection_reason, _active_adapters) => {
239 |                     if ai.archive_recursion_depth >= ai.config.max_archive_recursion.0 {
240 |                         // some adapters (esp. zip) assume that the entry is read fully and might hang otherwise
241 |                         read_discard(ai.inp).await?;
242 |                         let s = format!("{}[rga: max archive recursion reached ({})]\n", ai.line_prefix, ai.archive_recursion_depth).into_bytes();
243 |                         yield Ok(AdaptInfo {
244 |                             inp: Box::pin(Cursor::new(s)),
245 |                             ..ai
246 |                         });
247 |                         continue;
248 |                     }
249 |                     debug!(
250 |                         "Chose adapter '{}' because of matcher {:?}",
251 |                         &adapter.metadata().name, &detection_reason
252 |                     );
253 |                     eprintln!(
254 |                         "{} adapter: {}",
255 |                         ai.filepath_hint.to_string_lossy(),
256 |                         &adapter.metadata().name
257 |                     );
258 |                     for await ifile in loop_adapt(adapter.as_ref(), detection_reason, ai).await? {
259 |                         yield ifile;
260 |                     }
261 |                 }
262 |                 Ret::Passthrough(ai) => {
263 |                     debug!("no adapter for {}, ending recursion", ai.filepath_hint.to_string_lossy());
264 |                     yield Ok(ai);
265 |                 }
266 |             }
267 |             trace!("done with files");
268 |         }
269 |         trace!("stream ended");
270 |     };
271 |     Ok(Box::pin(s))
272 | }
273 | 
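// A hedged round-trip sketch of the cache encoding used in adapt_caching
// above: extracted text is stored zstd-compressed, and a cache hit is served
// back as ZstdDecoder::new(Cursor::new(blob)). This uses the same
// async_compression types as the code above, plus its bufread ZstdEncoder.
#[cfg(test)]
mod cache_codec_sketch {
    use async_compression::tokio::bufread::{ZstdDecoder, ZstdEncoder};
    use std::io::Cursor;
    use tokio::io::AsyncReadExt;

    #[tokio::test]
    async fn zstd_round_trip() -> std::io::Result<()> {
        let mut compressed = Vec::new();
        ZstdEncoder::new(&b"adapted text"[..])
            .read_to_end(&mut compressed)
            .await?;
        let mut out = Vec::new();
        ZstdDecoder::new(Cursor::new(compressed))
            .read_to_end(&mut out)
            .await?;
        assert_eq!(out, b"adapted text");
        Ok(())
    }
}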
--------------------------------------------------------------------------------
/src/preproc_cache.rs:
--------------------------------------------------------------------------------
1 | use crate::{adapters::FileAdapter, preproc::ActiveAdapters};
2 | use anyhow::{Context, Result};
3 | use log::warn;
4 | use path_clean::PathClean;
5 | use rusqlite::{OptionalExtension, named_params};
6 | use std::{path::Path, time::UNIX_EPOCH};
7 | use tokio_rusqlite::Connection;
8 | 
9 | static SCHEMA_VERSION: i32 = 3;
10 | #[derive(Clone)]
11 | pub struct CacheKey {
12 |     config_hash: String,
13 |     adapter: String,
14 |     adapter_version: i32,
15 |     active_adapters: String,
16 |     file_path: String,
17 |     file_mtime_unix_ms: i64,
18 | }
19 | impl CacheKey {
20 |     pub fn new(
21 |         postprocess: bool,
22 |         filepath_hint: &Path,
23 |         adapter: &dyn FileAdapter,
24 |         active_adapters: &ActiveAdapters,
25 |     ) -> Result<CacheKey> {
26 |         let meta = std::fs::metadata(filepath_hint)
27 |             .with_context(|| format!("reading metadata for {}", filepath_hint.to_string_lossy()))?;
28 |         let modified = meta.modified().expect("weird OS that can't into mtime");
29 |         let file_mtime_unix_ms = modified.duration_since(UNIX_EPOCH)?.as_millis() as i64;
30 |         let active_adapters = if adapter.metadata().recurses {
31 |             serde_json::to_string(
32 |                 &active_adapters
33 |                     .iter()
34 |                     .map(|a| format!("{}.v{}", a.metadata().name, a.metadata().version))
35 |                     .collect::<Vec<_>>(),
36 |             )?
37 |         } else {
38 |             "null".to_string()
39 |         };
40 |         Ok(Self {
41 |             config_hash: if postprocess {
42 |                 "a41e2e9".to_string()
43 |             } else {
44 |                 "f1502a3".to_string()
45 |             }, // todo: when we add more config options that affect caching, create a struct and actually hash it
46 |             adapter: adapter.metadata().name.clone(),
47 |             adapter_version: adapter.metadata().version,
48 |             file_path: filepath_hint.clean().to_string_lossy().to_string(),
49 |             file_mtime_unix_ms,
50 |             active_adapters,
51 |         })
52 |     }
53 | }
54 | 
55 | #[async_trait::async_trait]
56 | pub trait PreprocCache {
57 |     async fn get(&self, key: &CacheKey) -> Result<Option<Vec<u8>>>;
58 |     async fn set(&mut self, key: &CacheKey, value: Vec<u8>) -> Result<()>;
59 | }
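// A sketch of the active_adapters fingerprint computed in CacheKey::new
// above: for recursing adapters the key embeds every active adapter name and
// version, so changing the adapter set invalidates recursive cache entries.
// The adapter names and versions below are illustrative.
#[cfg(test)]
mod fingerprint_sketch {
    #[test]
    fn fingerprint_format() -> anyhow::Result<()> {
        let names = vec![format!("{}.v{}", "zip", 1), format!("{}.v{}", "tar", 1)];
        assert_eq!(serde_json::to_string(&names)?, r#"["zip.v1","tar.v1"]"#);
        Ok(())
    }
}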
60 | 
61 | async fn connect_pragmas(db: &Connection) -> Result<()> {
62 |     // https://phiresky.github.io/blog/2020/sqlite-performance-tuning/
63 |     //let want_page_size = 32768;
64 |     //db.execute(&format!("pragma page_size = {};", want_page_size))
65 |     //    .context("setup pragma 1")?;
66 |     db.call(|db| {
67 |         // db.busy_timeout(Duration::from_secs(10))?;
68 |         db.pragma_update(None, "journal_mode", "wal")?;
69 |         db.pragma_update(None, "foreign_keys", "on")?;
70 |         db.pragma_update(None, "temp_store", "memory")?;
71 |         db.pragma_update(None, "synchronous", "off")?; // integrity isn't very important here
72 |         db.pragma_update(None, "mmap_size", "2000000000")?;
73 |         db.execute("
74 |             create table if not exists preproc_cache (
75 |                 config_hash text not null,
76 |                 adapter text not null,
77 |                 adapter_version integer not null,
78 |                 created_unix_ms integer not null default (unixepoch() * 1000),
79 |                 active_adapters text not null, -- 'null' if adapter cannot recurse
80 |                 file_path text not null,
81 |                 file_mtime_unix_ms integer not null,
82 |                 text_content_zstd blob not null
83 |             ) strict", []
84 |         )?;
85 | 
86 |         db.execute("create unique index if not exists preproc_cache_idx on preproc_cache (config_hash, adapter, adapter_version, file_path, active_adapters)", [])?;
87 | 
88 |         Ok(())
89 |     })
90 |     .await.context("connect_pragmas")?;
91 |     let jm: i64 = db
92 |         .call(|db| Ok(db.pragma_query_value(None, "application_id", |r| r.get(0))?))
93 |         .await?;
94 |     if jm != 924716026 {
95 |         // (probably) newly created db
96 |         db.call(|db| Ok(db.pragma_update(None, "application_id", "924716026")?))
97 |             .await?;
98 |     }
99 |     Ok(())
100 | }
101 | 
102 | struct SqliteCache {
103 |     db: Connection,
104 | }
105 | impl SqliteCache {
106 |     async fn new(path: &Path) -> Result<SqliteCache> {
107 |         let db = Connection::open(path.join("cache.sqlite3")).await?;
108 |         db.call(|db| {
109 |             let schema_version: i32 = db.pragma_query_value(None, "user_version", |r| r.get(0))?;
110 |             if schema_version != SCHEMA_VERSION {
111 |                 warn!("Cache schema version mismatch, clearing cache");
112 |                 db.execute("drop table if exists preproc_cache", [])?;
113 |                 db.pragma_update(None, "user_version", format!("{SCHEMA_VERSION}"))?;
114 |             }
115 |             Ok(())
116 |         })
117 |         .await?;
118 | 
119 |         connect_pragmas(&db).await?;
120 | 
121 |         Ok(Self { db })
122 |     }
123 | }
124 | 
125 | #[async_trait::async_trait]
126 | impl PreprocCache for SqliteCache {
127 |     async fn get(&self, key: &CacheKey) -> Result<Option<Vec<u8>>> {
128 |         let key = (*key).clone(); // todo: without cloning
129 |         Ok(self
130 |             .db
131 |             .call(move |db| {
132 |                 Ok(db
133 |                     .query_row(
134 |                         "select text_content_zstd from preproc_cache where
135 |                             adapter = :adapter
136 |                             and config_hash = :config_hash
137 |                             and adapter_version = :adapter_version
138 |                             and active_adapters = :active_adapters
139 |                             and file_path = :file_path
140 |                             and file_mtime_unix_ms = :file_mtime_unix_ms
141 |                         ",
142 |                         named_params! {
143 |                             ":config_hash": &key.config_hash,
144 |                             ":adapter": &key.adapter,
145 |                             ":adapter_version": &key.adapter_version,
146 |                             ":active_adapters": &key.active_adapters,
147 |                             ":file_path": &key.file_path,
148 |                             ":file_mtime_unix_ms": &key.file_mtime_unix_ms
149 |                         },
150 |                         |r| r.get::<_, Vec<u8>>(0),
151 |                     )
152 |                     .optional()?)
153 |             })
154 |             .await
155 |             .context("reading from cache")?)
156 |     }
157 | 
158 |     async fn set(&mut self, key: &CacheKey, value: Vec<u8>) -> Result<()> {
159 |         let key = (*key).clone(); // todo: without cloning
160 |         log::trace!(
161 |             "Writing to cache: {}, {}, {} bytes",
162 |             key.adapter,
163 |             key.file_path,
164 |             value.len()
165 |         );
166 |         Ok(self
167 |             .db
168 |             .call(move |db| {
169 |                 db.execute(
170 |                     "insert into preproc_cache (config_hash, adapter, adapter_version, active_adapters, file_path, file_mtime_unix_ms, text_content_zstd) values
171 |                         (:config_hash, :adapter, :adapter_version, :active_adapters, :file_path, :file_mtime_unix_ms, :text_content_zstd)
172 |                         on conflict (config_hash, adapter, adapter_version, active_adapters, file_path) do update set
173 |                             file_mtime_unix_ms = :file_mtime_unix_ms,
174 |                             created_unix_ms = unixepoch() * 1000,
175 |                             text_content_zstd = :text_content_zstd",
176 |                     named_params! {
177 |                         ":config_hash": &key.config_hash,
178 |                         ":adapter": &key.adapter,
179 |                         ":adapter_version": &key.adapter_version,
180 |                         ":active_adapters": &key.active_adapters,
181 |                         ":file_path": &key.file_path,
182 |                         ":file_mtime_unix_ms": &key.file_mtime_unix_ms,
183 |                         ":text_content_zstd": value
184 |                     })?;
185 |                 Ok(())
186 |             })
187 |             .await?)
188 |     }
189 | }
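// A hedged sketch of the cache API round-trip, using open_cache_db defined
// just below. The key fields are filled with made-up values here; real keys
// come from CacheKey::new above. The private fields are reachable because
// this test module is a child of the file's module.
#[cfg(test)]
mod roundtrip_sketch {
    use super::*;

    fn dummy_key() -> CacheKey {
        CacheKey {
            config_hash: "f1502a3".to_string(),
            adapter: "poppler".to_string(),
            adapter_version: 1,
            active_adapters: "null".to_string(),
            file_path: "/tmp/short.pdf".to_string(),
            file_mtime_unix_ms: 0,
        }
    }

    #[tokio::test]
    async fn set_then_get() -> Result<()> {
        let dir = tempfile::tempdir()?;
        let mut cache = open_cache_db(dir.path()).await?;
        cache.set(&dummy_key(), b"zstd blob".to_vec()).await?;
        assert_eq!(cache.get(&dummy_key()).await?, Some(b"zstd blob".to_vec()));
        Ok(())
    }
}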
190 | /// opens a default cache
191 | pub async fn open_cache_db(path: &Path) -> Result<impl PreprocCache> {
192 |     std::fs::create_dir_all(path)?;
193 |     SqliteCache::new(path).await
194 | }
195 | 
196 | #[cfg(test)]
197 | mod test {
198 | 
199 |     use crate::preproc_cache::*;
200 | 
201 |     #[tokio::test]
202 |     async fn test_read_write() -> anyhow::Result<()> {
203 |         let path = tempfile::tempdir()?;
204 |         let _db = open_cache_db(&path.path().join("foo.sqlite3")).await?;
205 |         // db.set();
206 |         Ok(())
207 |     }
208 | }
209 | 
--------------------------------------------------------------------------------
/src/recurse.rs:
--------------------------------------------------------------------------------
1 | use tokio_util::io::{ReaderStream, StreamReader};
2 | 
3 | use crate::{adapted_iter::AdaptedFilesIterBox, adapters::*, to_io_err};
4 | use async_stream::stream;
5 | 
6 | pub fn concat_read_streams(input: AdaptedFilesIterBox) -> ReadBox {
7 |     let s = stream! {
8 |         for await output in input {
9 |             let o = output.map_err(to_io_err)?.inp;
10 |             let stream = ReaderStream::new(o);
11 |             for await bytes in stream {
12 |                 yield bytes;
13 |             }
14 |         }
15 |     };
16 |     Box::pin(StreamReader::new(s))
17 | }
18 | 
--------------------------------------------------------------------------------
/src/test_utils.rs:
--------------------------------------------------------------------------------
1 | use crate::{
2 |     adapted_iter::AdaptedFilesIterBox,
3 |     adapters::{
4 |         AdaptInfo, ReadBox,
5 |         custom::{BUILTIN_SPAWNING_ADAPTERS, CustomSpawningFileAdapter},
6 |     },
7 |     config::RgaConfig,
8 |     matching::{FastFileMatcher, FileMatcher},
9 |     recurse::concat_read_streams,
10 | };
11 | use anyhow::Result;
12 | use std::path::{Path, PathBuf};
13 | use tokio::{fs::File, io::AsyncReadExt};
14 | 
15 | pub use pretty_assertions::{assert_eq, assert_ne};
16 | pub fn test_data_dir() -> PathBuf {
17 |     let mut d = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
18 |     d.push("exampledir/test/");
19 |     d
20 | }
21 | 
22 | pub async fn simple_fs_adapt_info(filepath: &Path) -> Result<(AdaptInfo, FileMatcher)> {
23 |     Ok(simple_adapt_info_full(
24 |         filepath,
25 |         Box::pin(File::open(filepath).await?),
26 |         true,
27 |     ))
28 | }
29 | pub fn simple_adapt_info(filepath: &Path, inp: ReadBox) -> (AdaptInfo, FileMatcher) {
30 |     simple_adapt_info_full(filepath, inp, false)
31 | }
32 | 
33 | pub fn simple_adapt_info_full(
34 |     filepath: &Path,
35 |     inp: ReadBox,
36 |     is_real_file: bool,
37 | ) -> (AdaptInfo, FileMatcher) {
38 |     (
39 |         AdaptInfo {
40 |             filepath_hint: filepath.to_owned(),
41 |             is_real_file,
42 |             archive_recursion_depth: 0,
43 |             inp,
44 |             line_prefix: "PREFIX:".to_string(),
45 |             config: RgaConfig::default(),
46 |             postprocess: true,
47 |         },
48 |         FastFileMatcher::FileExtension(
49 |             filepath
50 |                 .extension()
51 |                 .unwrap_or_default()
52 |                 .to_string_lossy()
53 |                 .into_owned(),
54 |         )
55 |         .into(),
56 |     )
57 | }
58 | 
59 | pub async fn adapted_to_vec(adapted: AdaptedFilesIterBox) -> Result<Vec<u8>> {
60 |     let mut res = concat_read_streams(adapted);
61 | 
62 |     let mut buf = Vec::new();
63 |     res.read_to_end(&mut buf).await?;
64 |     Ok(buf)
65 | }
66 | 
67 | pub fn poppler_adapter() -> CustomSpawningFileAdapter {
68 |     let adapter = BUILTIN_SPAWNING_ADAPTERS
69 |         .iter()
70 |         .find(|e| e.name == "poppler")
71 |         .expect("no poppler adapter");
72 | 
73 |     adapter.to_adapter()
74 | }
75 | 
76 | #[cfg(test)]
77 | pub fn init_logging() {
78 |     let _ = env_logger::builder().is_test(true).try_init();
79 | }
80 | 
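// A hedged end-to-end sketch tying these helpers together: open a fixture,
// run it through the poppler adapter, and collect the output. It assumes the
// pdftotext binary is available (as the crate's own poppler tests do) and
// only checks that some text comes out, rather than asserting exact output.
#[cfg(test)]
mod helpers_sketch {
    use super::*;
    use crate::adapters::FileAdapter;

    #[tokio::test]
    async fn poppler_extracts_some_text() -> Result<()> {
        let filepath = test_data_dir().join("short.pdf");
        let (ai, matcher) = simple_fs_adapt_info(&filepath).await?;
        let files = poppler_adapter().adapt(ai, &matcher).await?;
        let text = adapted_to_vec(files).await?;
        assert!(!text.is_empty());
        Ok(())
    }
}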
--------------------------------------------------------------------------------