├── .github └── workflows │ └── rust.yml ├── .gitignore ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── Makefile ├── README.md ├── build.rs ├── docs ├── exclusion.md └── parser.md ├── examples ├── docker-ce.yaml ├── mysql-repo.yaml ├── node.yaml ├── openresty.yaml ├── proxmox.yaml ├── python.yaml ├── vyos.yaml ├── wine-builds.yaml └── zerotier.yaml ├── fixtures ├── artifactrepo │ ├── 10 │ │ └── index.html │ └── index.html ├── bmclapi │ └── index.html ├── buildroot │ ├── acl │ │ └── index.html │ ├── index.html │ └── mimalloc │ │ ├── build.html │ │ ├── index.html │ │ ├── test │ │ └── test │ │ └── using.html ├── caddy-symlink │ └── index.html ├── clickhouse │ ├── clickhouse-client │ │ └── index.html │ ├── index.html │ └── stable │ │ ├── index.html │ │ └── index2.html ├── docker │ ├── armv7l │ │ └── index.html │ └── index.html ├── ghettoforge │ └── index.html ├── gradle │ └── index.html ├── grml │ └── index.html ├── loongnix │ └── index.html ├── misc │ └── 1 │ │ └── index.html ├── monitoring-plugins │ └── index.html ├── mozilla │ ├── OJI │ │ └── index.html │ └── index.html ├── mysql │ └── index.html ├── nodejs │ └── v4.9.1 │ │ └── index.html ├── proxmox │ └── index.html ├── raspberrypi │ └── index.html ├── sdumirror-ubuntu │ └── index.html ├── start_fileserver.sh ├── vscode │ └── index.html ├── vyos │ ├── index.html │ └── vyos-accel-ppp │ │ └── index.html ├── wine-builds │ └── index.html └── zabbix │ └── index.html └── src ├── bar.rs ├── cli ├── list.rs ├── mod.rs └── sync.rs ├── compare.rs ├── extensions ├── apt.rs ├── mod.rs └── yum.rs ├── listing.rs ├── main.rs ├── parser ├── apache_f2.rs ├── caddy.rs ├── denoflare_r2.rs ├── directory_lister.rs ├── docker.rs ├── fallback.rs ├── fancyindex.rs ├── gradle.rs ├── lighttpd.rs ├── mod.rs └── nginx.rs ├── regex_manager ├── mod.rs ├── v1.rs └── v2.rs ├── timezone.rs └── utils.rs /.github/workflows/rust.yml: -------------------------------------------------------------------------------- 1 | name: Rust 2 | 3 | on: 4 | - push 5 
| - pull_request 6 | 7 | env: 8 | CARGO_TERM_COLOR: always 9 | 10 | jobs: 11 | build: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v4 15 | with: 16 | submodules: 'recursive' 17 | - name: Generate Cargo.toml.cache (Ignore version=) 18 | run: | 19 | sed '/^version = /d' Cargo.toml > Cargo.toml.cache 20 | - uses: actions/cache@v4 21 | with: 22 | path: | 23 | ~/.cargo/registry/index 24 | ~/.cargo/registry/cache 25 | ~/.cargo/git 26 | target 27 | key: musl-cargo-${{ hashFiles('./Cargo.toml.cache') }} 28 | - name: Run fileserver 29 | run: | 30 | fixtures/start_fileserver.sh & 31 | 32 | - name: Remove files for a build with correct version info 33 | run: | 34 | find target/ -name 'shadow*' -exec rm -r {} + || true 35 | find target/ -name 'tsumugu' -delete || true 36 | - name: Test & Compile 37 | run: | 38 | mkdir -p ~/.cargo/{git,registry} 39 | # Fix git permission issue with Docker and shadow-rs 40 | sudo chown -R root . 41 | docker run --rm -t \ 42 | --mount type=bind,source=${{ github.workspace }},target=/volume \ 43 | --mount type=bind,source=$HOME/.cargo/registry,target=/root/.cargo/registry \ 44 | --mount type=bind,source=$HOME/.cargo/git,target=/root/.cargo/git \ 45 | --network=host \ 46 | clux/muslrust:stable \ 47 | cargo test 48 | docker run --rm -t \ 49 | --mount type=bind,source=${{ github.workspace }},target=/volume \ 50 | --mount type=bind,source=$HOME/.cargo/registry,target=/root/.cargo/registry \ 51 | --mount type=bind,source=$HOME/.cargo/git,target=/root/.cargo/git \ 52 | --network=host \ 53 | clux/muslrust:stable \ 54 | cargo build --release 55 | sudo chown -R runner ~/.cargo/ 56 | sudo chown -R runner . 
57 | # show version info 58 | RUST_LOG=debug target/x86_64-unknown-linux-musl/release/tsumugu --version 59 | 60 | - name: Deploy - Create and Upload Release 61 | if: startsWith(github.ref, 'refs/tags/') 62 | uses: ncipollo/release-action@v1 63 | with: 64 | artifacts: target/x86_64-unknown-linux-musl/release/tsumugu 65 | - name: Release to crates.io 66 | if: startsWith(github.ref, 'refs/tags/') 67 | uses: katyo/publish-crates@v2 68 | with: 69 | registry-token: ${{ secrets.CARGO_REGISTRY_TOKEN }} 70 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | Cargo.toml.cache 3 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "tsumugu" 3 | version = "0.20250422.0" 4 | edition = "2021" 5 | description = "A HTTP(S) syncing tool with lower overhead, for OSS mirrors" 6 | license = "MIT" 7 | repository = "https://github.com/taoky/tsumugu" 8 | 9 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 10 | 11 | [dependencies] 12 | anyhow = { version = "1.0.71", features = ["backtrace"] } 13 | chrono = { version = "0.4.26", default-features = false, features = ["clock"] } 14 | clap = { version = "4.3.12", features = ["derive"] } 15 | regex = "1.9.1" 16 | reqwest = { version = "0.12.9", features = ["stream", "gzip", "deflate", "brotli", "socks"] } 17 | scraper = "0.23.1" 18 | url = "2.5.4" 19 | tracing = "0.1" 20 | tracing-subscriber = { version = "0.3", features = ["env-filter"] } 21 | filetime = "0.2.21" 22 | crossbeam-deque = "0.8.3" 23 | walkdir = "2.3.3" 24 | tokio = { version = "1.43.1", features = ["rt-multi-thread"] } 25 | kyuri = "0.2.0" 26 | futures-util = "0.3.31" 27 | humansize = "2.1.3" 28 | apt-parser = "1.0.0" 29 | flate2 = "1.0.28" 30 | 
shadow-rs = "0.32.0" 31 | thiserror = "1.0.63" 32 | percent-encoding = "2.3.1" 33 | 34 | [build-dependencies] 35 | shadow-rs = "0.32.0" 36 | 37 | [dev-dependencies] 38 | test-log = { version = "0.2.14", default-features = false, features = ["trace"] } 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 taoky 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: check release 2 | 3 | check: 4 | cargo fmt --check 5 | cargo clippy 6 | cargo test 7 | 8 | release: 9 | ifndef version 10 | $(error version is not set. 
Usage: make release version= msg="") 11 | endif 12 | ifndef msg 13 | $(error msg is not set. Usage: make release version= msg="") 14 | endif 15 | @full_version=$(shell echo $(version) | grep -q '\.' && echo "0.$(version)" || echo "0.$(version).0"); \ 16 | echo $$full_version; \ 17 | cargo set-version $$full_version; \ 18 | git commit -a -m "Bump version to $$full_version" ; \ 19 | git tag $(version) -m "$(msg)" 20 | echo "Run 'git push' and 'git push --tag' afterwards." 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tsumugu 2 | 3 | A HTTP(S) syncing tool with lower overhead, for OSS mirrors. 4 | 5 | Instead of `HEAD`ing every single file, tsumugu parses directory listing HTML and downloads only files that do not seem to be up-to-date. 6 | 7 | ## Design goals 8 | 9 | To successfully sync from these domains, where lftp/rclone fails or finds difficulties: 10 | 11 | - [x] http://download.proxmox.com/ 12 | - [x] https://download.docker.com/ 13 | - [x] https://dl.winehq.org/wine-builds/ 14 | 15 | ## TODOs 16 | 17 | - [x] Add "--include": Sync even if the file is excluded by `--exclude` regex. 18 | - [x] Add supported Debian, Ubuntu, Fedora and RHEL versions support to `--include` regex. 19 | - Something like `--include debian/${DEBIAN_VERSIONS}`? 
20 | - [x] Check for APT/YUM repo integrity (avoid keeping old invalid metadata files) 21 | - (This is experimental and may not work well) 22 | 23 | ## Usage 24 | 25 | ```console 26 | > ./tsumugu --help 27 | A HTTP(S) syncing tool with lower overhead, for OSS mirrors 28 | 29 | Usage: tsumugu 30 | 31 | Commands: 32 | sync Sync files from upstream to local 33 | list List files from upstream 34 | help Print this message or the help of the given subcommand(s) 35 | 36 | Options: 37 | -h, --help Print help 38 | -V, --version Print version 39 | > ./tsumugu sync --help 40 | Sync files from upstream to local 41 | 42 | Usage: tsumugu sync [OPTIONS] 43 | 44 | Arguments: 45 | The upstream URL 46 | The local directory 47 | 48 | Options: 49 | --user-agent 50 | Customize tsumugu's user agent [default: tsumugu] 51 | --dry-run 52 | Do not download files and cleanup 53 | --threads 54 | Threads at work [default: 2] 55 | --no-delete 56 | Do not clean up after sync 57 | --max-delete 58 | Set max delete count [default: 100] 59 | --timezone-file 60 | You can set a valid URL for guessing. Set it to "no" to disable this behavior. By default it would recursively find the first file to HEAD for guessing 61 | --timezone 62 | Manually set timezone (+- hrs). This overrides timezone_file 63 | --retry 64 | Retry count for each request [default: 3] 65 | --head-before-get 66 | Do an HEAD before actual GET. Otherwise when head-before-get and allow-time-from-parser are not set, when GETting tsumugu would try checking if we still need to download it 67 | --parser 68 | Choose a main parser [default: nginx] [possible values: nginx, apache-f2, docker, directory-lister, lighttpd, caddy, fancy-index, gradle, fallback] 69 | --parser-match 70 | Choose supplementary parsers. Format: "parsername:matchpattern". matchpattern is a relative path regex. Supports multiple 71 | --exclude 72 | Excluded relative path regex. Supports multiple 73 | --include 74 | Included relative path regex (even if excluded). 
Supports multiple 75 | --skip-if-exists 76 | Skip relative path regex if they exist. Supports multiple 77 | --compare-size-only 78 | Relative path regex for those compare size only **after** HEAD (head_before_get on) or GET (head_before_get off) 79 | --trust-mtime-from-parser 80 | Allow mtime from parser if not available from HTTP headers [aliases: allow-mtime-from-parser] 81 | --apt-packages 82 | (Experimental) APT Packages file parser to find out missing packages 83 | --yum-packages 84 | (Experimental) YUM Packages file parser to find out missing packages 85 | --ignore-nonexist 86 | Ignore 404 NOT FOUND as error when downloading files 87 | --auto-fallback 88 | Allow automatically choose fallback parser when ParseError occurred 89 | --header
90 | Custom header for HTTP(S) requests in format "Headerkey: headervalue". Supports multiple 91 | --exclusion-v2 92 | The exclusion v2 mode. To keep compatibility, this is off by default 93 | -h, --help 94 | Print help 95 | -V, --version 96 | Print version 97 | > ./tsumugu list --help 98 | List files from upstream 99 | 100 | Usage: tsumugu list [OPTIONS] 101 | 102 | Arguments: 103 | The upstream URL 104 | 105 | Options: 106 | --user-agent Customize tsumugu's user agent [default: tsumugu] 107 | --parser Choose a main parser [default: nginx] [possible values: nginx, apache-f2, docker, directory-lister, lighttpd, caddy, fancy-index, gradle, fallback] 108 | --exclude Excluded relative path regex. Supports multiple 109 | --include Included relative path regex (even if excluded). Supports multiple 110 | --upstream-base The upstream base starting with "/" [default: /] 111 | --header
Custom header for HTTP(S) requests in format "Headerkey: headervalue". Supports multiple 112 | --exclusion-v2 The exclusion v2 mode. To keep compatibility, this is off by default 113 | -h, --help Print help 114 | -V, --version Print version 115 | ``` 116 | 117 | For a very brief introduction of parser, see [./docs/parser.md](./docs/parser.md). 118 | 119 | ## Exit code 120 | 121 | - 0: Success 122 | - 1: Failed to list 123 | - 2: Failed to download 124 | - 3: A panic!() occurred 125 | - 4: Error when cleaning up 126 | - 25: The limit stopped deletions 127 | 128 | ## Building with musl 129 | 130 | Unfortunately, this requires openssl-sys, which is not included in cross's prebuilt images. Try https://github.com/clux/muslrust. 131 | 132 | ## Evaluation 133 | 134 | Default concurrency is 2 threads. 135 | 136 | (Note: Please see [examples](./examples/) for latest commands to sync.) 137 | 138 | ### http://download.proxmox.com/ 139 | 140 | Proxmox uses a self-hosted CDN server architecture, and unfortunately its server limits concurrency to only 1 (as far as I could test). With traditional lftp/rclone it could take > 10 hours to sync once (even when your local files are identical with remote ones). 141 | 142 | Note: Consider using [Proxmox Offline Mirror](https://pom.proxmox.com/) or other tools like `apt-mirror` if you only need its APT repository. 143 | 144 | ```console 145 | > time ./tsumugu sync --threads 1 --dry-run --exclude '^temp' http://download.proxmox.com/ /srv/repo/proxmox/ 146 | ... 147 | 148 | real 1m48.746s 149 | user 0m3.468s 150 | sys 0m3.385s 151 | ``` 152 | 153 | ### https://download.docker.com/ 154 | 155 | We use [a special script](https://github.com/ustclug/ustcmirror-images/blob/master/docker-ce/tunasync/sync.py) for syncing docker-ce before, but tsumugu can also handle this now. And also, for 30x inside linux/centos/ and linux/rhel/, tsumugu could create symlinks as what this script do before. 
156 | 157 | ```console 158 | > time ./tsumugu sync --timezone-file https://download.docker.com/linux/centos/docker-ce-staging.repo --parser docker --dry-run https://download.docker.com/ /srv/repo/docker-ce/ 159 | ... 160 | 161 | real 8m32.674s 162 | user 0m4.532s 163 | sys 0m2.855s 164 | ``` 165 | 166 | ### https://dl.winehq.org/wine-builds/ 167 | 168 | lftp/rclone fails to handle complex HTML. 169 | 170 | ```console 171 | > time ./tsumugu sync --parser apache-f2 --dry-run --exclude '^mageia' --exclude '^macosx' --exclude '^debian' --exclude '^ubuntu' --exclude '^fedora' --include '^debian/dists/${DEBIAN_CURRENT}' --include '^ubuntu/dists/${UBUNTU_LTS}' --include '^fedora/${FEDORA_CURRENT}' https://dl.winehq.org/wine-builds/ /srv/repo/wine/wine-builds/ 172 | ... 173 | 174 | INFO ThreadId(01) tsumugu: (Estimated) Total objects: 17514, total size: 342.28 GiB 175 | 176 | real 0m5.664s 177 | user 0m1.475s 178 | sys 0m0.294s 179 | ``` 180 | 181 | ## Notes 182 | 183 | ### Yuki integration 184 | 185 | See . 186 | 187 | YAML example: 188 | 189 | ```yaml 190 | envs: 191 | UPSTREAM: http://download.proxmox.com/ 192 | TSUMUGU_EXCLUDE: --exclude ^temp --exclude pmg/dists/.+changelog$ --exclude devel/dists/.+changelog$ 193 | TSUMUGU_TIMEZONEFILE: http://download.proxmox.com/images/aplinfo.dat 194 | TSUMUGU_THREADS: 1 195 | image: ustcmirror/tsumugu:latest 196 | interval: 12 3 * * * 197 | logRotCycle: 10 198 | name: proxmox 199 | storageDir: /srv/repo/proxmox/ 200 | ``` 201 | 202 | More examples in [examples/](./examples/). 203 | 204 | ### Regex variables 205 | 206 | See [./src/regex_manager/mod.rs](./src/regex_manager/mod.rs). 207 | 208 | ### Exclusion and inclusion 209 | 210 | **There's a breaking change since 20240902. User regexes with `^` and `$` would be affected.** 211 | 212 | See [./docs/exclusion.md](./docs/exclusion.md). 213 | 214 | ### Deduplication 215 | 216 | Tsumugu relies on local file size and mtime to check if file shall be downloaded. 
Some file-level deduplicators like [jdupes](https://codeberg.org/jbruchon/jdupes) would ignore file mtime when deduplicating with hard links. This could be an issue for some repos, as some files would be redownloaded again and again every time as it does not have a correct mtime locally. 217 | 218 | Workarounds: 219 | 220 | - Set `--compare-size-only`. 221 | - Use filesystem-level/block-level deduplication like `zfs dedup`. 222 | - Use another file-level deduplicator which considers mtime (though I don't know which would do this). 223 | 224 | Also, if you are sure that some directory is identical with another, you could manually create a symlink for that. Tsumugu would ignore symlinks during syncing. 225 | 226 | ## Acknowledgements 227 | 228 | Special thanks to [NJU Mirror](https://mirrors.nju.edu.cn/) for extensive testing and bug reporting. 229 | 230 | ## Naming 231 | 232 | The name "tsumugu", and current branch name "pudding", are derived from the manga *A Drift Girl and a Noble Moon*. 233 | 234 |
235 | And... 236 | tsumugu, drawn as simplified version of hitori 237 | 238 | Tsumugu in the appearance of a very simplified version of Hitori (Obviously I am not very good at drawing though). 239 |
240 | 241 | Old (2020), unfinished golang version is named as "traverse", under the `main-old` branch. 242 | -------------------------------------------------------------------------------- /build.rs: -------------------------------------------------------------------------------- 1 | fn main() -> shadow_rs::SdResult<()> { 2 | shadow_rs::new() 3 | } 4 | -------------------------------------------------------------------------------- /docs/exclusion.md: -------------------------------------------------------------------------------- 1 | # Tsumugu exclusion/inclusion logic and rules 2 | 3 | ## v2 4 | 5 | v2 is a breaking but much simpler change. It is based on two simple rules: 6 | 7 | - Match excludes and includes by their order in argv. 8 | - If nothing is matched, include. 9 | 10 | You need `--exclusion-v2` to enable this new behavior. 11 | 12 | > [!TIP] 13 | > To include `/a/b/c/d`, `/`, `/a/`, `/a/b/`, `/a/b/c/` and `/a/b/c/d` shall all be included. 14 | 15 | ## v1 16 | 17 | Currently tsumugu follows a simple algorithm to determine whether a path should be completely excluded, partially excluded, or included: 18 | 19 | 0. When parsing regex, a `rev_inner` regex will be generated by replacing variables (`${UBUNTU_LTS}`, etc.) to `(?.+)` (aka, match everything). The `rev_inner` would be used like this: 20 | 21 | ```rust 22 | pub fn is_others_match(&self, text: &str) -> bool { 23 | !self.inner.is_match(text) && self.rev_inner.is_match(text) 24 | } 25 | ``` 26 | 27 | 1. First, users' exclusions and inclusions are preprocessed. For all **exclusions, if it is a prefix of any inclusion**, it will be put into the `list_only_regexes`, otherwise it will be put into `instant_stop_regexes`. All inclusions are in `include_regexes`. 28 | 2. While working threads are handling listing requests: 29 | 1. 
Check with `instant_stop_regexes` and `include_regexes`: 30 | 31 | ```rust 32 | for regex in &self.instant_stop_regexes { 33 | if regex.is_match(text) { 34 | return Comparison::Stop; 35 | } 36 | } 37 | for regex in &self.include_regexes { 38 | if regex.is_match(text) { 39 | return Comparison::Ok; 40 | } 41 | } 42 | ``` 43 | 44 | 2. Then, the path will be checked with `rev_inner` regex by `is_others_match()`, and also completely excluded if matches (a fast shortcut). 45 | 46 | This is used for cases like Fedora -- it has many versions (currently from 1 to 40). Listing other version folders not in `${FEDORA_CURRENT}` is a waste of time and network. With this trick we could skip these unmatched versions. 47 | 3. Finally, if the path matches `list_only_regexes`, files under this directory will be ignored (unless they are matched by `include_regexes`), but subdirectories will still be listed. Paths that are not matched by any regexes will be included as usual. 48 | 49 | In this process some paths, which would be unnecessary, will still be listed. However, this logic suits needs of filtering OS versions well. 50 | 51 | ## Relative path 52 | 53 | Also note that currently, this is used when generating relative path for comparison: 54 | 55 | ```rust 56 | pub fn relative_to_str(relative: &[String], filename: Option<&str>) -> String { 57 | let mut r = relative.join("/"); 58 | if r.starts_with('/') { 59 | warn!("unexpected / at the beginning of relative ({r})"); 60 | } else { 61 | r.insert(0, '/'); 62 | } 63 | if r.len() != 1 { 64 | if r.ends_with('/') { 65 | warn!("unexpected / at the end of relative ({r})") 66 | } else { 67 | r.push('/') 68 | } 69 | } 70 | 71 | // here r already has / at the end 72 | match filename { 73 | None => r, 74 | Some(filename) => { 75 | assert!(!filename.starts_with('/') && !filename.ends_with('/')); 76 | format!("{}{}", r, filename) 77 | } 78 | } 79 | } 80 | ``` 81 | 82 | As a result: 83 | 84 | 1. 
All relative paths for comparison have "/" at front. 85 | 2. Directory paths have "/" at back, and files don't. 86 | 87 | Examples: 88 | 89 | 1. `http://example.com/file` => `/file` 90 | 2. `http://example.com/dir` => `/dir/` 91 | 3. `http://example.com/dir/file` => `/dir/file` 92 | 93 | Not that for compatibilities considerations, this trick is done: User regex which starts with `^` and not `^/`, would be replaced: `^` -> `^/` (this might break some very rare regexes). 94 | 95 | So you could **write `/something$` to exclude ALL files and directories with name `something`**, instead of using 2 regexes (`^something$` and `/something$`, to match `something` at root and others not in root). 96 | 97 | And also, `upstream` itself is NOT included when comparing. So if your upstream is set to `https://some.example.com/dir/`, you need to exclude `^something/` to exclude `https://some.example.com/dir/something/` instead of `^dir/something/`. 98 | 99 | Test with [tsumugu list](./parser.md#debugging), if in doubt. 100 | -------------------------------------------------------------------------------- /docs/parser.md: -------------------------------------------------------------------------------- 1 | # Parsers of tsumugu 2 | 3 | This is a list of parsers that tsumugu supports: 4 | 5 | - apache_f2: [Apache2's autoindex](https://httpd.apache.org/docs/2.4/mod/mod_autoindex.html) with HTMLTable FancyIndexed list (`F=2`). 6 | - directory_lister: [Directory Lister](https://www.directorylister.com/). 7 | - docker: A specialized parser for . 8 | - lighttpd: [lighttpd's mod_dirlisting](https://redmine.lighttpd.net/projects/lighttpd/wiki/Docs_ModDirlisting). 9 | - nginx: [Nginx's autoindex](https://nginx.org/en/docs/http/ngx_http_autoindex_module.html). It should also work with Apache2's autoindex `F=1` mode. 10 | - caddy: [Caddy's file_server](https://caddyserver.com/docs/caddyfile/directives/file_server). 
11 | - fancyindex: [Nginx fancyindex](https://github.com/aperezdc/ngx-fancyindex). 12 | - gradle: A specialized parser for , might suitable for other websites like this: 13 | 14 | ```html 15 |
  • 16 | 17 | gradle-8.10-wrapper.jar.sha256 18 | 14-Aug-2024 11:18 +0000 19 | 64.00B 20 | 21 |
  • 22 | ``` 23 | 24 | - denoflare-r2: Specialized parser for . 25 | - fallback: An inefficient fallback parser for `index.htm(l)` which is NOT a file listing: 26 | 27 | ```rust 28 | // An inefficient fallback parser only for non-listing HTML. 29 | // Limitations: 30 | // 1. It requires /index.html or /index.htm available. 31 | // Parser cannot write to disk, so index file would be accessed twice during sync. 32 | // 2. Currently it ignores files in directories. 33 | // For example, it recognizes "static/css.css" as contains a "static" directory only. 34 | // If "static/" is inaccessible, "static/css.css" would NOT be synced. 35 | // In future it might be implemented when we have another parser returning a full file tree. 36 | // 3. It would always try HEAD to confirm existence and get file mtime & size. Items with 403/404 code would be ignored. 37 | // 4. It does not try parse other html files. 38 | // 5. It only looks for . , 31 | 32 | 33 | -------------------------------------------------------------------------------- /fixtures/artifactrepo/index.html: -------------------------------------------------------------------------------- 1 | 2 | Index of 3 | openjdk-local 4 | 5 | 6 | 7 |

    8 |

    Index of openjdk-local

    9 |

    10 |
    Name         Last modified     Size
    11 |
    12 |
    10/          22-Aug-2021 15:18 -
    13 | 10.0.1/      22-Aug-2021 15:18 -
    14 | 10.0.2/      22-Aug-2021 15:18 -
    15 | 11.0.1/      22-Aug-2021 15:19 -
    16 | 11.0.2/      22-Aug-2021 15:19 -
    17 | 12/          22-Aug-2021 15:19 -
    18 | 12.0.1/      22-Aug-2021 15:19 -
    19 | 12.0.2/      22-Aug-2021 15:19 -
    20 | 13/          22-Aug-2021 15:19 -
    21 | 13.0.1/      22-Aug-2021 15:20 -
    22 | 13.0.2/      22-Aug-2021 15:20 -
    23 | 14/          22-Aug-2021 15:20 -
    24 | 14.0.1/      22-Aug-2021 15:20 -
    25 | 14.0.2/      22-Aug-2021 15:20 -
    26 | 15/          22-Aug-2021 15:21 -
    27 | 15.0.1/      22-Aug-2021 15:21 -
    28 | 15.0.2/      22-Aug-2021 15:21 -
    29 | 16/          22-Aug-2021 15:21 -
    30 | 16.0.1/      22-Aug-2021 15:21 -
    31 | 16.0.2/      22-Aug-2021 15:22 -
    32 | 17/          15-Sep-2021 03:36 -
    33 | 17.0.1/      20-Oct-2021 03:38 -
    34 | 17.0.2/      19-Jan-2022 03:39 -
    35 | 18/          23-Mar-2022 03:36 -
    36 | 18.0.1/      21-Apr-2022 03:36 -
    37 | 18.0.1.1/    03-May-2022 03:40 -
    38 | 18.0.2/      20-Jul-2022 03:37 -
    39 | 18.0.2.1/    20-Aug-2022 03:42 -
    40 | 19/          21-Sep-2022 03:36 -
    41 | 19.0.1/      19-Oct-2022 03:37 -
    42 | 19.0.2/      20-Dec-2022 12:17 -
    43 | 20/          14-Feb-2023 00:53 -
    44 | 20.0.1/      30-Mar-2023 16:43 -
    45 | 20.0.2/      26-Jun-2023 15:00 -
    46 | 21/          12-Aug-2023 00:39 -
    47 | 21.0.1/      06-Oct-2023 16:18 -
    48 | 21.0.2/      06-Jan-2024 15:26 -
    49 | 22/          17-Feb-2024 00:20 -
    50 | 22.0.1/      26-Mar-2024 13:00 -
    51 | 22.0.2/      20-Jun-2024 17:06 -
    52 | 9/           22-Aug-2021 15:22 -
    53 | 9.0.1/       21-May-2022 03:39 -
    54 | 9.0.4/       22-Aug-2021 15:22 -
    55 | java-jse-ri/ 22-Aug-2021 15:22 -
    56 | 
    57 |
    58 |
    59 | ArtifactRepo/ Server Port 443 60 |
    61 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /fixtures/buildroot/acl/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Index of /acl/ 6 | 20 | 21 | 22 |

    Index of /acl/

    23 |
    24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 |
    NameLast ModifiedSizeType
    ../ -  Directory
    acl-2.2.52.src.tar.gz2013-May-19 06:10:38377.5Kapplication/x-gtar-compressed
    acl-2.2.53.tar.gz2018-Jul-10 22:18:45512.0Kapplication/x-gtar-compressed
    acl-2.3.1.tar.xz2021-Apr-09 21:39:55347.3Kapplication/x-xz
    acl-2.3.2.tar.xz2024-Feb-07 03:04:10362.9Kapplication/x-xz
    34 |
    35 |
    lighttpd/1.4.67
    36 | 37 | 226 | 227 | 228 | 229 | -------------------------------------------------------------------------------- /fixtures/buildroot/mimalloc/build.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taoky/tsumugu/7b4673cdb05d87c24e8eb0c3e53dab5bece3efc3/fixtures/buildroot/mimalloc/build.html -------------------------------------------------------------------------------- /fixtures/buildroot/mimalloc/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | mi-malloc: Main Page 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 24 | 25 | 26 | 27 | 28 |
    29 |
    30 | 31 | 32 | 33 | 34 | 39 | 54 | 55 | 56 |
    35 |
    mi-malloc 36 |  1.7/2.0 37 |
    38 |
    40 | 41 | 45 | 49 | 50 | 51 | 52 |
    53 |
    57 |
    58 | 59 | 60 | 65 |
    66 |
    67 | 72 |
    74 |
    75 |
    76 | 81 |
    82 | 83 |
    87 |
    88 | 89 | 90 |
    91 | 94 |
    95 | 96 |
    97 |
    98 |
    mi-malloc Documentation
    99 |
    100 |
    101 |

    This is the API documentation of the mimalloc allocator (pronounced "me-malloc") – a general purpose allocator with excellent performance characteristics. Initially developed by Daan Leijen for the run-time systems of the Koka and Lean languages.

    102 |

    It is a drop-in replacement for malloc and can be used in other programs without code changes, for example, on Unix you can use it as:

    > LD_PRELOAD=/usr/bin/libmimalloc.so myprogram
    103 |

    Notable aspects of the design include:

    104 |
      105 |
    • small and consistent: the library is about 8k LOC using simple and consistent data structures. This makes it very suitable to integrate and adapt in other projects. For runtime systems it provides hooks for a monotonic heartbeat and deferred freeing (for bounded worst-case times with reference counting).
    • 106 |
    • free list sharding: instead of one big free list (per size class) we have many smaller lists per "mimalloc page" which reduces fragmentation and increases locality – things that are allocated close in time get allocated close in memory. (A mimalloc page contains blocks of one size class and is usually 64KiB on a 64-bit system).
    • 107 |
    • free list multi-sharding: the big idea! Not only do we shard the free list per mimalloc page, but for each page we have multiple free lists. In particular, there is one list for thread-local free operations, and another one for concurrent free operations. Free-ing from another thread can now be a single CAS without needing sophisticated coordination between threads. Since there will be thousands of separate free lists, contention is naturally distributed over the heap, and the chance of contending on a single location will be low – this is quite similar to randomized algorithms like skip lists where adding a random oracle removes the need for a more complex algorithm.
    • 108 |
    • eager page reset: when a "page" becomes empty (with increased chance due to free list sharding) the memory is marked to the OS as unused ("reset" or "purged") reducing (real) memory pressure and fragmentation, especially in long running programs.
    • 109 |
    • secure: mimalloc can be build in secure mode, adding guard pages, randomized allocation, encrypted free lists, etc. to protect against various heap vulnerabilities. The performance penalty is only around 5% on average over our benchmarks.
    • 110 |
    • first-class heaps: efficiently create and use multiple heaps to allocate across different regions. A heap can be destroyed at once instead of deallocating each object separately.
    • 111 |
    • bounded: it does not suffer from blowup [1], has bounded worst-case allocation times (wcat), bounded space overhead (~0.2% meta-data, with at most 12.5% waste in allocation sizes), and has no internal points of contention using only atomic operations.
    • 112 |
    • fast: In our benchmarks (see below), mimalloc outperforms all other leading allocators (jemalloc, tcmalloc, Hoard, etc), and usually uses less memory (up to 25% more in the worst case). A nice property is that it does consistently well over a wide range of benchmarks.
    • 113 |
    114 |

    You can read more on the design of mimalloc in the technical report which also has detailed benchmark results.

    115 |

    Further information:

    116 | 133 |
    134 |
    135 |
    136 | 137 | 142 | 143 | 144 | -------------------------------------------------------------------------------- /fixtures/buildroot/mimalloc/test/test: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taoky/tsumugu/7b4673cdb05d87c24e8eb0c3e53dab5bece3efc3/fixtures/buildroot/mimalloc/test/test -------------------------------------------------------------------------------- /fixtures/buildroot/mimalloc/using.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/taoky/tsumugu/7b4673cdb05d87c24e8eb0c3e53dab5bece3efc3/fixtures/buildroot/mimalloc/using.html -------------------------------------------------------------------------------- /fixtures/clickhouse/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 18 | 19 | 20 |
    21 |
    root ⟩ repo-archive
    22 |
     
    23 | deb/ 24 | rpm/ 25 | tgz/ 26 |
     
    27 |
    0
    2022-09-23T13:39:52.080Z
    28 | CLICKHOUSE-KEY.GPG
    3,133
    (3.06 kb)
    2022-09-23T13:53:51.925Z
    29 |
    30 | 31 | -------------------------------------------------------------------------------- /fixtures/clickhouse/stable/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 18 | 19 | 20 |
    21 |
    roottgz ⟩ stable
    22 |
     
    23 |
    0
    2022-09-21T12:08:53.750Z
    24 | clickhouse-client-21.1.9.41.tgz.sha512
    162
    2023-11-24T13:12:00.540Z
    25 | clickhouse-client-21.1.9.41.tgz
    161,531
    (157.75 kb)
    2022-09-21T23:58:17.236Z
    26 |
     
    27 | 28 |
    29 | 30 | -------------------------------------------------------------------------------- /fixtures/clickhouse/stable/index2.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 18 | 19 | 20 |
    21 |
    roottgz ⟩ stable
    22 |
     
    23 | clickhouse-client-23.7.3.14-arm64.tgz.sha512
    176
    2023-08-05T19:31:50.283Z
    24 | clickhouse-client-23.7.3.14-arm64.tgz
    79,837
    (77.97 kb)
    2023-08-05T19:30:42.398Z
    25 |
    26 | 27 | -------------------------------------------------------------------------------- /fixtures/docker/armv7l/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Index of linux/centos/7/armv7l/ 6 | 7 | 8 |

    Index of linux/centos/7/armv7l/

    9 |
    10 |
    ../
    11 | nightly/                                    2020-01-21 07:38  -
    12 | test/                                       2020-01-21 07:38  -
    13 | 

    14 | -------------------------------------------------------------------------------- /fixtures/docker/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Index of linux/centos/ 6 | 7 | 8 |

    Index of linux/centos/

    9 |
    10 |
    ../
    11 | 7.0/
    12 | 7.1/
    13 | 7.2/
    14 | 7.3/
    15 | 7.4/
    16 | 7.5/
    17 | 7.6/
    18 | 7.7/
    19 | 7.8/
    20 | 7.9/
    21 | 7/
    22 | 7Client/
    23 | 7Server/
    24 | 7Workstation/
    25 | 8.0/
    26 | 8.1/
    27 | 8.2/
    28 | 8.3/
    29 | 8.4/
    30 | 8.5/
    31 | 8.6/
    32 | 8.7/
    33 | 8.8/
    34 | 8.9/
    35 | 8/
    36 | 8Client/
    37 | 8Server/
    38 | 8Workstation/
    39 | 9.0/
    40 | 9.1/
    41 | 9.2/
    42 | 9.3/
    43 | 9.4/
    44 | 9.5/
    45 | 9.6/
    46 | 9.7/
    47 | 9.8/
    48 | 9.9/
    49 | 9/
    50 | 9Client/
    51 | 9Server/
    52 | 9Workstation/
    53 | docker-ce-staging.repo                                                                2023-07-07 20:20:56 2.0 KiB
    54 | docker-ce.repo                                                                        2023-07-07 20:20:51 1.9 KiB
    55 | gpg                                                                                   2023-07-07 20:21:31 1.6 KiB
    56 | 

    57 | -------------------------------------------------------------------------------- /fixtures/ghettoforge/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Index of /distributions/gf 5 | 6 | 7 |

    Index of /distributions/gf

    8 |
    Icon  Name                                Last modified      Size  Description
    [PARENTDIR] Parent Directory - 9 | [   ] RPM-GPG-KEY-gf.el7 2014-12-30 02:53 3.0K 10 | [   ] RPM-GPG-KEY-gf.el8 2020-01-13 09:40 3.1K 11 | [   ] RPM-GPG-KEY-gf.el9 2022-08-03 11:28 1.6K 12 | [DIR] archive/ 2020-12-21 02:34 - 13 | [DIR] el/ 2022-08-02 11:57 - 14 | [   ] gf-release-latest.gf.el7.noarch.rpm 2021-08-21 10:38 8.0K 15 | [   ] gf-release-latest.gf.el8.noarch.rpm 2021-08-21 10:39 11K 16 | [   ] gf-release-latest.gf.el9.noarch.rpm 2022-08-03 12:16 9.2K 17 |
    18 | 19 | -------------------------------------------------------------------------------- /fixtures/grml/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Index of /pool/main/m 5 | 6 | 7 |

    Index of /pool/main/m

    8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
    [ICO]NameLast modifiedSizeDescription

    [PARENTDIR]Parent Directory  -  
    [DIR]mozilla-firefox-adblock/2006-11-27 07:00 -  
    [DIR]msgid-chooser/2006-11-27 07:00 -  
    [DIR]magicrescue/2006-12-10 10:30 -  
    [DIR]memtest86+/2007-03-27 22:41 -  
    [DIR]misdn-kernel/2007-07-11 19:45 -  
    [DIR]minised/2007-11-04 19:15 -  
    [DIR]md5deep/2007-11-04 21:09 -  
    [DIR]multiseat/2010-01-08 17:40 -  
    [DIR]mdadm/2013-02-22 10:40 -  
    [DIR]madwifi/2024-10-07 18:11 -  

    24 | 25 | -------------------------------------------------------------------------------- /fixtures/loongnix/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Nginx Directory 7 | 8 | 9 | 10 | 11 | 12 | 18 |
    19 |

    Loongnix操作系统源

    20 |

    访问地址 http://pkg.loongnix.cn

    21 |

    Directory: 22 | /loongnix/dists/DaoXiangHu-cartoons/

    23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 |
    File Name  ↓ File Size  ↓ Date  ↓ 
    --
    -2023-08-15 05:48
    -2023-08-15 05:48
    -2023-08-15 05:48
    10.0 MiB2023-08-15 05:48
    659.9 KiB2023-08-15 05:48
    454.6 KiB2023-08-15 05:48
    7.9 KiB2023-08-15 05:48
    7.2 KiB2023-08-15 05:48
    659 B2023-08-15 05:48
    35 | 38 | 39 | 40 | 41 | 68 | 69 | 70 | 76 | -------------------------------------------------------------------------------- /fixtures/misc/1/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Misc test index 1 4 | 5 | 6 |

    /etc/

    7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 |
    FilenameSizeLast ModifiedSHA256
    passwd3.3 KB2024-08-24 15:04:11 +0000477a3d43f692aeaf1c7f40c0c91bffde3e2e638d8e90c668422373ee82a18521
    20 | 21 | 22 | -------------------------------------------------------------------------------- /fixtures/monitoring-plugins/index.html: -------------------------------------------------------------------------------- 1 | 2 | Index of /monitoring-plugins/ 3 | 4 |

    Index of /monitoring-plugins/


    ../
     5 | archive/                                           09-Oct-2015 16:12                   -
     6 | mib/                                               29-Nov-2013 20:20                   -
     7 | presentation/                                      26-Sep-2013 05:15                   -
     8 | snapshot/                                          10-Feb-2023 17:42                   -
     9 | monitoring-plugins-2.0.tar.gz                      11-Jul-2014 23:17             2610000
    10 | monitoring-plugins-2.0.tar.gz.sha1                 11-Jul-2014 23:17                  72
    11 | monitoring-plugins-2.1.1.tar.gz                    02-Dec-2014 07:46             2612331
    12 | monitoring-plugins-2.1.1.tar.gz.sha1               02-Dec-2014 07:46                  74
    13 | monitoring-plugins-2.1.2.tar.gz                    16-Oct-2015 17:40             2613060
    14 | monitoring-plugins-2.1.2.tar.gz.sha1               16-Oct-2015 17:40                  74
    15 | monitoring-plugins-2.1.tar.gz                      15-Oct-2014 20:32             2611940
    16 | monitoring-plugins-2.1.tar.gz.sha1                 15-Oct-2014 20:32                  72
    17 | monitoring-plugins-2.2.tar.gz                      29-Nov-2016 16:49             2461548
    18 | monitoring-plugins-2.2.tar.gz.sha1                 29-Nov-2016 16:49                  72
    19 | monitoring-plugins-2.3.1.tar.gz                    11-Apr-2021 17:07             2529669
    20 | monitoring-plugins-2.3.1.tar.gz.sha1               11-Apr-2021 17:07                  74
    21 | monitoring-plugins-2.3.2.tar.gz                    19-Oct-2022 20:58             2766966
    22 | monitoring-plugins-2.3.2.tar.gz.sha1               19-Oct-2022 20:58                  74
    23 | monitoring-plugins-2.3.3.tar.gz                    01-Feb-2023 21:53             2620192
    24 | monitoring-plugins-2.3.3.tar.gz.sha1               01-Feb-2023 21:53                  74
    25 | monitoring-plugins-2.3.tar.gz                      10-Dec-2020 05:50             2528556
    26 | monitoring-plugins-2.3.tar.gz.sha1                 10-Dec-2020 05:50                  72
    27 | timestamp                                          20-Jul-2023 10:46                  11
    28 | 

    29 | 30 | -------------------------------------------------------------------------------- /fixtures/mozilla/OJI/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Directory Listing: /pub/OJI/ 6 | 7 | 8 |

    Index of /pub/OJI/

    9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 |
    TypeNameSizeLast Modified
    Dir..
    DirMRJPlugin/
    FileMRJPlugin.sit.hqx234K13-Feb-2023 04:21
    43 | 44 | -------------------------------------------------------------------------------- /fixtures/mysql/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Index of /232905/apt/ubuntu/pool/mysql-tools/m 5 | 6 | 7 |

    Index of /232905/apt/ubuntu/pool/mysql-tools/m

    8 |
       Name                              Last modified        Size  
     9 | 
    10 | [DIR] Parent Directory 01-Jan-1970 00:00 - 11 | [DIR] mysql-community/ 19-Apr-2023 14:57 - 12 | [DIR] mysql-connector-c++/ 24-Oct-2023 18:04 - 13 | [DIR] mysql-connector-j/ 24-Oct-2023 18:17 - 14 | [DIR] mysql-connector-java/ 08-Oct-2022 08:19 - 15 | [DIR] mysql-connector-odbc/ 24-Oct-2023 17:29 - 16 | [DIR] mysql-connector-python/ 25-Oct-2023 16:10 - 17 | [DIR] mysql-router/ 24-Apr-2019 12:18 - 18 | [DIR] mysql-shell/ 19-Apr-2023 07:30 - 19 | [DIR] mysql-utilities/ 07-Nov-2017 09:27 - 20 | [DIR] mysql-workbench-community/ 19-Apr-2023 06:44 - 21 |

    22 | 23 | -------------------------------------------------------------------------------- /fixtures/nodejs/v4.9.1/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Index of /dist/v4.9.1/ 4 | 18 | 19 | 20 |

    Index of /dist/v4.9.1/


    ../
    21 | docs/                                                             -                   -
    22 | win-x64/                                                          -                   -
    23 | win-x86/                                                          -                   -
    24 | SHASUMS256.txt.asc                                 04-Nov-2024 17:40               4.1 KB
    25 | SHASUMS256.txt.sig                                 04-Nov-2024 17:40                310 B
    26 | SHASUMS256.txt                                     04-Nov-2024 17:40               3.6 KB
    27 | node-v4.9.1-darwin-x64.tar.gz                      30-Oct-2024 18:21                10 MB
    28 | node-v4.9.1-darwin-x64.tar.xz                      04-Nov-2024 17:40               7.1 MB
    29 | node-v4.9.1-headers.tar.gz                         04-Nov-2024 17:40               471 KB
    30 | node-v4.9.1-headers.tar.xz                         04-Nov-2024 17:40               342 KB
    31 | node-v4.9.1-linux-arm64.tar.gz                     30-Oct-2024 18:21                12 MB
    32 | node-v4.9.1-linux-arm64.tar.xz                     04-Nov-2024 17:40               7.7 MB
    33 | node-v4.9.1-linux-armv6l.tar.gz                    30-Oct-2024 18:21                11 MB
    34 | node-v4.9.1-linux-armv6l.tar.xz                    04-Nov-2024 17:40               7.3 MB
    35 | node-v4.9.1-linux-armv7l.tar.gz                    30-Oct-2024 18:21                11 MB
    36 | node-v4.9.1-linux-armv7l.tar.xz                    04-Nov-2024 17:40               7.3 MB
    37 | node-v4.9.1-linux-ppc64.tar.gz                     30-Oct-2024 18:21                12 MB
    38 | node-v4.9.1-linux-ppc64.tar.xz                     04-Nov-2024 17:40               7.5 MB
    39 | node-v4.9.1-linux-ppc64le.tar.gz                   30-Oct-2024 18:21                12 MB
    40 | node-v4.9.1-linux-ppc64le.tar.xz                   04-Nov-2024 17:40               7.6 MB
    41 | node-v4.9.1-linux-x64.tar.gz                       30-Oct-2024 18:21                12 MB
    42 | node-v4.9.1-linux-x64.tar.xz                       04-Nov-2024 17:40               8.2 MB
    43 | node-v4.9.1-linux-x86.tar.gz                       30-Oct-2024 18:21                12 MB
    44 | node-v4.9.1-linux-x86.tar.xz                       04-Nov-2024 17:40               7.8 MB
    45 | node-v4.9.1-sunos-x64.tar.gz                       30-Oct-2024 18:21                13 MB
    46 | node-v4.9.1-sunos-x64.tar.xz                       30-Oct-2024 18:21               8.4 MB
    47 | node-v4.9.1-sunos-x86.tar.gz                       30-Oct-2024 18:21                12 MB
    48 | node-v4.9.1-sunos-x86.tar.xz                       04-Nov-2024 17:40               7.7 MB
    49 | node-v4.9.1-win-x64.7z                             04-Nov-2024 17:40               6.1 MB
    50 | node-v4.9.1-win-x64.zip                            30-Oct-2024 18:21                11 MB
    51 | node-v4.9.1-win-x86.7z                             04-Nov-2024 17:40               5.4 MB
    52 | node-v4.9.1-win-x86.zip                            30-Oct-2024 18:21               9.6 MB
    53 | node-v4.9.1-x64.msi                                30-Oct-2024 18:21                11 MB
    54 | node-v4.9.1-x86.msi                                30-Oct-2024 18:21              10.0 MB
    55 | node-v4.9.1.pkg                                    30-Oct-2024 18:21                13 MB
    56 | node-v4.9.1.tar.gz                                 30-Oct-2024 18:21                23 MB
    57 | node-v4.9.1.tar.xz                                 30-Oct-2024 18:21                13 MB
    58 | 

    59 | 60 | -------------------------------------------------------------------------------- /fixtures/start_fileserver.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | python3 -m http.server 1921 -d $(dirname "$0") -------------------------------------------------------------------------------- /fixtures/vscode/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Index of /repos/vscode/ 4 | 5 |

    Index of /repos/vscode/

    6 |
    ../
     7 | dists/                                                                                              06-Mar-2025 07:19  481 Bytes
     8 | pool/                                                                                               05-Sep-2024 18:01  104.1 MB
     9 | 

    10 | -------------------------------------------------------------------------------- /fixtures/wine-builds/index.html: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | Index of /wine-builds 6 | 7 | 8 | 9 | 10 |
    11 | 12 |
    13 |
    14 | WineHQ 15 |
    16 | 17 |
    Run Windows applications on Linux, BSD, Solaris and Mac OS X.
    18 | 19 | 24 | 25 |
    26 | 33 |
    34 | 35 |
    36 |
    37 | 38 | 39 |

    Wine Download Server

    40 | 41 |
    42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 |
    [ICO]NameLast modifiedSize
    [PARENTDIR]Parent Directory  -
    [DIR]android/2022-01-18 15:14 -
    [DIR]debian/2019-01-07 19:52 -
    [DIR]fedora/2023-04-20 14:52 -
    [DIR]macosx/2017-03-30 15:49 -
    [DIR]mageia/2017-09-29 23:46 -
    [DIR]ubuntu/2019-01-03 09:20 -
    [   ]Release.key2017-03-28 14:54 3.0K
    [   ]winehq.key2018-12-19 08:07 3.1K
    55 |
    56 |
    57 | 58 |
    59 | 60 |
    61 |
    62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 77 | 78 | 82 | 83 | -------------------------------------------------------------------------------- /fixtures/zabbix/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Zabbix Cloud Images and Appliances 5 | 6 | 7 | 8 |
    9 |

    Zabbix Cloud Images and Appliances

    10 | Zabbix is an enterprise-class open source distributed monitoring solution designed to monitor and track performance and availability of network servers, devices and other IT resources. It supports distributed and WEB monitoring, auto-discovery, and more. 11 |

    12 | Zabbix appliances are available on Google Cloud Platform along with previously released Zabbix appliances, Microsoft Azure, DigitalOcean and Amazon Web Services. Zabbix is now available on all major cloud platforms. 13 |

    14 | These appliances are created and officially supported by Zabbix SIA. 15 |

    16 | Installation instructions are available in Zabbix Cloud Images page. 17 |

    18 | If you have any problems or suggestions, please report an issue on Zabbix Bug Tracking System. 19 |

    20 | If you want to get professional support, installation or upgrade service, please see our Zabbix technical support service page. 21 | 22 |
    23 |
    24 | 25 | Index of /zabbix/ 26 | 27 |

    Index of /zabbix/


    ../
    28 | appliances/                                        27-Jul-2020 11:06                   -
    29 | binaries/                                          01-Dec-2020 20:09                   -
    30 | integrations/                                      12-Nov-2021 12:30                   -
    31 | nightly/                                           24-Aug-2024 12:03                   -
    32 | sources/                                           14-Dec-2020 13:39                   -
    33 | 

    34 | 35 |
    36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /src/bar.rs: -------------------------------------------------------------------------------- 1 | pub const TEMPLATE_DEFAULT: &str = 2 | "{msg}\n[{elapsed_precise}] {bytes}/{total_bytes} ({bytes_per_sec}, {eta})"; 3 | pub fn set_progress_bar(bar: &kyuri::Bar, len: u64, url: &url::Url) { 4 | bar.set_len(len); 5 | bar.set_message(&format!("Downloading {}", url)); 6 | bar.set_template(TEMPLATE_DEFAULT); 7 | bar.set_pos(0); 8 | bar.set_visible(true); 9 | } 10 | -------------------------------------------------------------------------------- /src/cli/list.rs: -------------------------------------------------------------------------------- 1 | use crate::{ 2 | parser::ListResult, 3 | regex_manager::{get_exclusion_manager, Comparison}, 4 | utils::{build_client, relative_str_process}, 5 | AsyncContext, ListArgs, 6 | }; 7 | 8 | // TODO: clean code 9 | pub fn list(args: &ListArgs, bind_address: Option) -> ! 
{ 10 | let parser = args.parser.build(); 11 | let client = build_client(args, parser.is_auto_redirect(), bind_address.as_ref(), true); 12 | let async_context = AsyncContext { 13 | runtime: tokio::runtime::Runtime::new().unwrap(), 14 | listing_client: client.clone(), 15 | download_client: client, 16 | }; 17 | let exclusion_manager = get_exclusion_manager(args); 18 | // get relative 19 | let upstream = &args.upstream; 20 | let upstream_path = parser.get_path(upstream); 21 | let relative = upstream_path 22 | .strip_prefix(&args.upstream_base) 23 | .unwrap() 24 | .to_str() 25 | .unwrap() 26 | .to_owned(); 27 | let relative = relative_str_process(&relative); 28 | assert!(relative.starts_with('/') && relative.ends_with('/')); 29 | let list = parser.get_list(&async_context, upstream).unwrap(); 30 | let match_cmp = exclusion_manager.match_str(&relative); 31 | 32 | println!("Relative: {relative}"); 33 | println!("Exclusion: {:?}", match_cmp); 34 | if match_cmp == Comparison::Stop { 35 | tracing::warn!("This listing would NOT be accessed at all."); 36 | } 37 | match list { 38 | ListResult::Redirect(url) => { 39 | println!("Redirect to {url}"); 40 | } 41 | ListResult::List(list) => { 42 | for item in list { 43 | print!("{item}"); 44 | let new_relative = format!("{}{}", relative, item.name); 45 | tracing::debug!("new_relative: {new_relative}"); 46 | println!( 47 | "{}", 48 | match exclusion_manager.match_str(new_relative.as_str()) { 49 | crate::regex_manager::Comparison::Stop => " (stop)", 50 | crate::regex_manager::Comparison::ListOnly => " (list only)", 51 | crate::regex_manager::Comparison::Ok => "", 52 | } 53 | ); 54 | } 55 | } 56 | } 57 | 58 | std::process::exit(0); 59 | } 60 | -------------------------------------------------------------------------------- /src/cli/mod.rs: -------------------------------------------------------------------------------- 1 | mod list; 2 | mod sync; 3 | pub use list::list; 4 | pub use sync::sync; 5 | 
-------------------------------------------------------------------------------- /src/compare.rs: -------------------------------------------------------------------------------- 1 | use std::path::Path; 2 | 3 | use chrono::{DateTime, FixedOffset, Utc}; 4 | use tracing::{debug, warn}; 5 | 6 | use crate::{ 7 | listing::{FileSize, FileType, ListItem}, 8 | utils::{self, naive_to_utc}, 9 | }; 10 | 11 | pub fn compare_filetype(fstype: std::fs::FileType, tsumugu_type: FileType) -> bool { 12 | match tsumugu_type { 13 | FileType::File => fstype.is_file(), 14 | FileType::Directory => fstype.is_dir(), 15 | } 16 | } 17 | 18 | pub fn should_download_by_list( 19 | path: &Path, 20 | remote: &ListItem, 21 | remote_timezone: Option, 22 | skip_if_exists: bool, 23 | size_only: bool, 24 | ) -> bool { 25 | let local_metadata = match path.metadata() { 26 | Ok(m) => { 27 | if skip_if_exists || remote.skip_check { 28 | debug!("Skipping {:?} because it exists", path); 29 | return false; 30 | } 31 | m 32 | } 33 | Err(e) => { 34 | if e.kind() != std::io::ErrorKind::NotFound { 35 | warn!("Failed to get metadata of {:?}: {:?}", path, e); 36 | } 37 | return true; 38 | } 39 | }; 40 | if !compare_filetype(local_metadata.file_type(), remote.type_) { 41 | // TODO: delete old file which type is not correct 42 | warn!("Type mismatch: {:?} remote {:?}", path, remote.type_); 43 | return true; 44 | } 45 | let local_size = local_metadata.len(); 46 | let is_size_match = match remote.size.unwrap_or(FileSize::Precise(0)) { 47 | FileSize::Precise(size) => local_size == size, 48 | // A very rough size check is used here, 49 | // as it looks like size returned by server may not be very accurate 50 | FileSize::HumanizedBinary(size, unit) => { 51 | let base = 1024_f64.powf(unit.get_exp().into()); 52 | let lsize = local_size as f64 / base; 53 | (lsize - size).abs() < 2.0 54 | } 55 | FileSize::HumanizedDecimal(size, unit) => { 56 | let base = 1000_f64.powf(unit.get_exp().into()); 57 | let lsize = local_size as 
f64 / base; 58 | (lsize - size).abs() < 2.0 59 | } 60 | }; 61 | if !is_size_match { 62 | debug!( 63 | "Size mismatch: {:?} local {:?} remote {:?}", 64 | path, local_size, remote.size 65 | ); 66 | return true; 67 | } 68 | if size_only { 69 | return false; 70 | } 71 | let local_mtime: DateTime = match local_metadata.modified() { 72 | Ok(m) => m, 73 | Err(_) => { 74 | // Here we expect all fs to support mtime 75 | unreachable!() 76 | } 77 | } 78 | .into(); 79 | // Use remote timezone or not? 80 | let timezone = match remote.timezone { 81 | None => remote_timezone, 82 | Some(tz) => Some(tz), 83 | }; 84 | let remote_mtime = naive_to_utc(&remote.mtime, timezone); 85 | let offset = remote_mtime - local_mtime; 86 | debug!("DateTime offset: {:?} {:?}", path, offset); 87 | match timezone { 88 | None => { 89 | // allow an offset to up to 24hrs 90 | offset.num_hours().abs() > 24 91 | } 92 | Some(_) => { 93 | // allow an offset up to 1min 94 | offset.num_minutes().abs() > 1 95 | } 96 | } 97 | } 98 | 99 | pub fn should_download_by_header(path: &Path, resp: &reqwest::Response, size_only: bool) -> bool { 100 | // Construct a valid "ListItem" and pass to should_download_by_list 101 | debug!("Checking {:?} by header: {:?}", path, resp); 102 | let item = ListItem { 103 | url: resp.url().clone(), 104 | name: path.file_name().unwrap().to_str().unwrap().to_string(), 105 | type_: if resp.url().as_str().ends_with('/') { 106 | FileType::Directory 107 | } else { 108 | FileType::File 109 | }, 110 | size: Some(FileSize::Precise(match resp.content_length() { 111 | Some(l) => l, 112 | None => { 113 | warn!( 114 | "No content-length from upstream ({}), go downloading anyway", 115 | resp.url() 116 | ); 117 | return true; 118 | } 119 | })), 120 | mtime: match utils::get_response_mtime(resp) { 121 | Ok(m) => m, 122 | Err(e) => { 123 | warn!( 124 | "Cannot get mtime from {} ({}), go downloading anyway", 125 | resp.url(), 126 | e 127 | ); 128 | return true; 129 | } 130 | } 131 | .naive_utc(), 132 | 
timezone: None, 133 | skip_check: false, 134 | }; 135 | should_download_by_list(path, &item, FixedOffset::east_opt(0), false, size_only) 136 | } 137 | -------------------------------------------------------------------------------- /src/extensions/apt.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use std::path::{Path, PathBuf}; 3 | use tracing::warn; 4 | use url::Url; 5 | 6 | pub fn is_apt_package(p: &Path) -> bool { 7 | // check if basename is Packages 8 | let basename = p.file_name().unwrap().to_str().unwrap(); 9 | if basename != "Packages" { 10 | return false; 11 | } 12 | // check if parents contain dists 13 | let parents = p.ancestors(); 14 | for iter in parents { 15 | let basename = iter.file_name().unwrap().to_str().unwrap(); 16 | if basename == "dists" { 17 | return true; 18 | } 19 | } 20 | false 21 | } 22 | 23 | // In every iter packages_path and packages_url be updated to their parents 24 | // When they reach the dists directory, return the root of debian 25 | // Otherwise when one of them reach the root, return error 26 | fn get_debian_root( 27 | packages_path: &Path, 28 | relative: &[String], 29 | packages_url: &Url, 30 | ) -> Result<(PathBuf, Vec, Url)> { 31 | fn pop(p: &mut PathBuf, r: Option<&mut Vec>, u: &mut Url) -> Result<()> { 32 | if !p.pop() { 33 | anyhow::bail!( 34 | "Cannot find debian root (path can not be popped, path = {:?})", 35 | p 36 | ); 37 | } 38 | if u.path() == "/" { 39 | anyhow::bail!( 40 | "Cannot find debian root (url can not be popped, url = {:?})", 41 | u 42 | ); 43 | } 44 | if let Some(r) = r { 45 | if r.pop().is_none() { 46 | anyhow::bail!( 47 | "Cannot find debian root (relative can not be popped, relative = {:?})", 48 | r 49 | ); 50 | } 51 | } 52 | u.path_segments_mut().unwrap().pop(); 53 | Ok(()) 54 | } 55 | let mut packages_path = packages_path.to_path_buf(); 56 | let mut relative = relative.to_owned(); 57 | let mut packages_url = packages_url.clone(); 58 | 
// first pop of file name to match relative 59 | pop(&mut packages_path, None, &mut packages_url)?; 60 | loop { 61 | let basename = packages_path.file_name().unwrap().to_str().unwrap(); 62 | let url_basename = packages_url.path_segments().unwrap().next_back().unwrap(); 63 | if basename == "dists" && url_basename == "dists" { 64 | // we don't wanna dists folder in return value 65 | pop(&mut packages_path, Some(&mut relative), &mut packages_url)?; 66 | // add trailing slash to packages_url 67 | packages_url.path_segments_mut().unwrap().push(""); 68 | return Ok((packages_path, relative, packages_url)); 69 | } 70 | if basename != url_basename { 71 | warn!( 72 | "basename = {}, url_basename = {}, relative = {:?}", 73 | basename, url_basename, relative 74 | ); 75 | } 76 | pop(&mut packages_path, Some(&mut relative), &mut packages_url)?; 77 | } 78 | } 79 | 80 | #[derive(Debug)] 81 | pub struct AptPackage { 82 | pub url: Url, 83 | pub relative: Vec, 84 | #[allow(dead_code)] 85 | pub size: usize, 86 | pub filename: String, 87 | } 88 | 89 | impl From for super::ExtensionPackage { 90 | fn from(val: AptPackage) -> Self { 91 | super::ExtensionPackage { 92 | url: val.url, 93 | relative: val.relative, 94 | filename: val.filename, 95 | } 96 | } 97 | } 98 | 99 | pub fn parse_package( 100 | packages_path: &Path, 101 | relative: &[String], 102 | packages_url: &Url, 103 | ) -> Result> { 104 | let data = std::fs::read_to_string(packages_path)?; 105 | let packages = apt_parser::Packages::from(&data); 106 | let (_, root_relative, debian_root_url) = 107 | get_debian_root(packages_path, relative, packages_url)?; 108 | // ignore errors 109 | let mut res = vec![]; 110 | for package in packages { 111 | let pool_url = package.filename; 112 | let size = package.size; 113 | let url = debian_root_url.join(&pool_url)?; 114 | 115 | let mut pool_split: Vec = pool_url.split('/').map(|s| s.to_string()).collect(); 116 | let mut relative = root_relative.clone(); 117 | relative.append(&mut pool_split); 
118 | 119 | let basename = relative.pop().unwrap(); 120 | 121 | res.push(AptPackage { 122 | url, 123 | relative, 124 | size: size as usize, 125 | filename: basename, 126 | }) 127 | } 128 | 129 | Ok(res) 130 | } 131 | 132 | #[cfg(test)] 133 | mod tests { 134 | use super::*; 135 | use test_log::test; 136 | 137 | #[test] 138 | fn test_debian_root() { 139 | let packages_path = Path::new("/var/www/html/dists/buster/main/binary-amd64/Packages"); 140 | let relative = vec![ 141 | "dists".to_string(), 142 | "buster".to_string(), 143 | "main".to_string(), 144 | "binary-amd64".to_string(), 145 | ]; 146 | let packages_url = 147 | Url::parse("http://localhost/dists/buster/main/binary-amd64/Packages").unwrap(); 148 | let (debian_root_path, root_relative, debian_root_url) = 149 | get_debian_root(packages_path, &relative, &packages_url).unwrap(); 150 | assert_eq!(debian_root_path, Path::new("/var/www/html/")); 151 | assert_eq!(root_relative, Vec::::new()); 152 | assert_eq!(debian_root_url, Url::parse("http://localhost/").unwrap()); 153 | 154 | let packages_path = 155 | Path::new("/var/www/html/mysql/apt/ubuntu/dists/jammy/mysql-8.0/binary-amd64/Packages"); 156 | let relative = vec![ 157 | "apt".to_string(), 158 | "ubuntu".to_string(), 159 | "dists".to_string(), 160 | "jammy".to_string(), 161 | "mysql-8.0".to_string(), 162 | "binary-amd64".to_string(), 163 | ]; 164 | let packages_url = Url::parse( 165 | "http://repo.mysql.com/apt/ubuntu/dists/jammy/mysql-8.0/binary-amd64/Packages", 166 | ) 167 | .unwrap(); 168 | let (debian_root_path, root_relative, debian_root_url) = 169 | get_debian_root(packages_path, &relative, &packages_url).unwrap(); 170 | assert_eq!( 171 | debian_root_path, 172 | Path::new("/var/www/html/mysql/apt/ubuntu/") 173 | ); 174 | assert_eq!(root_relative, vec!["apt".to_string(), "ubuntu".to_string()]); 175 | assert_eq!( 176 | debian_root_url, 177 | Url::parse("http://repo.mysql.com/apt/ubuntu/").unwrap() 178 | ); 179 | } 180 | } 181 | 
-------------------------------------------------------------------------------- /src/extensions/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::SyncArgs; 2 | use std::path::Path; 3 | use tracing::{info, warn}; 4 | use url::Url; 5 | 6 | mod apt; 7 | mod yum; 8 | 9 | pub struct ExtensionPackage { 10 | pub url: Url, 11 | pub relative: Vec, 12 | pub filename: String, 13 | } 14 | 15 | pub fn extension_handler( 16 | args: &SyncArgs, 17 | path: &Path, 18 | relative: &[String], 19 | url: &Url, 20 | push_func: F, 21 | ) where 22 | F: Fn(&ExtensionPackage), 23 | { 24 | if args.apt_packages && crate::extensions::apt::is_apt_package(path) { 25 | let packages = apt::parse_package(path, relative, url); 26 | match packages { 27 | Err(e) => { 28 | warn!("Failed to parse APT package {:?}: {:?}", path, e); 29 | } 30 | Ok(packages) => { 31 | for package in packages { 32 | info!("APT package: {:?}", package); 33 | push_func(&package.into()); 34 | } 35 | } 36 | } 37 | } 38 | if args.yum_packages { 39 | let is_primary = crate::extensions::yum::is_yum_primary_xml(path); 40 | let is_repomd = crate::extensions::yum::is_yum_repomd_xml(path); 41 | match (is_primary, is_repomd) { 42 | (false, false) => (), 43 | (p, r) => { 44 | assert!(!(p && r), "File is both primary and repomd"); 45 | let xml_type = if p { 46 | crate::extensions::yum::YumXmlType::Primary 47 | } else { 48 | crate::extensions::yum::YumXmlType::Repomd 49 | }; 50 | let packages = yum::parse_package(path, relative, url, xml_type); 51 | match packages { 52 | Err(e) => { 53 | warn!("Failed to parse YUM file {:?}: {:?}", path, e); 54 | } 55 | Ok(packages) => { 56 | for package in packages { 57 | info!("YUM package: {:?}", package); 58 | push_func(&package.into()); 59 | } 60 | } 61 | } 62 | } 63 | } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/extensions/yum.rs: 
-------------------------------------------------------------------------------- 1 | use std::{io::Read, path::Path}; 2 | 3 | use anyhow::Result; 4 | use flate2::read::GzDecoder; 5 | use tracing::info; 6 | use url::Url; 7 | 8 | fn get_locations_from_xml(s: &str) -> Vec { 9 | let re = regex::Regex::new(r#""#).unwrap(); 10 | let mut urls = Vec::new(); 11 | for line in s.lines() { 12 | if let Some(caps) = re.captures(line) { 13 | let url = caps.get(1).unwrap().as_str(); 14 | urls.push(url.to_string()); 15 | } 16 | } 17 | urls 18 | } 19 | 20 | pub fn is_yum_primary_xml(p: &Path) -> bool { 21 | p.file_name() 22 | .map(|f| f.to_str().unwrap()) 23 | .map(|f| f.ends_with("primary.xml.gz")) 24 | .unwrap_or(false) 25 | } 26 | 27 | // read and extract location 28 | pub fn read_primary_xml(p: &Path) -> Result> { 29 | let bytes = std::fs::read(p)?; 30 | let mut gzd = GzDecoder::new(&bytes[..]); 31 | let mut s = String::new(); 32 | gzd.read_to_string(&mut s)?; 33 | 34 | Ok(get_locations_from_xml(&s)) 35 | } 36 | 37 | pub enum YumXmlType { 38 | Primary, 39 | Repomd, 40 | } 41 | 42 | #[derive(Debug)] 43 | pub struct YumPackage { 44 | pub url: Url, 45 | pub relative: Vec, 46 | pub filename: String, 47 | } 48 | 49 | impl From for super::ExtensionPackage { 50 | fn from(val: YumPackage) -> Self { 51 | super::ExtensionPackage { 52 | url: val.url, 53 | relative: val.relative, 54 | filename: val.filename, 55 | } 56 | } 57 | } 58 | 59 | pub fn parse_package( 60 | packages_path: &Path, 61 | relative: &[String], 62 | packages_url: &Url, 63 | xml_type: YumXmlType, 64 | ) -> Result> { 65 | let packages = match xml_type { 66 | YumXmlType::Primary => read_primary_xml(packages_path)?, 67 | YumXmlType::Repomd => read_yum_repomd_xml(packages_path)?, 68 | }; 69 | let mut relative = relative.to_owned(); 70 | relative.pop(); // pop "repodata" 71 | 72 | let mut base_url = packages_url.clone(); 73 | base_url.path_segments_mut().unwrap().pop().pop().push(""); 74 | info!("base_url = {:?}", base_url); 75 
| info!("relative = {:?}", relative); 76 | 77 | let mut res = vec![]; 78 | for package in packages { 79 | let url = base_url.join(&package)?; 80 | let split: Vec = package.split('/').map(|s| s.to_string()).collect(); 81 | let mut relative = relative.clone(); 82 | relative.append(&mut split.clone()); 83 | 84 | let basename = relative.pop().unwrap(); 85 | res.push(YumPackage { 86 | url, 87 | relative, 88 | filename: basename, 89 | }) 90 | } 91 | 92 | Ok(res) 93 | } 94 | 95 | // Well, brain-damaged mysql-repo even cannot show all primary.xml.gz... 96 | // So I have to use repomd.xml to get primary.xml.gz... 97 | // Good news is that it seems like existing functions for handling primary.xml.gz can be reused. 98 | pub fn is_yum_repomd_xml(p: &Path) -> bool { 99 | p.file_name() 100 | .map(|f| f.to_str().unwrap()) 101 | .map(|f| f == "repomd.xml") 102 | .unwrap_or(false) 103 | } 104 | 105 | pub fn read_yum_repomd_xml(p: &Path) -> Result> { 106 | let bytes = std::fs::read(p)?; 107 | let s = String::from_utf8_lossy(&bytes); 108 | 109 | Ok(get_locations_from_xml(s.as_ref())) 110 | } 111 | -------------------------------------------------------------------------------- /src/listing.rs: -------------------------------------------------------------------------------- 1 | // Module for handling directory listing 2 | 3 | use std::fmt::Display; 4 | 5 | use chrono::{FixedOffset, NaiveDateTime}; 6 | use url::Url; 7 | 8 | #[derive(Debug, PartialEq, Clone, Copy)] 9 | pub enum FileType { 10 | File, 11 | Directory, 12 | } 13 | 14 | #[derive(Debug, PartialEq, Clone, Copy)] 15 | pub enum SizeUnit { 16 | B, 17 | K, 18 | M, 19 | G, 20 | T, 21 | P, 22 | } 23 | 24 | impl Display for SizeUnit { 25 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 26 | let unit = match self { 27 | SizeUnit::B => "B", 28 | SizeUnit::K => "K", 29 | SizeUnit::M => "M", 30 | SizeUnit::G => "G", 31 | SizeUnit::T => "T", 32 | SizeUnit::P => "P", 33 | }; 34 | write!(f, "{unit}") 35 | } 36 | } 37 | 
38 | impl SizeUnit { 39 | pub fn get_exp(self) -> u32 { 40 | match self { 41 | SizeUnit::B => 0, 42 | SizeUnit::K => 1, 43 | SizeUnit::M => 2, 44 | SizeUnit::G => 3, 45 | SizeUnit::T => 4, 46 | SizeUnit::P => 5, 47 | } 48 | } 49 | } 50 | 51 | #[derive(Debug, Clone, Copy, PartialEq)] 52 | pub enum FileSize { 53 | Precise(u64), 54 | /// 1024B -> 1KiB 55 | HumanizedBinary(f64, SizeUnit), 56 | #[allow(dead_code)] 57 | /// 1000B -> 1KB 58 | HumanizedDecimal(f64, SizeUnit), 59 | } 60 | 61 | impl Display for FileSize { 62 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 63 | match self { 64 | FileSize::Precise(size) => write!(f, "{}", size), 65 | FileSize::HumanizedBinary(size, unit) => write!(f, "{size} {unit}"), 66 | FileSize::HumanizedDecimal(size, unit) => write!(f, "{size} {unit}"), 67 | } 68 | } 69 | } 70 | 71 | impl FileSize { 72 | pub fn get_humanized(s: &str) -> (f64, SizeUnit) { 73 | // separate numeric and unit 74 | let mut numeric = String::new(); 75 | let mut unit = String::new(); 76 | for c in s.chars() { 77 | if c.is_ascii_digit() || c == '.' 
{ 78 | numeric.push(c); 79 | } else { 80 | unit.push(c); 81 | } 82 | } 83 | let unit = unit.to_lowercase(); 84 | let unit = unit.trim(); 85 | 86 | let numeric = numeric.parse::().unwrap(); 87 | let unit = match unit.chars().next() { 88 | None => SizeUnit::B, 89 | Some(u) => match u { 90 | 'b' => SizeUnit::B, 91 | 'k' => SizeUnit::K, 92 | 'm' => SizeUnit::M, 93 | 'g' => SizeUnit::G, 94 | 't' => SizeUnit::T, 95 | 'p' => SizeUnit::P, 96 | _ => panic!("Unknown unit: {unit}"), 97 | }, 98 | }; 99 | 100 | (numeric, unit) 101 | } 102 | 103 | pub fn get_estimated(&self) -> u64 { 104 | match self { 105 | FileSize::Precise(size) => *size, 106 | FileSize::HumanizedBinary(size, unit) => { 107 | let exp = unit.get_exp(); 108 | (size * 1024_f64.powi(exp as i32)) as u64 109 | } 110 | FileSize::HumanizedDecimal(size, unit) => { 111 | let exp = unit.get_exp(); 112 | (size * 1000_f64.powi(exp as i32)) as u64 113 | } 114 | } 115 | } 116 | } 117 | 118 | #[derive(Debug, Clone)] 119 | pub struct ListItem { 120 | pub url: Url, 121 | pub name: String, 122 | pub type_: FileType, 123 | pub size: Option, 124 | /// mtime is parsed from HTML, which is the local datetime of the "server" (not necessarily localtime or UTC) 125 | pub mtime: NaiveDateTime, 126 | /// Some HTML provides "timezone", parser shall set this if so (otherwise just None) 127 | pub timezone: Option, 128 | /// Don't check size and mtime: download only if the file doesn't exist. 129 | /// This is expected to be set by apt/yum parser extension (parser will not use this). 
130 | pub skip_check: bool, 131 | } 132 | 133 | impl ListItem { 134 | pub fn new( 135 | url: Url, 136 | name: String, 137 | type_: FileType, 138 | size: Option, 139 | mtime: NaiveDateTime, 140 | timezone: Option, 141 | ) -> Self { 142 | Self { 143 | url, 144 | name, 145 | type_, 146 | size, 147 | mtime, 148 | timezone, 149 | skip_check: false, 150 | } 151 | } 152 | } 153 | 154 | impl Display for ListItem { 155 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 156 | let size_str = match self.size { 157 | Some(size) => size.to_string(), 158 | None => String::from("(none)"), 159 | }; 160 | let mtime_str = self.mtime.format("%Y-%m-%d %H:%M:%S").to_string(); 161 | let timezone = match self.timezone { 162 | None => "", 163 | Some(tz) => &format!("({})", tz), 164 | }; 165 | write!( 166 | f, 167 | "{} {:?} {} {}{} {}", 168 | self.url, self.type_, size_str, mtime_str, timezone, self.name 169 | ) 170 | } 171 | } 172 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | #![warn(clippy::cognitive_complexity)] 2 | use std::{path::PathBuf, sync::Mutex}; 3 | 4 | use clap::{Parser, Subcommand}; 5 | 6 | use parser::{ParserType, ParserTypeMatch}; 7 | use tracing::level_filters::LevelFilter; 8 | use tracing_subscriber::EnvFilter; 9 | use url::Url; 10 | 11 | use shadow_rs::shadow; 12 | use utils::{headers_to_headermap, Header}; 13 | shadow!(build); 14 | 15 | mod bar; 16 | mod cli; 17 | mod compare; 18 | mod listing; 19 | mod parser; 20 | mod regex_manager; 21 | mod timezone; 22 | mod utils; 23 | 24 | mod extensions; 25 | 26 | use crate::regex_manager::ExpandedRegex; 27 | 28 | #[allow(clippy::const_is_empty)] 29 | fn get_version() -> &'static str { 30 | let tag = build::TAG; 31 | let clean = build::GIT_CLEAN; 32 | let short_commit = build::SHORT_COMMIT; 33 | if !clean { 34 | Box::leak(format!("{} (dirty)", 
build::SHORT_COMMIT).into_boxed_str()) 35 | } else if tag.is_empty() { 36 | if short_commit.is_empty() { 37 | return build::PKG_VERSION; 38 | } else { 39 | return short_commit; 40 | } 41 | } else { 42 | return tag; 43 | } 44 | } 45 | 46 | #[derive(Parser, Debug)] 47 | #[command(about)] 48 | #[command(propagate_version = true)] 49 | #[command(version = get_version())] 50 | struct Cli { 51 | #[command(subcommand)] 52 | command: Commands, 53 | } 54 | 55 | #[derive(Subcommand, Debug)] 56 | enum Commands { 57 | /// Sync files from upstream to local. 58 | Sync(SyncArgs), 59 | 60 | /// List files from upstream. 61 | List(ListArgs), 62 | } 63 | 64 | trait SharedArgs { 65 | fn user_agent(&self) -> &str; 66 | fn headers(&self) -> reqwest::header::HeaderMap; 67 | fn use_v2_exclusion(&self) -> bool; 68 | fn exclude(&self) -> &[ExpandedRegex]; 69 | fn include(&self) -> &[ExpandedRegex]; 70 | } 71 | 72 | #[derive(Parser, Debug)] 73 | pub struct SyncArgs { 74 | /// Customize tsumugu's user agent. 75 | #[clap(long, default_value = "tsumugu")] 76 | user_agent: String, 77 | 78 | /// Do not download files and cleanup. 79 | #[clap(long)] 80 | dry_run: bool, 81 | 82 | /// Threads at work. 83 | #[clap(long, default_value_t = 2)] 84 | threads: usize, 85 | 86 | /// Do not clean up after sync. 87 | #[clap(long)] 88 | no_delete: bool, 89 | 90 | /// Set max delete count. 91 | #[clap(long, default_value_t = 100)] 92 | max_delete: usize, 93 | 94 | /// The upstream URL. 95 | #[clap(value_parser)] 96 | upstream: Url, 97 | 98 | /// The local directory. 99 | #[clap(value_parser)] 100 | local: PathBuf, 101 | 102 | /// You can set a valid URL for guessing. Set it to "no" to disable this behavior. 103 | /// By default it would recursively find the first file to HEAD for guessing 104 | #[clap(long)] 105 | timezone_file: Option, 106 | 107 | /// Manually set timezone (+- hrs). This overrides timezone_file. 108 | #[clap(long)] 109 | timezone: Option, 110 | 111 | /// Retry count for each request. 
112 | #[clap(long, default_value_t = 3)] 113 | retry: usize, 114 | 115 | /// Do an HEAD before actual GET. 116 | /// Otherwise when head-before-get and allow-time-from-parser are not set, 117 | /// when GETting tsumugu would try checking if we still need to download it. 118 | #[clap(long)] 119 | head_before_get: bool, 120 | 121 | /// Choose a main parser. 122 | #[clap(long, value_enum, default_value_t = ParserType::Nginx)] 123 | parser: ParserType, 124 | 125 | /// Choose supplementary parsers. Format: "parsername:matchpattern". 126 | /// matchpattern is a relative path regex. 127 | /// Supports multiple. 128 | #[clap(long, value_parser)] 129 | parser_match: Vec, 130 | 131 | /// Excluded relative path regex. Supports multiple. 132 | #[clap(long, value_parser)] 133 | exclude: Vec, 134 | 135 | /// Included relative path regex (even if excluded). Supports multiple. 136 | #[clap(long, value_parser)] 137 | include: Vec, 138 | 139 | /// Skip relative path regex if they exist. Supports multiple. 140 | #[clap(long, value_parser)] 141 | skip_if_exists: Vec, 142 | 143 | /// Relative path regex for those compare size only **after** HEAD (head_before_get on) or GET (head_before_get off) 144 | #[clap(long, value_parser)] 145 | compare_size_only: Vec, 146 | 147 | /// Allow mtime from parser if not available from HTTP headers. 148 | #[clap(long, visible_alias = "allow-mtime-from-parser")] 149 | trust_mtime_from_parser: bool, 150 | 151 | /// (Experimental) APT Packages file parser to find out missing packages. 152 | #[clap(long)] 153 | apt_packages: bool, 154 | 155 | /// (Experimental) YUM Packages file parser to find out missing packages. 156 | #[clap(long)] 157 | yum_packages: bool, 158 | 159 | /// Ignore 404 NOT FOUND as error when downloading files. 160 | #[clap(long)] 161 | ignore_nonexist: bool, 162 | 163 | /// Allow automatically choose fallback parser when ParseError occurred. 
164 | #[clap(long)] 165 | auto_fallback: bool, 166 | 167 | /// Custom header for HTTP(S) requests in format "Headerkey: headervalue". Supports multiple. 168 | #[clap(long, value_parser)] 169 | header: Vec
    , 170 | 171 | /// The exclusion v2 mode. To keep compatibility, this is off by default. 172 | #[clap(long)] 173 | exclusion_v2: bool, 174 | } 175 | 176 | impl SharedArgs for &SyncArgs { 177 | fn user_agent(&self) -> &str { 178 | &self.user_agent 179 | } 180 | 181 | fn headers(&self) -> reqwest::header::HeaderMap { 182 | headers_to_headermap(&self.header) 183 | } 184 | 185 | fn use_v2_exclusion(&self) -> bool { 186 | self.exclusion_v2 187 | } 188 | 189 | fn exclude(&self) -> &[ExpandedRegex] { 190 | &self.exclude 191 | } 192 | 193 | fn include(&self) -> &[ExpandedRegex] { 194 | &self.include 195 | } 196 | } 197 | 198 | #[derive(Parser, Debug)] 199 | pub struct ListArgs { 200 | /// Customize tsumugu's user agent. 201 | #[clap(long, default_value = "tsumugu")] 202 | user_agent: String, 203 | 204 | /// The upstream URL. 205 | #[clap(value_parser)] 206 | upstream: Url, 207 | 208 | /// Choose a main parser. 209 | #[clap(long, value_enum, default_value_t=ParserType::Nginx)] 210 | parser: ParserType, 211 | 212 | /// Excluded relative path regex. Supports multiple. 213 | #[clap(long, value_parser)] 214 | exclude: Vec, 215 | 216 | /// Included relative path regex (even if excluded). Supports multiple. 217 | #[clap(long, value_parser)] 218 | include: Vec, 219 | 220 | /// The upstream base starting with "/". 221 | #[clap(long, default_value = "/")] 222 | upstream_base: String, 223 | 224 | /// Custom header for HTTP(S) requests in format "Headerkey: headervalue". Supports multiple. 225 | #[clap(long, value_parser)] 226 | header: Vec
    , 227 | 228 | /// The exclusion v2 mode. To keep compatibility, this is off by default. 229 | #[clap(long)] 230 | exclusion_v2: bool, 231 | } 232 | 233 | impl SharedArgs for &ListArgs { 234 | fn user_agent(&self) -> &str { 235 | &self.user_agent 236 | } 237 | 238 | fn headers(&self) -> reqwest::header::HeaderMap { 239 | headers_to_headermap(&self.header) 240 | } 241 | 242 | fn use_v2_exclusion(&self) -> bool { 243 | self.exclusion_v2 244 | } 245 | 246 | fn exclude(&self) -> &[ExpandedRegex] { 247 | &self.exclude 248 | } 249 | 250 | fn include(&self) -> &[ExpandedRegex] { 251 | &self.include 252 | } 253 | } 254 | 255 | pub struct AsyncContext { 256 | pub listing_client: reqwest::Client, 257 | pub download_client: reqwest::Client, 258 | pub runtime: tokio::runtime::Runtime, 259 | } 260 | 261 | fn main() { 262 | let enable_color = std::env::var("NO_COLOR").is_err(); 263 | let pb_manager = kyuri::Manager::new(std::time::Duration::from_secs(1)); 264 | pb_manager.set_ticker(true); 265 | let pb_writer = pb_manager.create_writer(); 266 | tracing_subscriber::fmt() 267 | .with_thread_ids(true) 268 | .with_env_filter( 269 | // https://github.com/tokio-rs/tracing/issues/735 270 | EnvFilter::builder() 271 | .with_default_directive(LevelFilter::INFO.into()) 272 | .from_env_lossy(), 273 | ) 274 | .with_ansi(enable_color) 275 | .with_writer(Mutex::new(pb_writer)) 276 | .init(); 277 | 278 | // Print version info in debug mode 279 | tracing::debug!("{}", build::CLAP_LONG_VERSION); 280 | 281 | let bind_address = match std::env::var("BIND_ADDRESS").ok() { 282 | Some(s) => { 283 | let s = s.trim(); 284 | if s.is_empty() { 285 | None 286 | } else { 287 | Some(s.to_owned()) 288 | } 289 | } 290 | None => None, 291 | }; 292 | 293 | // terminate whole process when a thread panics 294 | let orig_hook = std::panic::take_hook(); 295 | std::panic::set_hook(Box::new(move |panic_info| { 296 | orig_hook(panic_info); 297 | std::process::exit(3); 298 | })); 299 | 300 | let args = Cli::parse(); 
301 | match args.command { 302 | Commands::Sync(args) => { 303 | if !args.upstream.path().ends_with('/') { 304 | tracing::warn!("It's suggested to append backslash to upstream, though this also works in most cases (most web servers redirects this to URL with backslash at end).") 305 | } 306 | cli::sync(&args, bind_address, pb_manager); 307 | } 308 | Commands::List(args) => { 309 | // extra arg check 310 | if !args.upstream.path().ends_with('/') { 311 | panic!("upstream should end with /"); 312 | } 313 | if !args.upstream_base.starts_with('/') { 314 | panic!("upstream_base does not start with /") 315 | } 316 | cli::list(&args, bind_address); 317 | } 318 | }; 319 | } 320 | -------------------------------------------------------------------------------- /src/parser/apache_f2.rs: -------------------------------------------------------------------------------- 1 | // https://httpd.apache.org/docs/2.4/mod/mod_autoindex.html 2 | // > F=2 formats the listing as an HTMLTable FancyIndexed list 3 | 4 | use crate::{ 5 | listing::{FileSize, FileType, ListItem}, 6 | utils::get, 7 | }; 8 | 9 | use super::*; 10 | use anyhow::{anyhow, Result}; 11 | use chrono::NaiveDateTime; 12 | use scraper::{Html, Selector}; 13 | use tracing::debug; 14 | 15 | #[derive(Debug, Clone, Default)] 16 | pub struct ApacheF2ListingParser; 17 | 18 | impl Parser for ApacheF2ListingParser { 19 | fn name(&self) -> &'static str { 20 | "Apache-f2 format" 21 | } 22 | 23 | fn get_list( 24 | &self, 25 | async_context: &AsyncContext, 26 | url: &url::Url, 27 | ) -> Result { 28 | let resp = get( 29 | &async_context.runtime, 30 | &async_context.listing_client, 31 | url.clone(), 32 | )?; 33 | let url = resp.url().clone(); 34 | let body = get_text(&async_context.runtime, resp)?; 35 | assert_if_url_has_no_trailing_slash(&url); 36 | let document = Html::parse_document(&body); 37 | // find the indexlist which contains file index 38 | let selector = Selector::parse("table").unwrap(); 39 | let mut selector_iter = 
document.select(&selector); 40 | let indexlist; 41 | loop { 42 | let t = selector_iter 43 | .next() 44 | .ok_or(anyhow!("No more matched"))?; 45 | let t_html = t.html().to_lowercase(); 46 | if t_html.contains("name") 47 | && t_html.contains("last modified") 48 | && t_html.contains("size") 49 | { 50 | indexlist = t; 51 | break; 52 | } 53 | } 54 | // find all inside -- there might have titlebar or
    , filter them later 55 | let selector = Selector::parse("tr").unwrap(); 56 | let mut items = Vec::new(); 57 | 58 | let mut lastmod_before_size = true; 59 | for element in indexlist.select(&selector) { 60 | // skip divider 61 | let hr_selector = Selector::parse("hr").unwrap(); 62 | if element.select(&hr_selector).next().is_some() { 63 | continue; 64 | } 65 | // skip table title 66 | let a_selector = Selector::parse("a").unwrap(); 67 | let hrefs: Vec<&str> = element 68 | .select(&a_selector) 69 | .map(|a| a.value().attr("href").unwrap_or("?")) 70 | .collect(); 71 | // Empty or all query string hrefs 72 | if hrefs.iter().all(|h| h.starts_with('?')) { 73 | let lastmod_pos = element.inner_html().to_lowercase().find("last modified"); 74 | let size_pos = element.inner_html().to_lowercase().find("size"); 75 | if let (Some(lastmod_pos), Some(size_pos)) = (lastmod_pos, size_pos) { 76 | lastmod_before_size = lastmod_pos < size_pos; 77 | } 78 | continue; 79 | } 80 | 81 | let td_selector = Selector::parse("td").unwrap(); 82 | let mut td_iterator = element.select(&td_selector); 83 | // skip icon (first col) 84 | td_iterator.next(); 85 | let td = td_iterator 86 | .next() 87 | .ok_or(anyhow!("no more td after first iterate"))?; 88 | let a = td.select(&a_selector).next().unwrap(); 89 | let displayed_filename = a.inner_html(); 90 | if displayed_filename == "Parent Directory" || displayed_filename == ".." { 91 | continue; 92 | } 93 | 94 | let href = a.value().attr("href").unwrap(); 95 | let name = get_real_name_from_href(href); 96 | let href = url.join(href)?; 97 | let type_ = if href.as_str().ends_with('/') { 98 | FileType::Directory 99 | } else { 100 | FileType::File 101 | }; 102 | let col2 = td_iterator 103 | .next() 104 | .ok_or(anyhow!("no more td after second iterate"))? 105 | .inner_html(); 106 | let col2 = col2.trim(); 107 | let col3 = td_iterator 108 | .next() 109 | .ok_or(anyhow!("no more td after third iterate"))? 
110 | .inner_html(); 111 | let col3 = col3.trim(); 112 | 113 | let (lastmod, size) = if lastmod_before_size { 114 | (col2, col3) 115 | } else { 116 | (col3, col2) 117 | }; 118 | 119 | // debug!("{} {} {} {}", href, name, lastmod, size); 120 | 121 | let date = if lastmod.is_empty() && type_ == FileType::Directory { 122 | // if it's a directory, it's okay to have empty lastmod 123 | NaiveDateTime::default() 124 | } else { 125 | debug!("lastmod: {}", lastmod); 126 | let (date_fmt, _) = guess_date_fmt(lastmod); 127 | NaiveDateTime::parse_from_str(lastmod, &date_fmt)? 128 | }; 129 | 130 | items.push(ListItem::new( 131 | href, 132 | name.to_string(), 133 | type_, 134 | { 135 | if size == "-" || size.is_empty() { 136 | None 137 | } else { 138 | let (n_size, unit) = FileSize::get_humanized(size); 139 | Some(FileSize::HumanizedBinary(n_size, unit)) 140 | } 141 | }, 142 | date, 143 | None, 144 | )) 145 | } 146 | 147 | Ok(ListResult::List(items)) 148 | } 149 | } 150 | 151 | #[cfg(test)] 152 | mod tests { 153 | use crate::listing::SizeUnit; 154 | 155 | use super::*; 156 | use crate::parser::tests::*; 157 | 158 | #[test] 159 | fn test_winehq_root() { 160 | let context = init_async_context(); 161 | let items = ApacheF2ListingParser 162 | .get_list( 163 | &context, 164 | &url::Url::parse("http://localhost:1921/wine-builds").unwrap(), 165 | ) 166 | .unwrap(); 167 | match items { 168 | ListResult::List(items) => { 169 | assert_eq!(items.len(), 8); 170 | assert_eq!(items[0].name, "android"); 171 | assert_eq!(items[0].type_, FileType::Directory); 172 | assert_eq!(items[0].size, None); 173 | assert_eq!( 174 | items[0].mtime, 175 | NaiveDateTime::parse_from_str("2022-01-18 15:14", "%Y-%m-%d %H:%M").unwrap() 176 | ); 177 | assert_eq!(items[6].name, "Release.key"); 178 | assert_eq!(items[6].type_, FileType::File); 179 | assert_eq!( 180 | items[6].size, 181 | Some(FileSize::HumanizedBinary(3.0, SizeUnit::K)) 182 | ); 183 | assert_eq!( 184 | items[6].mtime, 185 | 
NaiveDateTime::parse_from_str("2017-03-28 14:54", "%Y-%m-%d %H:%M").unwrap() 186 | ); 187 | } 188 | _ => unreachable!(), 189 | } 190 | } 191 | 192 | #[test] 193 | fn test_raspberrypi_root() { 194 | let context = init_async_context(); 195 | let items = ApacheF2ListingParser 196 | .get_list( 197 | &context, 198 | &url::Url::parse("http://localhost:1921/raspberrypi/").unwrap(), 199 | ) 200 | .unwrap(); 201 | match items { 202 | ListResult::List(items) => { 203 | assert_eq!(items.len(), 61); 204 | assert_eq!(items[0].name, "AstroPi"); 205 | assert_eq!(items[0].type_, FileType::Directory); 206 | assert_eq!(items[0].size, None); 207 | assert_eq!( 208 | items[0].mtime, 209 | NaiveDateTime::parse_from_str("2017-09-04 15:41", "%Y-%m-%d %H:%M").unwrap() 210 | ); 211 | assert_eq!(items[6].name, "Raspberry_Pi_Education_Manual.pdf"); 212 | assert_eq!(items[6].type_, FileType::File); 213 | assert_eq!( 214 | items[6].size, 215 | Some(FileSize::HumanizedBinary(2.8, SizeUnit::M)) 216 | ); 217 | assert_eq!( 218 | items[6].mtime, 219 | NaiveDateTime::parse_from_str("2013-09-16 13:51", "%Y-%m-%d %H:%M").unwrap() 220 | ); 221 | } 222 | _ => unreachable!(), 223 | } 224 | } 225 | 226 | #[test] 227 | fn test_mozilla_root() { 228 | let context = init_async_context(); 229 | let items = ApacheF2ListingParser 230 | .get_list( 231 | &context, 232 | &url::Url::parse("http://localhost:1921/mozilla/").unwrap(), 233 | ) 234 | .unwrap(); 235 | match items { 236 | ListResult::List(items) => { 237 | assert_eq!(items.len(), 46); 238 | assert_eq!(items[0].name, "OJI"); 239 | assert_eq!(items[0].type_, FileType::Directory); 240 | assert_eq!(items[0].size, None); 241 | assert_eq!( 242 | items[0].mtime, 243 | NaiveDateTime::parse_from_str("1970-01-01 00:00", "%Y-%m-%d %H:%M").unwrap() 244 | ); 245 | } 246 | _ => unreachable!(), 247 | } 248 | } 249 | 250 | #[test] 251 | fn test_mozilla_oji() { 252 | let context = init_async_context(); 253 | let items = ApacheF2ListingParser 254 | .get_list( 255 | &context, 
256 | &url::Url::parse("http://localhost:1921/mozilla/OJI/").unwrap(), 257 | ) 258 | .unwrap(); 259 | match items { 260 | ListResult::List(items) => { 261 | assert_eq!(items.len(), 2); 262 | assert_eq!(items[0].name, "MRJPlugin"); 263 | assert_eq!(items[0].type_, FileType::Directory); 264 | assert_eq!(items[0].size, None); 265 | assert_eq!( 266 | items[0].mtime, 267 | NaiveDateTime::parse_from_str("1970-01-01 00:00", "%Y-%m-%d %H:%M").unwrap() 268 | ); 269 | assert_eq!(items[1].name, "MRJPlugin.sit.hqx"); 270 | assert_eq!(items[1].type_, FileType::File); 271 | assert_eq!( 272 | items[1].size, 273 | Some(FileSize::HumanizedBinary(234.0, SizeUnit::K)) 274 | ); 275 | assert_eq!( 276 | items[1].mtime, 277 | NaiveDateTime::parse_from_str("2023-02-13 04:21", "%Y-%m-%d %H:%M").unwrap() 278 | ); 279 | } 280 | _ => unreachable!(), 281 | } 282 | } 283 | 284 | #[test] 285 | fn test_grml() { 286 | let context = init_async_context(); 287 | let items = ApacheF2ListingParser 288 | .get_list( 289 | &context, 290 | &url::Url::parse("http://localhost:1921/grml/").unwrap(), 291 | ) 292 | .unwrap(); 293 | match items { 294 | ListResult::List(items) => { 295 | assert_eq!(items.len(), 10); 296 | // Test "+" 297 | assert_eq!(items[3].name, "memtest86+"); 298 | } 299 | _ => unreachable!(), 300 | } 301 | } 302 | } 303 | -------------------------------------------------------------------------------- /src/parser/caddy.rs: -------------------------------------------------------------------------------- 1 | /// A parser for default caddy file_server format 2 | use crate::{ 3 | listing::{FileSize, FileType, ListItem}, 4 | utils::get, 5 | }; 6 | 7 | use super::*; 8 | use anyhow::Result; 9 | use chrono::NaiveDateTime; 10 | use scraper::{Html, Selector}; 11 | 12 | #[derive(Debug, Clone, Default)] 13 | pub struct CaddyListingParser; 14 | 15 | impl Parser for CaddyListingParser { 16 | fn name(&self) -> &'static str { 17 | "Caddy" 18 | } 19 | 20 | fn get_list( 21 | &self, 22 | async_context: 
&AsyncContext, 23 | url: &url::Url, 24 | ) -> Result { 25 | let resp = get( 26 | &async_context.runtime, 27 | &async_context.listing_client, 28 | url.clone(), 29 | )?; 30 | let url = resp.url().clone(); 31 | let body = get_text(&async_context.runtime, resp)?; 32 | assert_if_url_has_no_trailing_slash(&url); 33 | let document = Html::parse_document(&body); 34 | let selector = Selector::parse("tr.file").unwrap(); 35 | let mut items = Vec::new(); 36 | for element in document.select(&selector) { 37 | // name and herf 38 | let selector = Selector::parse("td a").unwrap(); 39 | let a = element.select(&selector).next().unwrap(); 40 | let href = a.value().attr("href").unwrap(); 41 | // Caddy file_server will append "./" to href 42 | let name = get_real_name_from_href(href) 43 | .trim_start_matches("./") 44 | .to_string(); 45 | let href = url.join(href)?; 46 | let type_ = if href.as_str().ends_with('/') { 47 | FileType::Directory 48 | } else { 49 | FileType::File 50 | }; 51 | // size 52 | let selector = Selector::parse("td.size div.sizebar div.sizebar-text").unwrap(); 53 | let size = match element.select(&selector).next() { 54 | Some(s) => { 55 | let size_text = s.inner_html(); 56 | // ↱  would be added by caddy when it's a symlink 57 | // https://github.com/caddyserver/caddy/commit/9338741ca79a74247ced86bc26e4994138470852 58 | let size_text = size_text.trim().trim_start_matches("↱ "); 59 | let (n_size, unit) = FileSize::get_humanized(size_text); 60 | Some(FileSize::HumanizedBinary(n_size, unit)) 61 | } 62 | None => None, 63 | }; 64 | // date 65 | let selector = Selector::parse("td.timestamp time").unwrap(); 66 | let mtime = element 67 | .select(&selector) 68 | .next() 69 | .unwrap() 70 | .value() 71 | .attr("datetime") 72 | .unwrap() 73 | .trim(); 74 | // Store UTC time 75 | let date = NaiveDateTime::parse_from_str(mtime, "%Y-%m-%dT%H:%M:%S%Z")?; 76 | 77 | items.push(ListItem::new(href, name, type_, size, date, None)) 78 | } 79 | 80 | Ok(ListResult::List(items)) 81 | } 82 | 
} 83 | 84 | #[cfg(test)] 85 | mod tests { 86 | use crate::listing::SizeUnit; 87 | 88 | use super::*; 89 | use crate::parser::tests::*; 90 | 91 | #[test] 92 | fn test_sdumirror_ubuntu() { 93 | let context = init_async_context(); 94 | let items = CaddyListingParser 95 | .get_list( 96 | &context, 97 | &url::Url::parse("http://localhost:1921/sdumirror-ubuntu").unwrap(), 98 | ) 99 | .unwrap(); 100 | match items { 101 | ListResult::List(items) => { 102 | assert_eq!(items.len(), 7); 103 | assert_eq!(items[0].name, ".trace"); 104 | assert_eq!(items[0].type_, FileType::Directory); 105 | assert_eq!(items[0].size, None); 106 | assert_eq!( 107 | items[0].mtime, 108 | NaiveDateTime::parse_from_str("2023-07-10T13:07:52Z", "%Y-%m-%dT%H:%M:%S%Z") 109 | .unwrap() 110 | ); 111 | assert_eq!(items[5].name, "ubuntu"); 112 | assert_eq!(items[5].type_, FileType::Directory); 113 | assert_eq!(items[5].size, None); 114 | assert_eq!( 115 | items[5].mtime, 116 | NaiveDateTime::parse_from_str("2010-11-24T11:01:53Z", "%Y-%m-%dT%H:%M:%S%Z") 117 | .unwrap() 118 | ); 119 | assert_eq!(items[6].name, "ls-lR.gz"); 120 | assert_eq!(items[6].type_, FileType::File); 121 | assert_eq!( 122 | items[6].size, 123 | Some(FileSize::HumanizedBinary(26.0, SizeUnit::M)) 124 | ); 125 | assert_eq!( 126 | items[6].mtime, 127 | NaiveDateTime::parse_from_str("2024-03-10T04:45:24Z", "%Y-%m-%dT%H:%M:%S%Z") 128 | .unwrap() 129 | ); 130 | } 131 | _ => unreachable!(), 132 | } 133 | } 134 | 135 | #[test] 136 | fn test_caddy_symlink() { 137 | let context = init_async_context(); 138 | let items = CaddyListingParser 139 | .get_list( 140 | &context, 141 | &url::Url::parse("http://localhost:1921/caddy-symlink").unwrap(), 142 | ) 143 | .unwrap(); 144 | match items { 145 | ListResult::List(items) => { 146 | assert_eq!(items.len(), 3); 147 | assert_eq!(items[0].name, "aoi.png"); 148 | assert_eq!(items[0].type_, FileType::File); 149 | assert_eq!( 150 | items[0].size, 151 | Some(FileSize::HumanizedBinary(32.0, SizeUnit::K)) 152 | ); 
153 | assert_eq!( 154 | items[0].mtime, 155 | NaiveDateTime::parse_from_str("2022-11-19T19:15:45Z", "%Y-%m-%dT%H:%M:%S%Z") 156 | .unwrap() 157 | ); 158 | assert_eq!(items[1].name, "index.html.bak"); 159 | assert_eq!(items[1].type_, FileType::File); 160 | assert_eq!( 161 | items[1].size, 162 | Some(FileSize::HumanizedBinary(143.0, SizeUnit::B)) 163 | ); 164 | assert_eq!( 165 | items[1].mtime, 166 | NaiveDateTime::parse_from_str("2022-11-19T19:14:38Z", "%Y-%m-%dT%H:%M:%S%Z") 167 | .unwrap() 168 | ); 169 | assert_eq!(items[2].name, "symlink"); 170 | assert_eq!(items[2].type_, FileType::File); 171 | assert_eq!( 172 | items[2].size, 173 | Some(FileSize::HumanizedBinary(143.0, SizeUnit::B)) 174 | ); 175 | assert_eq!( 176 | items[2].mtime, 177 | NaiveDateTime::parse_from_str("2025-02-27T10:45:49Z", "%Y-%m-%dT%H:%M:%S%Z") 178 | .unwrap() 179 | ); 180 | } 181 | _ => unreachable!(), 182 | } 183 | } 184 | } 185 | -------------------------------------------------------------------------------- /src/parser/denoflare_r2.rs: -------------------------------------------------------------------------------- 1 | use crate::{ 2 | listing::{FileSize, FileType, ListItem}, 3 | parser::{assert_if_url_has_no_trailing_slash, get_real_name_from_href}, 4 | utils::{get, get_text}, 5 | AsyncContext, 6 | }; 7 | 8 | use super::{ListResult, Parser, ParserError}; 9 | use anyhow::Result; 10 | use chrono::{FixedOffset, NaiveDateTime}; 11 | use scraper::CaseSensitivity::*; 12 | use scraper::{Html, Selector}; 13 | use tracing::info; 14 | 15 | #[derive(Debug, Clone, Default)] 16 | pub struct DenoFlareR2ListingParser; 17 | 18 | // Ref: https://github.com/skymethod/denoflare/blob/2e89fb33972a924dd9c5078bb2b2834a1f619081/examples/r2-public-read-worker/listing.ts 19 | 20 | impl Parser for DenoFlareR2ListingParser { 21 | fn name(&self) -> &'static str { 22 | "DenoFlare R2 Public Read Worker example" 23 | } 24 | 25 | fn get_list( 26 | &self, 27 | async_context: &AsyncContext, 28 | url: &url::Url, 29 | ) -> 
Result { 30 | let mut documents = vec![]; 31 | assert_if_url_has_no_trailing_slash(url); 32 | let mut inner_url = url.clone(); 33 | loop { 34 | info!("(in paging loop) Fetching: {}", inner_url); 35 | let resp = get( 36 | &async_context.runtime, 37 | &async_context.listing_client, 38 | inner_url.clone(), 39 | )?; 40 | let body = get_text(&async_context.runtime, resp)?; 41 | let document = Html::parse_document(&body); 42 | documents.push((inner_url.clone(), document.clone())); 43 | 44 | // Check if last element of #contents is next ➜ 45 | let selector = Selector::parse("div#contents").unwrap(); 46 | let contents = document 47 | .select(&selector) 48 | .next() 49 | .expect("
    not found"); 50 | // 51 | let last_child = contents 52 | .child_elements() 53 | .last() 54 | .expect("Expected last child"); 55 | // next ➜ 56 | let last_child = match last_child.last_child() { 57 | Some(child) => child, 58 | None => break, 59 | }; 60 | // next ➜ 61 | let textnode = match last_child.first_child() { 62 | Some(child) => child, 63 | _ => break, 64 | }; 65 | if textnode.value().as_text().map_or("", |t| t) != "next ➜" { 66 | break; 67 | } 68 | let href = last_child 69 | .value() 70 | .as_element() 71 | .unwrap() 72 | .attr("href") 73 | .unwrap(); 74 | inner_url = url.join(href)?; 75 | } 76 | let mut items = Vec::new(); 77 | for (url, document) in documents { 78 | let selector = Selector::parse("div#contents").unwrap(); 79 | let contents = document 80 | .select(&selector) 81 | .next() 82 | .expect("
    not found"); 83 | 84 | enum State { 85 | Start, 86 | Dirs, 87 | Files, 88 | } 89 | let mut state = State::Start; 90 | 91 | let mut iter = contents.child_elements().peekable(); 92 | while let Some(child) = iter.next() { 93 | match state { 94 | State::Start => { 95 | if child.value().name() == "div" 96 | && child.value().has_class("full", CaseSensitive) 97 | && child.text().next().unwrap_or_default() == "\u{a0}" 98 | //   99 | { 100 | // peek 101 | let next_elem = iter.peek().expect("Expected next element"); 102 | let class_is_full = next_elem.value().has_class("full", CaseSensitive); 103 | if class_is_full { 104 | state = State::Dirs; 105 | } else { 106 | state = State::Files; 107 | } 108 | } 109 | } 110 | State::Dirs => { 111 | if child.value().name() == "div" { 112 | assert!( 113 | child.value().has_class("full", CaseSensitive), 114 | "Expected class=\"full\" as end of dirs" 115 | ); 116 | assert!( 117 | child.text().next().unwrap_or_default() == "\u{a0}", 118 | "Expected   as end of dirs" 119 | ); 120 | state = State::Files; 121 | continue; 122 | } 123 | assert!(child.value().name() == "a", "Expected in dirs"); 124 | let href = child.value().attr("href").expect("href not found"); 125 | let name = get_real_name_from_href(href); 126 | let href = url.join(href)?; 127 | items.push(ListItem::new( 128 | href, 129 | name, 130 | FileType::Directory, 131 | None, 132 | NaiveDateTime::UNIX_EPOCH, 133 | None, 134 | )); 135 | } 136 | State::Files => { 137 | if child.value().name() == "div" { 138 | assert!( 139 | child.value().has_class("full", CaseSensitive), 140 | "Expected class=\"full\" as end of files, if paging required." 
141 | ); 142 | break; 143 | } 144 | assert!(child.value().name() == "a", "Expected in files"); 145 | let href = child.value().attr("href").expect("href not found"); 146 | if href.ends_with('/') { 147 | for _ in 0..3 { 148 | iter.next(); 149 | } 150 | continue; 151 | } 152 | let name = get_real_name_from_href(href); 153 | let child = iter.next().expect("Expected next child"); 154 | let size = child 155 | .text() 156 | .next() 157 | .expect("Expected size text") 158 | .replace(',', ""); // bytes 159 | let size = size.parse::().expect("Expected size to be u64"); 160 | iter.next(); // skip estimated size 161 | let mtime = iter 162 | .next() 163 | .expect("Expected mtime") 164 | .text() 165 | .next() 166 | .expect("Expected mtime text"); 167 | let mtime = NaiveDateTime::parse_from_str(mtime, "%Y-%m-%dT%H:%M:%S.%3fZ") 168 | .expect("Expected mtime to be NaiveDateTime"); 169 | let href = url.join(href)?; 170 | items.push(ListItem::new( 171 | href, 172 | name, 173 | FileType::File, 174 | Some(FileSize::Precise(size)), 175 | mtime, 176 | FixedOffset::east_opt(0), 177 | )); 178 | } 179 | } 180 | } 181 | } 182 | 183 | Ok(ListResult::List(items)) 184 | } 185 | } 186 | 187 | #[cfg(test)] 188 | mod tests { 189 | use url::Url; 190 | 191 | use crate::parser::tests::*; 192 | 193 | use super::*; 194 | 195 | #[test] 196 | fn test_clickhouse() { 197 | let context = init_async_context(); 198 | let items = DenoFlareR2ListingParser 199 | .get_list( 200 | &context, 201 | &Url::parse("http://localhost:1921/clickhouse/").unwrap(), 202 | ) 203 | .unwrap(); 204 | match items { 205 | ListResult::List(items) => { 206 | assert_eq!(items.len(), 4); 207 | assert_eq!(items[0].name, "deb"); 208 | assert_eq!(items[0].type_, FileType::Directory); 209 | assert_eq!(items[1].name, "rpm"); 210 | assert_eq!(items[1].type_, FileType::Directory); 211 | assert_eq!(items[2].name, "tgz"); 212 | assert_eq!(items[2].type_, FileType::Directory); 213 | assert_eq!(items[3].name, "CLICKHOUSE-KEY.GPG"); 214 | 
assert_eq!(items[3].type_, FileType::File); 215 | assert_eq!(items[3].size, Some(FileSize::Precise(3133))); 216 | assert_eq!( 217 | items[3].mtime, 218 | NaiveDateTime::parse_from_str( 219 | "2022-09-23 13:53:51.925", 220 | "%Y-%m-%d %H:%M:%S.%3f" 221 | ) 222 | .unwrap() 223 | ); 224 | assert_eq!(items[3].timezone, FixedOffset::east_opt(0)); 225 | } 226 | _ => unreachable!(), 227 | } 228 | } 229 | 230 | #[test] 231 | fn test_clickhouse_fileonly() { 232 | let context = init_async_context(); 233 | let items = DenoFlareR2ListingParser 234 | .get_list( 235 | &context, 236 | &Url::parse("http://localhost:1921/clickhouse/clickhouse-client/").unwrap(), 237 | ) 238 | .unwrap(); 239 | match items { 240 | ListResult::List(items) => { 241 | assert_eq!(items.len(), 61); 242 | assert_eq!(items[0].name, "clickhouse-client_22.3.10.22_amd64.deb"); 243 | assert_eq!(items[0].type_, FileType::File); 244 | } 245 | _ => unreachable!(), 246 | } 247 | } 248 | 249 | #[test] 250 | fn test_clickhouse_multipage() { 251 | let context = init_async_context(); 252 | let items = DenoFlareR2ListingParser 253 | .get_list( 254 | &context, 255 | &Url::parse("http://localhost:1921/clickhouse/stable/").unwrap(), 256 | ) 257 | .unwrap(); 258 | match items { 259 | ListResult::List(items) => { 260 | assert_eq!(items.len(), 4); 261 | assert_eq!(items[0].name, "clickhouse-client-21.1.9.41.tgz.sha512"); 262 | assert_eq!(items[3].name, "clickhouse-client-23.7.3.14-arm64.tgz"); 263 | } 264 | _ => unreachable!(), 265 | } 266 | } 267 | } 268 | -------------------------------------------------------------------------------- /src/parser/directory_lister.rs: -------------------------------------------------------------------------------- 1 | use crate::{ 2 | listing::{FileSize, FileType, ListItem}, 3 | utils::get, 4 | }; 5 | 6 | use super::*; 7 | use anyhow::Result; 8 | use chrono::NaiveDateTime; 9 | use scraper::{Html, Selector}; 10 | 11 | #[derive(Debug, Clone, Default)] 12 | pub struct 
DirectoryListerListingParser; 13 | 14 | impl Parser for DirectoryListerListingParser { 15 | fn name(&self) -> &'static str { 16 | "Directory Lister" 17 | } 18 | 19 | fn get_path(&self, url: &Url) -> PathBuf { 20 | // Extract things after ?dir 21 | let dir = url 22 | .query_pairs() 23 | .find(|(key, _value)| key == "dir") 24 | .map(|(_key, value)| value.to_string()); 25 | let mut dir = match dir { 26 | Some(d) => d, 27 | None => return PathBuf::from(url.path()), 28 | }; 29 | if !dir.starts_with('/') { 30 | dir.insert(0, '/'); 31 | } 32 | if !dir.ends_with('/') { 33 | dir.push('/'); 34 | } 35 | PathBuf::from(dir) 36 | } 37 | 38 | fn get_list( 39 | &self, 40 | async_context: &AsyncContext, 41 | url: &url::Url, 42 | ) -> Result { 43 | let resp = get( 44 | &async_context.runtime, 45 | &async_context.listing_client, 46 | url.clone(), 47 | )?; 48 | let url = resp.url().clone(); 49 | let body = get_text(&async_context.runtime, resp)?; 50 | assert_if_url_has_no_trailing_slash(&url); 51 | let document = Html::parse_document(&body); 52 | // https://github.com/DirectoryLister/DirectoryLister/blob/0283f14aa1fbd97796f753e8d6105c752546050f/app/views/components/file.twig 53 | 54 | // find
      which contains file index 55 | let selector = Selector::parse("ul").unwrap(); 56 | let indexlist = document.select(&selector).next().unwrap(); 57 | // find second
    • 58 | let selector = Selector::parse("li").unwrap(); 59 | let indexlist = indexlist.select(&selector).nth(1).unwrap(); 60 | let selector = Selector::parse("a").unwrap(); 61 | let mut items = Vec::new(); 62 | for element in indexlist.select(&selector) { 63 | let href = element.value().attr("href").unwrap(); 64 | let href = url.join(href)?; 65 | // displayed file name, class = "flex-1 truncate" 66 | let selector = Selector::parse("div.flex-1.truncate").unwrap(); 67 | let displayed_filename = element.select(&selector).next().unwrap().inner_html(); 68 | let displayed_filename = displayed_filename.trim(); 69 | // size, class = "hidden whitespace-nowrap text-right mx-2 w-1/6 sm:block" 70 | let selector = Selector::parse("div.hidden.whitespace-nowrap.text-right.mx-2").unwrap(); 71 | let size = element.select(&selector).next().unwrap().inner_html(); 72 | let size = size.trim(); 73 | // mtime, class = "hidden whitespace-nowrap text-right truncate ml-2 w-1/4 sm:block" 74 | let selector = 75 | Selector::parse("div.hidden.whitespace-nowrap.text-right.truncate.ml-2").unwrap(); 76 | let mtime = element.select(&selector).next().unwrap().inner_html(); 77 | let mtime = mtime.trim(); 78 | 79 | if displayed_filename == ".." 
{ 80 | continue; 81 | } 82 | let type_ = if size == "—" { 83 | FileType::Directory 84 | } else { 85 | FileType::File 86 | }; 87 | let date = NaiveDateTime::parse_from_str(mtime, "%Y-%m-%d %H:%M:%S")?; 88 | items.push(ListItem::new( 89 | href, 90 | displayed_filename.to_string(), 91 | type_, 92 | { 93 | if size == "—" { 94 | None 95 | } else { 96 | let (n_size, unit) = FileSize::get_humanized(size); 97 | Some(FileSize::HumanizedBinary(n_size, unit)) 98 | } 99 | }, 100 | date, 101 | None, 102 | )) 103 | } 104 | 105 | Ok(ListResult::List(items)) 106 | } 107 | } 108 | 109 | #[cfg(test)] 110 | mod tests { 111 | use url::Url; 112 | 113 | use crate::listing::SizeUnit; 114 | 115 | use super::*; 116 | use crate::parser::tests::*; 117 | 118 | #[test] 119 | fn test_vyos() { 120 | let context = init_async_context(); 121 | let items = DirectoryListerListingParser 122 | .get_list( 123 | &context, 124 | &url::Url::parse("http://localhost:1921/vyos/").unwrap(), 125 | ) 126 | .unwrap(); 127 | match items { 128 | ListResult::List(items) => { 129 | assert_eq!(items.len(), 7); 130 | assert_eq!(items[0].name, "main"); 131 | assert_eq!(items[0].type_, FileType::Directory); 132 | assert_eq!(items[0].size, None); 133 | assert_eq!( 134 | items[0].mtime, 135 | NaiveDateTime::parse_from_str("2023-08-07 21:11:02", "%Y-%m-%d %H:%M:%S") 136 | .unwrap() 137 | ); 138 | assert_eq!( 139 | items[0].url, 140 | Url::parse( 141 | "http://localhost:1921/vyos/?dir=repositories/current/dists/current/main" 142 | ) 143 | .unwrap() 144 | ); 145 | assert_eq!(items[4].name, "Contents-amd64.gz"); 146 | assert_eq!(items[4].type_, FileType::File); 147 | assert_eq!( 148 | items[4].size, 149 | Some(FileSize::HumanizedBinary(1.80, SizeUnit::M)) 150 | ); 151 | assert_eq!( 152 | items[4].mtime, 153 | NaiveDateTime::parse_from_str("2023-08-07 21:10:57", "%Y-%m-%d %H:%M:%S") 154 | .unwrap() 155 | ); 156 | assert_eq!(items[4].url, 
Url::parse("http://localhost:1921/vyos/repositories/current/dists/current/Contents-amd64.gz").unwrap()); 157 | } 158 | _ => unreachable!(), 159 | } 160 | } 161 | 162 | #[test] 163 | fn test_vyos_2() { 164 | let context = init_async_context(); 165 | let items = DirectoryListerListingParser 166 | .get_list( 167 | &context, 168 | &url::Url::parse("http://localhost:1921/vyos/vyos-accel-ppp/").unwrap(), 169 | ) 170 | .unwrap(); 171 | match items { 172 | ListResult::List(items) => { 173 | assert_eq!(items.len(), 3); 174 | } 175 | _ => unreachable!(), 176 | } 177 | } 178 | } 179 | -------------------------------------------------------------------------------- /src/parser/docker.rs: -------------------------------------------------------------------------------- 1 | use crate::{ 2 | listing::{FileSize, FileType, ListItem}, 3 | utils::get, 4 | }; 5 | use chrono::NaiveDateTime; 6 | use scraper::{Html, Selector}; 7 | // use tracing::debug; 8 | 9 | use super::*; 10 | use anyhow::Result; 11 | use regex::Regex; 12 | 13 | #[derive(Debug, Clone)] 14 | pub struct DockerListingParser { 15 | metadata_regex: Regex, 16 | } 17 | 18 | impl Default for DockerListingParser { 19 | fn default() -> Self { 20 | Self { 21 | metadata_regex: Regex::new( 22 | r"(\d{4}-\d{2}-\d{2} \d{2}:\d{2}(:\d{2})?)\s+([\d \w\.-]+)$", 23 | ) 24 | .unwrap(), 25 | } 26 | } 27 | } 28 | 29 | impl Parser for DockerListingParser { 30 | fn name(&self) -> &'static str { 31 | "download.docker.com" 32 | } 33 | 34 | fn is_auto_redirect(&self) -> bool { 35 | false 36 | } 37 | 38 | fn get_list( 39 | &self, 40 | async_context: &AsyncContext, 41 | url: &url::Url, 42 | ) -> Result { 43 | assert_if_url_has_no_trailing_slash(url); 44 | let resp = get( 45 | &async_context.runtime, 46 | &async_context.listing_client, 47 | url.clone(), 48 | )?; 49 | // if is a redirect? 
50 | if let Some(url) = resp.headers().get("location") { 51 | let mut url = url.to_str()?.to_string(); 52 | // replace /index.html at the end to / 53 | if url.ends_with("/index.html") { 54 | url = url.trim_end_matches("/index.html").to_string(); 55 | url.push('/'); 56 | } 57 | return Ok(ListResult::Redirect(url)); 58 | } 59 | let body = get_text(&async_context.runtime, resp)?; 60 | let document = Html::parse_document(&body); 61 | let selector = Selector::parse("a").unwrap(); 62 | let mut items = Vec::new(); 63 | for element in document.select(&selector) { 64 | let href = match element.value().attr("href") { 65 | Some(href) => href, 66 | None => continue, 67 | }; 68 | let name = get_real_name_from_href(href); 69 | let mut href = url.join(href)?; 70 | 71 | if name == ".." { 72 | continue; 73 | } 74 | 75 | let displayed_name = element.inner_html(); 76 | 77 | let (type_, size, date) = { 78 | if href.as_str().ends_with('/') || displayed_name.ends_with('/') { 79 | (FileType::Directory, None, NaiveDateTime::default()) 80 | } else { 81 | let metadata_raw = element 82 | .next_sibling() 83 | .unwrap() 84 | .value() 85 | .as_text() 86 | .unwrap() 87 | .to_string(); 88 | let metadata_raw = metadata_raw.trim(); 89 | let metadata = self.metadata_regex.captures(metadata_raw).unwrap(); 90 | let date = metadata.get(1).unwrap().as_str(); 91 | let date = match NaiveDateTime::parse_from_str(date, "%Y-%m-%d %H:%M:%S") { 92 | Ok(date) => date, 93 | Err(_) => NaiveDateTime::parse_from_str(date, "%Y-%m-%d %H:%M").unwrap(), 94 | }; 95 | let size = metadata.get(3).unwrap().as_str(); 96 | if size == "-" { 97 | (FileType::Directory, None, date) 98 | } else { 99 | let (n_size, unit) = FileSize::get_humanized(size); 100 | ( 101 | FileType::File, 102 | Some(FileSize::HumanizedBinary(n_size, unit)), 103 | date, 104 | ) 105 | } 106 | } 107 | }; 108 | if type_ == FileType::Directory && !href.path().ends_with('/') { 109 | href.set_path(&format!("{}/", href.path())); 110 | } 111 | 112 | 
items.push(ListItem::new( 113 | href, 114 | name.to_string(), 115 | type_, 116 | size, 117 | date, 118 | None, 119 | )) 120 | } 121 | Ok(ListResult::List(items)) 122 | } 123 | } 124 | 125 | #[cfg(test)] 126 | mod tests { 127 | use crate::listing::SizeUnit; 128 | 129 | use super::*; 130 | use crate::parser::tests::*; 131 | 132 | #[test] 133 | fn test_docker() { 134 | let context = init_async_context(); 135 | let items = DockerListingParser::default() 136 | .get_list( 137 | &context, 138 | &url::Url::parse("http://localhost:1921/docker/").unwrap(), 139 | ) 140 | .unwrap(); 141 | match items { 142 | ListResult::List(items) => { 143 | assert_eq!(items.len(), 45); 144 | assert_eq!(items[0].name, "7.0"); 145 | assert_eq!(items[0].type_, FileType::Directory); 146 | assert_eq!(items[0].size, None); 147 | assert_eq!(items[0].mtime, NaiveDateTime::default()); 148 | assert_eq!(items[42].name, "docker-ce-staging.repo"); 149 | assert_eq!(items[42].type_, FileType::File); 150 | assert_eq!( 151 | items[42].size, 152 | Some(FileSize::HumanizedBinary(2.0, SizeUnit::K)) 153 | ); 154 | assert_eq!( 155 | items[42].mtime, 156 | NaiveDateTime::parse_from_str("2023-07-07 20:20:56", "%Y-%m-%d %H:%M:%S") 157 | .unwrap() 158 | ); 159 | } 160 | _ => unreachable!(), 161 | } 162 | } 163 | 164 | #[test] 165 | fn test_docker_2() { 166 | let context = init_async_context(); 167 | let items = DockerListingParser::default() 168 | .get_list( 169 | &context, 170 | &url::Url::parse("http://localhost:1921/docker/armv7l/").unwrap(), 171 | ) 172 | .unwrap(); 173 | match items { 174 | ListResult::List(items) => { 175 | assert_eq!(items.len(), 2); 176 | assert_eq!(items[0].name, "nightly"); 177 | assert_eq!(items[0].type_, FileType::Directory); 178 | assert_eq!(items[0].size, None); 179 | // Don't compare folder mtime here... 
180 | // assert_eq!( 181 | // items[0].mtime, 182 | // NaiveDateTime::parse_from_str("2020-01-21 07:38", "%Y-%m-%d %H:%M").unwrap() 183 | // ); 184 | assert_eq!(items[1].name, "test"); 185 | assert_eq!(items[1].type_, FileType::Directory); 186 | assert_eq!(items[1].size, None); 187 | // assert_eq!( 188 | // items[1].mtime, 189 | // NaiveDateTime::parse_from_str("2020-01-21 07:38", "%Y-%m-%d %H:%M").unwrap() 190 | // ); 191 | } 192 | _ => unreachable!(), 193 | } 194 | } 195 | } 196 | -------------------------------------------------------------------------------- /src/parser/fallback.rs: -------------------------------------------------------------------------------- 1 | // An inefficient fallback parser only for non-listing HTML. 2 | // Read docs/parser.md for known limitations. 3 | 4 | use crate::{ 5 | listing::{FileSize, FileType, ListItem}, 6 | utils::{get, get_response_mtime, head}, 7 | }; 8 | use scraper::{Html, Selector}; 9 | use tracing::debug; 10 | 11 | use super::*; 12 | 13 | #[derive(Debug, Clone, Default)] 14 | pub struct FallbackParser; 15 | 16 | const INDEX: [&str; 2] = ["index.html", "index.htm"]; 17 | 18 | impl Parser for FallbackParser { 19 | fn name(&self) -> &'static str { 20 | "Fallback for non-listing directory HTML (index.html) only" 21 | } 22 | 23 | fn get_list(&self, async_context: &AsyncContext, url: &Url) -> Result { 24 | let url = if !url.path().ends_with('/') { 25 | Url::parse(&format!("{}/", url.path())).unwrap() 26 | } else { 27 | url.clone() 28 | }; 29 | let (name, resp) = { 30 | let mut final_resp = None; 31 | let mut final_name = None; 32 | for index in INDEX { 33 | let url = url.join(index).unwrap(); 34 | let resp = get( 35 | &async_context.runtime, 36 | &async_context.listing_client, 37 | url.clone(), 38 | ); 39 | match resp { 40 | Ok(r) => { 41 | final_resp = Some(r); 42 | final_name = Some(index); 43 | break; 44 | } 45 | Err(e) => { 46 | warn!("Failed to fetch {url}: {e}"); 47 | continue; 48 | } 49 | } 50 | } 51 | ( 52 | 
final_name, 53 | final_resp.ok_or(anyhow!("Does not match index list: {:?}", INDEX)), 54 | ) 55 | }; 56 | let resp = resp?; 57 | let name = name.unwrap(); 58 | let mtime = get_response_mtime(&resp) 59 | .unwrap_or(chrono::offset::Utc::now()) 60 | .naive_utc(); 61 | let url = resp.url().clone(); 62 | let body = get_text(&async_context.runtime, resp)?; 63 | let size = body.len(); 64 | let timezone = chrono::FixedOffset::east_opt(0); 65 | 66 | let document = Html::parse_document(&body); 67 | let selector = Selector::parse("a").unwrap(); 68 | let mut items = Vec::new(); 69 | // Add index file 70 | items.push(ListItem::new( 71 | url.clone(), 72 | name.to_string(), 73 | FileType::File, 74 | Some(FileSize::Precise(size as u64)), 75 | mtime, 76 | timezone, 77 | )); 78 | // Remove the "index.htm(l)" part in url 79 | let url = url.join("./").unwrap(); 80 | for element in document.select(&selector) { 81 | let href = match element.value().attr("href") { 82 | // well, what can I say... if you don't have href attribute? 
83 | None => continue, 84 | Some(h) => h, 85 | }; 86 | let href = match url.join(href) { 87 | Err(e) => { 88 | warn!("cannot join {href} to {url}: {e}, skipping"); 89 | continue; 90 | } 91 | Ok(h) => h, 92 | }; 93 | // Ignore if url is not a prefix of href 94 | if !href.as_str().starts_with(url.as_str()) { 95 | warn!("{href} is not inside {url}, skipping"); 96 | continue; 97 | } 98 | let relative_href = match url.make_relative(&href) { 99 | None => { 100 | warn!("cannot make relative of {href} from {url}"); 101 | continue; 102 | } 103 | Some(r) => r, 104 | }; 105 | let relative_href = match relative_href.find('/') { 106 | Some(idx) => relative_href[..idx + 1].to_string(), 107 | None => relative_href, 108 | }; 109 | if relative_href.is_empty() || relative_href == "/" { 110 | continue; 111 | } 112 | let name = get_real_name_from_href(&relative_href); 113 | if name.is_empty() { 114 | continue; 115 | } 116 | let href = url 117 | .join(&relative_href) 118 | .expect("unexpected error of handling URL"); 119 | let type_ = if relative_href.ends_with('/') { 120 | FileType::Directory 121 | } else { 122 | FileType::File 123 | }; 124 | 125 | // Try HEAD 126 | debug!("HEADing {href} in fallback parser"); 127 | let resp = match head( 128 | &async_context.runtime, 129 | &async_context.listing_client, 130 | href.clone(), 131 | ) { 132 | Ok(r) => r, 133 | Err(e) => { 134 | let status = e.status(); 135 | if status == Some(reqwest::StatusCode::NOT_FOUND) 136 | || status == Some(reqwest::StatusCode::FORBIDDEN) 137 | { 138 | continue; 139 | } 140 | 141 | // TODO: what to do here? 
142 | warn!("Cannot get from {}, skipping", href); 143 | continue; 144 | } 145 | }; 146 | 147 | let item = if type_ == FileType::File { 148 | let size = resp.content_length(); 149 | let mtime = match get_response_mtime(&resp) { 150 | Ok(m) => m, 151 | Err(e) => { 152 | warn!("Cannot get mtime from {href}: {e}, skipping"); 153 | continue; 154 | } 155 | }; 156 | let naive = mtime.naive_utc(); 157 | 158 | ListItem::new( 159 | href, 160 | name.to_string(), 161 | type_, 162 | size.map(FileSize::Precise), 163 | naive, 164 | timezone, 165 | ) 166 | } else { 167 | ListItem::new( 168 | href, 169 | name.to_string(), 170 | type_, 171 | None, 172 | mtime, // mtime does not matter for dir 173 | timezone, 174 | ) 175 | }; 176 | 177 | items.push(item); 178 | } 179 | 180 | Ok(ListResult::List(items)) 181 | } 182 | } 183 | 184 | #[cfg(test)] 185 | mod tests { 186 | use super::*; 187 | use crate::parser::tests::*; 188 | 189 | #[test] 190 | fn test_mimalloc() { 191 | let context = init_async_context(); 192 | let items = FallbackParser 193 | .get_list( 194 | &context, 195 | &url::Url::parse("http://localhost:1921/buildroot/mimalloc/").unwrap(), 196 | ) 197 | .unwrap(); 198 | match items { 199 | ListResult::List(items) => { 200 | assert_eq!(items.len(), 4); 201 | assert_eq!(items[0].name, "index.html"); 202 | assert_eq!(items[0].type_, FileType::File); 203 | assert_eq!(items[0].size, Some(FileSize::Precise(9369))); 204 | 205 | assert_eq!(items[3].name, "test"); 206 | assert_eq!(items[3].type_, FileType::Directory); 207 | } 208 | _ => unreachable!(), 209 | } 210 | } 211 | } 212 | -------------------------------------------------------------------------------- /src/parser/fancyindex.rs: -------------------------------------------------------------------------------- 1 | // Nginx fancyindex parser 2 | 3 | use crate::{ 4 | listing::{FileSize, FileType, ListItem}, 5 | utils::get, 6 | }; 7 | 8 | use super::*; 9 | use anyhow::Result; 10 | use chrono::{DateTime, NaiveDateTime}; 11 | use 
scraper::{Html, Selector}; 12 | 13 | #[derive(Debug, Clone, Default)] 14 | pub struct FancyIndexListingParser; 15 | 16 | impl Parser for FancyIndexListingParser { 17 | fn name(&self) -> &'static str { 18 | "Fancyindex" 19 | } 20 | 21 | fn get_list( 22 | &self, 23 | async_context: &AsyncContext, 24 | url: &url::Url, 25 | ) -> Result { 26 | let resp = get( 27 | &async_context.runtime, 28 | &async_context.listing_client, 29 | url.clone(), 30 | )?; 31 | let url = resp.url().clone(); 32 | let body = get_text(&async_context.runtime, resp)?; 33 | assert_if_url_has_no_trailing_slash(&url); 34 | let document = Html::parse_document(&body); 35 | let selector = Selector::parse("tbody tr").unwrap(); 36 | let mut items = Vec::new(); 37 | for element in document.select(&selector) { 38 | // let link_selector = Selector::parse("td.link a").unwrap(); 39 | // let size_selector = Selector::parse("td.size").unwrap(); 40 | // let date_selector = Selector::parse("td.date").unwrap(); 41 | 42 | // Select
    (header maybe?), skipping..."); 50 | continue; 51 | } 52 | }; 53 | let a = match td_a.select(&Selector::parse("a").unwrap()).next() { 54 | Some(a) => a, 55 | None => { 56 | return Err(anyhow!("Cannot find in first cell.").into()); 57 | } 58 | }; 59 | let href = a.value().attr("href").unwrap(); 60 | let displayed_filename = a.inner_html(); 61 | 62 | if displayed_filename == "Parent Directory/" || href == "../" { 63 | continue; 64 | } 65 | 66 | let name = get_real_name_from_href(href); 67 | let href = url.join(href)?; 68 | let type_ = if href.as_str().ends_with('/') { 69 | FileType::Directory 70 | } else { 71 | FileType::File 72 | }; 73 | let size = td_iterator.next().unwrap().inner_html(); 74 | let size = size.trim(); 75 | let date = td_iterator.next().unwrap().inner_html(); 76 | let date = date.trim(); 77 | 78 | // decide (guess) which time format to use 79 | let (date_fmt, _) = guess_date_fmt(date); 80 | let naive_date; 81 | let timezone; 82 | if !date_fmt_has_timezone(&date_fmt) { 83 | naive_date = NaiveDateTime::parse_from_str(date, &date_fmt)?; 84 | timezone = None; 85 | } else { 86 | let date = DateTime::parse_from_str(date, &date_fmt)?; 87 | naive_date = date.naive_utc(); 88 | timezone = Some(date.offset().to_owned()); 89 | } 90 | 91 | items.push(ListItem::new( 92 | href, 93 | name, 94 | type_, 95 | { 96 | if size == "-" { 97 | None 98 | } else { 99 | let (n_size, unit) = FileSize::get_humanized(size); 100 | Some(FileSize::HumanizedBinary(n_size, unit)) 101 | } 102 | }, 103 | naive_date, 104 | timezone, 105 | )); 106 | } 107 | 108 | Ok(ListResult::List(items)) 109 | } 110 | } 111 | 112 | #[cfg(test)] 113 | mod tests { 114 | use chrono::FixedOffset; 115 | 116 | use super::*; 117 | use crate::listing::SizeUnit; 118 | use crate::parser::tests::*; 119 | 120 | #[test] 121 | fn test_njumirrors() { 122 | let context = init_async_context(); 123 | let items = FancyIndexListingParser 124 | .get_list( 125 | &context, 126 | 
&Url::parse("http://localhost:1921/bmclapi/").unwrap(), 127 | ) 128 | .unwrap(); 129 | match items { 130 | ListResult::List(items) => { 131 | assert_eq!(items[0].name, "bouncycastle"); 132 | assert_eq!(items[0].type_, FileType::Directory); 133 | assert_eq!(items[0].size, None); 134 | assert_eq!( 135 | items[0].mtime, 136 | NaiveDateTime::parse_from_str("2024-04-23 19:01:54", "%Y-%m-%d %H:%M:%S") 137 | .unwrap() 138 | ); 139 | assert_eq!(items[items.len() - 1].name, "lwjgURL"); 140 | assert_eq!(items[items.len() - 1].type_, FileType::File); 141 | assert_eq!( 142 | items[items.len() - 1].size, 143 | Some(FileSize::HumanizedBinary(1767.0, SizeUnit::B)) 144 | ); 145 | assert_eq!( 146 | items[items.len() - 1].mtime, 147 | NaiveDateTime::parse_from_str("2021-04-30 20:55:32", "%Y-%m-%d %H:%M:%S") 148 | .unwrap() 149 | ); 150 | } 151 | _ => unreachable!(), 152 | } 153 | } 154 | 155 | #[test] 156 | fn test_loongnix() { 157 | let context = init_async_context(); 158 | let items = FancyIndexListingParser 159 | .get_list( 160 | &context, 161 | &Url::parse("http://localhost:1921/loongnix/").unwrap(), 162 | ) 163 | .unwrap(); 164 | match items { 165 | ListResult::List(items) => { 166 | assert_eq!(items[0].name, "contrib"); 167 | assert_eq!(items[0].type_, FileType::Directory); 168 | assert_eq!(items[0].size, None); 169 | assert_eq!( 170 | items[0].mtime, 171 | NaiveDateTime::parse_from_str("2023-08-15 05:48", "%Y-%m-%d %H:%M").unwrap() 172 | ); 173 | assert_eq!(items[items.len() - 1].name, "Release.gpg"); 174 | assert_eq!(items[items.len() - 1].type_, FileType::File); 175 | assert_eq!( 176 | items[items.len() - 1].size, 177 | Some(FileSize::HumanizedBinary(659.0, SizeUnit::B)) 178 | ); 179 | assert_eq!( 180 | items[items.len() - 1].mtime, 181 | NaiveDateTime::parse_from_str("2023-08-15 05:48", "%Y-%m-%d %H:%M").unwrap() 182 | ); 183 | } 184 | _ => unreachable!(), 185 | } 186 | } 187 | 188 | #[test] 189 | fn test_misc_1() { 190 | // In fact this is NOT a fancyindex page, but it 
basically match the layout of that. 191 | let context = init_async_context(); 192 | let items = FancyIndexListingParser 193 | .get_list( 194 | &context, 195 | &Url::parse("http://localhost:1921/misc/1/").unwrap(), 196 | ) 197 | .unwrap(); 198 | match items { 199 | ListResult::List(items) => { 200 | assert_eq!(items.len(), 1); 201 | assert_eq!(items[0].name, "passwd"); 202 | assert_eq!(items[0].type_, FileType::File); 203 | assert_eq!( 204 | items[0].size, 205 | Some(FileSize::HumanizedBinary(3.3, SizeUnit::K)) 206 | ); 207 | assert_eq!( 208 | items[0].mtime, 209 | NaiveDateTime::parse_from_str("2024-08-24 15:04:11", "%Y-%m-%d %H:%M:%S") 210 | .unwrap() 211 | ); 212 | assert_eq!(items[0].timezone, FixedOffset::east_opt(0),); 213 | } 214 | _ => unreachable!(), 215 | } 216 | } 217 | } 218 | -------------------------------------------------------------------------------- /src/parser/gradle.rs: -------------------------------------------------------------------------------- 1 | use crate::listing::{FileSize, FileType, ListItem}; 2 | use chrono::{DateTime, NaiveDateTime}; 3 | use scraper::{Html, Selector}; 4 | use tracing::info; 5 | 6 | use super::*; 7 | use anyhow::Result; 8 | 9 | #[derive(Debug, Clone, Default)] 10 | pub struct GradleListingParser {} 11 | 12 | impl Parser for GradleListingParser { 13 | fn name(&self) -> &'static str { 14 | "services.gradle.org" 15 | } 16 | 17 | fn get_list( 18 | &self, 19 | async_context: &AsyncContext, 20 | url: &url::Url, 21 | ) -> Result { 22 | let resp = get( 23 | &async_context.runtime, 24 | &async_context.listing_client, 25 | url.clone(), 26 | )?; 27 | let url = resp.url().clone(); 28 | let body = get_text(&async_context.runtime, resp)?; 29 | assert_if_url_has_no_trailing_slash(&url); 30 | let document = Html::parse_document(&body); 31 | let selector = Selector::parse("ul li").unwrap(); 32 | let mut items = Vec::new(); 33 | for element in document.select(&selector) { 34 | // Select first, then s 35 | let a_selector = 
Selector::parse("a").unwrap(); 36 | let span_selector = Selector::parse("span").unwrap(); 37 | let size_selector = Selector::parse("span.size").unwrap(); 38 | let date_selector = Selector::parse("span.date").unwrap(); 39 | 40 | if element.select(&span_selector).next().is_none() { 41 | info!("No in this
  • . Maybe it's a header"); 42 | continue; 43 | } 44 | 45 | let a = match element.select(&a_selector).next() { 46 | Some(a) => a, 47 | None => { 48 | return Err(anyhow!("No in given
  • ").into()); 49 | } 50 | }; 51 | let href = a.value().attr("href").unwrap(); 52 | let displayed_filename = a.inner_html(); 53 | 54 | if displayed_filename == "Parent Directory/" || href == "../" { 55 | continue; 56 | } 57 | 58 | let name = get_real_name_from_href(href); 59 | let href = url.join(href)?; 60 | let type_ = if href.as_str().ends_with('/') { 61 | FileType::Directory 62 | } else { 63 | FileType::File 64 | }; 65 | let size = element.select(&size_selector).next().unwrap().inner_html(); 66 | let size = size.trim(); 67 | let date = element.select(&date_selector).next().unwrap().inner_html(); 68 | let date = date.trim(); 69 | 70 | // decide (guess) which time format to use 71 | let (date_fmt, _) = guess_date_fmt(date); 72 | let naive_date; 73 | let timezone; 74 | if !date_fmt_has_timezone(&date_fmt) { 75 | naive_date = NaiveDateTime::parse_from_str(date, &date_fmt)?; 76 | timezone = None; 77 | } else { 78 | let date = DateTime::parse_from_str(date, &date_fmt)?; 79 | naive_date = date.naive_utc(); 80 | timezone = Some(date.offset().to_owned()); 81 | } 82 | 83 | items.push(ListItem::new( 84 | href, 85 | name, 86 | type_, 87 | { 88 | if size == "-" { 89 | None 90 | } else { 91 | let (n_size, unit) = FileSize::get_humanized(size); 92 | Some(FileSize::HumanizedBinary(n_size, unit)) 93 | } 94 | }, 95 | naive_date, 96 | timezone, 97 | )); 98 | } 99 | 100 | Ok(ListResult::List(items)) 101 | } 102 | } 103 | 104 | #[cfg(test)] 105 | mod tests { 106 | use chrono::FixedOffset; 107 | use test_log::test; 108 | 109 | use crate::listing::SizeUnit; 110 | 111 | use super::*; 112 | use crate::parser::tests::*; 113 | 114 | #[test] 115 | fn test_gradle() { 116 | let context = init_async_context(); 117 | let items = GradleListingParser::default() 118 | .get_list( 119 | &context, 120 | &url::Url::parse("http://localhost:1921/gradle").unwrap(), 121 | ) 122 | .unwrap(); 123 | match items { 124 | ListResult::List(items) => { 125 | assert_eq!(items.len(), 64); 126 | 
assert_eq!(items[0].name, "gradle-8.10-wrapper.jar.sha256"); 127 | assert_eq!(items[0].type_, FileType::File); 128 | assert_eq!( 129 | items[0].size, 130 | Some(FileSize::HumanizedBinary(64.0, SizeUnit::B)) 131 | ); 132 | assert_eq!( 133 | items[0].mtime, 134 | NaiveDateTime::parse_from_str("14-Aug-2024 11:18", "%d-%b-%Y %H:%M").unwrap() 135 | ); 136 | assert_eq!(items[0].timezone, FixedOffset::east_opt(0),); 137 | } 138 | _ => unreachable!(), 139 | } 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /src/parser/lighttpd.rs: -------------------------------------------------------------------------------- 1 | use crate::{ 2 | listing::{FileSize, FileType, ListItem}, 3 | utils::get, 4 | }; 5 | use chrono::NaiveDateTime; 6 | use scraper::{Html, Selector}; 7 | // use tracing::debug; 8 | 9 | use super::*; 10 | use anyhow::{anyhow, Result}; 11 | 12 | #[derive(Debug, Clone, Default)] 13 | pub struct LighttpdListingParser; 14 | 15 | impl Parser for LighttpdListingParser { 16 | fn name(&self) -> &'static str { 17 | "Lighttpd" 18 | } 19 | 20 | fn get_list( 21 | &self, 22 | async_context: &AsyncContext, 23 | url: &url::Url, 24 | ) -> Result { 25 | let resp = get( 26 | &async_context.runtime, 27 | &async_context.listing_client, 28 | url.clone(), 29 | )?; 30 | let url = resp.url().clone(); 31 | let body = get_text(&async_context.runtime, resp)?; 32 | assert_if_url_has_no_trailing_slash(&url); 33 | let document = Html::parse_document(&body); 34 | let selector = Selector::parse("tbody").unwrap(); 35 | let indexlist = document 36 | .select(&selector) 37 | .next() 38 | .ok_or_else(|| anyhow!("Cannot find
  • "))?; 39 | let selector = Selector::parse("tr").unwrap(); 40 | let mut items = Vec::new(); 41 | for element in indexlist.select(&selector) { 42 | let a = element 43 | .select(&Selector::parse("a").unwrap()) 44 | .next() 45 | .ok_or_else(|| anyhow!("Cannot find "))?; 46 | let mtime = element 47 | .select(&Selector::parse(".m").unwrap()) 48 | .next() 49 | .ok_or_else(|| anyhow!("Cannot find .m"))?; 50 | let size = element 51 | .select(&Selector::parse(".s").unwrap()) 52 | .next() 53 | .ok_or_else(|| anyhow!("Cannot find .s"))?; 54 | 55 | // let filetype = element.select(&Selector::parse(".t").unwrap()).next().unwrap(); 56 | 57 | let displayed_filename = a.inner_html(); 58 | if displayed_filename == ".." { 59 | continue; 60 | } 61 | let href = a 62 | .value() 63 | .attr("href") 64 | .ok_or_else(|| anyhow!("Cannot find href inside "))?; 65 | let name = get_real_name_from_href(href); 66 | let href = url.join(href)?; 67 | 68 | let type_ = if href.as_str().ends_with('/') { 69 | FileType::Directory 70 | } else { 71 | FileType::File 72 | }; 73 | 74 | let mtime = mtime.inner_html(); 75 | let mtime = mtime.trim(); 76 | let mtime = NaiveDateTime::parse_from_str(mtime, "%Y-%b-%d %H:%M:%S")?; 77 | 78 | let size = size.inner_html(); 79 | // Currently we just use simple replace to handle HTML entities 80 | // if we need a more sophisticated way to handle it, we should use a crate 81 | // like https://crates.io/crates/htmlentity 82 | let size = size.replace(" ", ""); 83 | let size = size.trim(); 84 | let size = if size == "-" { 85 | None 86 | } else { 87 | let (n_size, unit) = FileSize::get_humanized(size); 88 | Some(FileSize::HumanizedBinary(n_size, unit)) 89 | }; 90 | 91 | // debug!("{} {} {} {:?} {:?}", href, name, mtime, size, type_); 92 | items.push(ListItem::new(href, name, type_, size, mtime, None)) 93 | } 94 | 95 | Ok(ListResult::List(items)) 96 | } 97 | } 98 | 99 | #[cfg(test)] 100 | mod tests { 101 | use crate::listing::SizeUnit; 102 | 103 | use super::*; 104 | use 
crate::parser::tests::*; 105 | 106 | #[test] 107 | fn test_buildroot_root() { 108 | let context = init_async_context(); 109 | let items = LighttpdListingParser 110 | .get_list( 111 | &context, 112 | &Url::parse("http://localhost:1921/buildroot/").unwrap(), 113 | ) 114 | .unwrap(); 115 | match items { 116 | ListResult::List(items) => { 117 | assert_eq!(items[0].name, "18xx-ti-utils"); 118 | assert_eq!(items[0].type_, FileType::Directory); 119 | assert_eq!(items[0].size, None); 120 | assert_eq!( 121 | items[0].mtime, 122 | NaiveDateTime::parse_from_str("2021-01-11 15:59:23", "%Y-%m-%d %H:%M:%S") 123 | .unwrap() 124 | ); 125 | let last_item = items.last().unwrap(); 126 | assert_eq!(last_item.name, "zyre-v2.0.0.tar.gz"); 127 | assert_eq!(last_item.type_, FileType::File); 128 | assert_eq!( 129 | last_item.size, 130 | Some(FileSize::HumanizedBinary(262.1, SizeUnit::K)) 131 | ); 132 | assert_eq!( 133 | last_item.mtime, 134 | NaiveDateTime::parse_from_str("2018-03-08 11:18:46", "%Y-%m-%d %H:%M:%S") 135 | .unwrap() 136 | ); 137 | } 138 | _ => unreachable!(), 139 | } 140 | } 141 | 142 | #[test] 143 | fn test_buildroot_subfolder() { 144 | let context = init_async_context(); 145 | let items = LighttpdListingParser 146 | .get_list( 147 | &context, 148 | &Url::parse("http://localhost:1921/buildroot/acl/").unwrap(), 149 | ) 150 | .unwrap(); 151 | match items { 152 | ListResult::List(items) => { 153 | assert_eq!(items.len(), 4); 154 | assert_eq!(items[0].name, "acl-2.2.52.src.tar.gz"); 155 | assert_eq!(items[0].type_, FileType::File); 156 | assert_eq!( 157 | items[0].size, 158 | Some(FileSize::HumanizedBinary(377.5, SizeUnit::K)) 159 | ); 160 | assert_eq!( 161 | items[0].mtime, 162 | NaiveDateTime::parse_from_str("2013-05-19 06:10:38", "%Y-%m-%d %H:%M:%S") 163 | .unwrap() 164 | ); 165 | assert_eq!(items[3].name, "acl-2.3.2.tar.xz"); 166 | assert_eq!(items[3].type_, FileType::File); 167 | assert_eq!( 168 | items[3].size, 169 | Some(FileSize::HumanizedBinary(362.9, SizeUnit::K)) 170 
| ); 171 | assert_eq!( 172 | items[3].mtime, 173 | NaiveDateTime::parse_from_str("2024-02-07 03:04:10", "%Y-%m-%d %H:%M:%S") 174 | .unwrap() 175 | ); 176 | } 177 | _ => unreachable!(), 178 | } 179 | } 180 | } 181 | -------------------------------------------------------------------------------- /src/regex_manager/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod v1; 2 | pub mod v2; 3 | 4 | use std::str::FromStr; 5 | 6 | use regex::Regex; 7 | 8 | use crate::SharedArgs; 9 | 10 | // Submit an issue if you find this out-of-date! 11 | // And assuming that all vars are distro_ver 12 | const REGEX_REPLACEMENTS: &[(&str, &str)] = &[ 13 | // https://endoflife.date/debian 14 | ("${DEBIAN_CURRENT}", "(?bullseye|bookworm)"), 15 | // https://endoflife.date/ubuntu (excluding ESM) 16 | ("${UBUNTU_LTS}", "(?focal|jammy|noble)"), 17 | ("${UBUNTU_NONLTS}", "(?oracular|plucky)"), 18 | // https://endoflife.date/fedora 19 | ("${FEDORA_CURRENT}", "(?40|41|42)"), 20 | // CentOS is no longer supported -- this regex is replaced to something that could match nothing 21 | ( 22 | "${CENTOS_CURRENT}", 23 | "(?NONEXISTFILENAMESOITCOULDNEVERMATCHANYTHING)", 24 | ), 25 | // https://endoflife.date/rhel (excluding ELCS) 26 | ("${RHEL_CURRENT}", "(?8|9)"), 27 | // https://endoflife.date/opensuse 28 | ("${OPENSUSE_CURRENT}", "(?15.6)"), 29 | // https://endoflife.date/sles 30 | ("${SLES_CURRENT}", "(?15)"), 31 | ]; 32 | 33 | /// ExpandedRegex contains inner and rev_inner, and would transparently add '/' before string 34 | /// (and convert regex with ^). A warning would be given if text input contains '/' at front. 
#[derive(Debug, Clone)]
pub struct ExpandedRegex {
    pub inner: Regex,
    /// v1 compatibility field
    rev_inner: Regex,
}

impl FromStr for ExpandedRegex {
    type Err = regex::Error;

    /// Compile one user-supplied pattern into the pair of regexes this type
    /// wraps: the "forward" one with every `${…}` placeholder expanded to its
    /// concrete alternation, and the v1-compatibility "reverse" one with every
    /// placeholder widened to a catch-all named group.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        // Re-anchor `^pat` as `^/pat`: matched inputs always carry a leading '/'.
        let pattern = match s.strip_prefix('^') {
            Some(rest) if !rest.starts_with('/') => format!("^/{}", rest),
            _ => s.to_string(),
        };

        // Forward form: each placeholder becomes its concrete version list.
        let mut forward = pattern.clone();
        for (placeholder, replacement) in REGEX_REPLACEMENTS {
            forward = forward.replace(placeholder, replacement);
        }

        // Reverse form: each placeholder becomes a wildcard group, so it also
        // matches the versions the forward form deliberately leaves out.
        let mut reverse = pattern;
        for (placeholder, _) in REGEX_REPLACEMENTS.iter().rev() {
            reverse = reverse.replace(placeholder, "(?<distro_ver>.+)");
        }

        Ok(Self {
            inner: Regex::new(&forward)?,
            rev_inner: Regex::new(&reverse)?,
        })
    }
}

// Delegate to inner
impl ExpandedRegex {
    /// Guarantee a leading '/' on the text to be matched, warning when a
    /// caller handed in a bare relative path.
    fn text_transform(text: &str) -> String {
        if text.starts_with('/') {
            text.to_string()
        } else {
            tracing::warn!("(unexpected internal input: string given to match_str shall start with /, anything wrong?)");
            format!("/{}", text)
        }
    }

    pub fn is_match(&self, text: &str) -> bool {
        self.inner.is_match(&Self::text_transform(text))
    }

    /// v1 compatibility method
    pub fn is_others_match(&self, text: &str) -> bool {
        let normalized = Self::text_transform(text);
        self.rev_inner.is_match(&normalized) && !self.inner.is_match(&normalized)
    }
}

#[derive(Debug, Clone, Copy, PartialEq)]
pub enum Comparison {
    Stop,
    /// v1 compatibility field
    ListOnly,
    Ok,
}

pub trait ExclusionManagerTrait: Send + Sync {
    fn match_str(&self, text: &str) -> Comparison;
}

pub fn get_exclusion_manager(shared_args: impl SharedArgs) -> Box<dyn ExclusionManagerTrait> {
    if shared_args.use_v2_exclusion() {
        Box::new(v2::ExclusionManager::new())
    } else {
        Box::new(v1::ExclusionManager::new(
            shared_args.exclude(),
107 | shared_args.include(), 108 | )) 109 | } 110 | } 111 | 112 | #[cfg(test)] 113 | mod tests { 114 | use super::*; 115 | 116 | #[test] 117 | fn test_expanded_regex() { 118 | let regex = ExpandedRegex::from_str("^/deb/dists/${DEBIAN_CURRENT}").unwrap(); 119 | assert!(regex.is_match("/deb/dists/bookworm/Release")); 120 | assert!(!regex.is_match("/deb/dists/wheezy/Release")); 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /src/regex_manager/v1.rs: -------------------------------------------------------------------------------- 1 | use super::{Comparison, ExclusionManagerTrait, ExpandedRegex}; 2 | 3 | #[derive(Debug, Clone)] 4 | pub struct ExclusionManager { 5 | /// Stop the task immediately if any of these regexes match. 6 | instant_stop_regexes: Vec, 7 | /// Continue, but don't download anything if any of these regexes match. 8 | list_only_regexes: Vec, 9 | /// Include only these regexes. 10 | include_regexes: Vec, 11 | } 12 | 13 | impl ExclusionManager { 14 | pub fn new(exclusions: &[ExpandedRegex], inclusions: &[ExpandedRegex]) -> Self { 15 | let mut instant_stop_regexes = Vec::new(); 16 | let mut list_only_regexes = Vec::new(); 17 | 18 | for exclusion in exclusions { 19 | let regex_str = exclusion.inner.as_str(); 20 | let mut flag = false; 21 | for inclusion in inclusions { 22 | if inclusion.inner.as_str().starts_with(regex_str) { 23 | list_only_regexes.push(exclusion.clone()); 24 | flag = true; 25 | break; 26 | } 27 | } 28 | if !flag { 29 | instant_stop_regexes.push(exclusion.clone()); 30 | } 31 | } 32 | 33 | Self { 34 | instant_stop_regexes, 35 | list_only_regexes, 36 | include_regexes: inclusions.to_vec(), 37 | } 38 | } 39 | } 40 | 41 | impl ExclusionManagerTrait for ExclusionManager { 42 | fn match_str(&self, text: &str) -> Comparison { 43 | for regex in &self.instant_stop_regexes { 44 | if regex.is_match(text) { 45 | return Comparison::Stop; 46 | } 47 | } 48 | for regex in &self.include_regexes { 49 | 
if regex.is_match(text) { 50 | return Comparison::Ok; 51 | } 52 | } 53 | // Performance: it is possible that a regex for inclusion shown like this: 54 | // ^fedora/${FEDORA_CURRENT} 55 | // And the remote corresponding folder has a lot of subfolders. 56 | // This is a "shortcut" to avoid checking all subfolders. 57 | for regex in &self.include_regexes { 58 | if regex.is_others_match(text) { 59 | return Comparison::Stop; 60 | } 61 | } 62 | for regex in &self.list_only_regexes { 63 | if regex.is_match(text) { 64 | return Comparison::ListOnly; 65 | } 66 | } 67 | Comparison::Ok 68 | } 69 | } 70 | 71 | #[cfg(test)] 72 | mod tests { 73 | use std::str::FromStr; 74 | 75 | use test_log::test; 76 | use tracing::debug; 77 | 78 | use super::*; 79 | 80 | #[test] 81 | fn test_exclusion() { 82 | let target = 83 | "/debian/pmg/dists/stretch/pmgtest/binary-amd64/grub-efi-amd64-bin_2.02-pve6.changelog"; 84 | let exclusions = 85 | vec![ExpandedRegex::from_str("pmg/dists/.+/pmgtest/.+changelog$").unwrap()]; 86 | let inclusions = vec![]; 87 | let exclusion_manager = ExclusionManager::new(&exclusions, &inclusions); 88 | assert_eq!(exclusion_manager.match_str(target), Comparison::Stop); 89 | } 90 | 91 | #[test] 92 | fn test_partial() { 93 | let target1 = "/yum/mysql-tools-community/fc/24/x86_64"; 94 | let target2 = "/yum/mysql-tools-community/fc/40/x86_64"; 95 | let target3 = "/yum/mysql-tools-community/fc/"; 96 | let target4 = "/yum/mysql-tools-community/fc/24/"; 97 | let target5 = "/yum/mysql-tools-community/fc/40/"; 98 | let exclusions = vec![ExpandedRegex::from_str("/fc/").unwrap()]; 99 | let inclusions = vec![ExpandedRegex::from_str("/fc/${FEDORA_CURRENT}").unwrap()]; 100 | debug!("exclusions: {:?}", exclusions); 101 | debug!("inclusions: {:?}", inclusions); 102 | let exclusion_manager = ExclusionManager::new(&exclusions, &inclusions); 103 | assert_eq!(exclusion_manager.match_str(target1), Comparison::Stop); 104 | assert_eq!(exclusion_manager.match_str(target2), Comparison::Ok); 105 
| assert_eq!(exclusion_manager.match_str(target3), Comparison::ListOnly); 106 | assert_eq!(exclusion_manager.match_str(target4), Comparison::Stop); 107 | assert_eq!(exclusion_manager.match_str(target5), Comparison::Ok); 108 | } 109 | 110 | #[test] 111 | fn test_exclude_dbg() { 112 | let target1 = "/yum/mysql-8.0-community/docker/el/8/aarch64/mysql-community-server-minimal-8.0.33-1.el8.aarch64.rpm"; 113 | let target2 = "/yum/mysql-8.0-community/docker/el/8/debuginfo/x86_64/mysql-community-server-minimal-debuginfo-8.0.24-1.el8.x86_64.rpm"; 114 | let exclusions = vec![ 115 | ExpandedRegex::from_str("/el/").unwrap(), 116 | ExpandedRegex::from_str("debuginfo").unwrap(), 117 | ]; 118 | let inclusions = vec![ExpandedRegex::from_str("/el/${RHEL_CURRENT}").unwrap()]; 119 | let exclusion_manager = ExclusionManager::new(&exclusions, &inclusions); 120 | assert_eq!(exclusion_manager.match_str(target1), Comparison::Ok); 121 | assert_eq!(exclusion_manager.match_str(target2), Comparison::Stop); 122 | } 123 | } 124 | -------------------------------------------------------------------------------- /src/regex_manager/v2.rs: -------------------------------------------------------------------------------- 1 | use std::str::FromStr; 2 | 3 | use tracing::debug; 4 | 5 | use super::{Comparison, ExclusionManagerTrait, ExpandedRegex}; 6 | 7 | #[derive(Debug, Clone)] 8 | enum RegexType { 9 | Include(ExpandedRegex), 10 | Exclude(ExpandedRegex), 11 | } 12 | 13 | #[derive(Debug, Clone)] 14 | pub struct ExclusionManager { 15 | regexes: Vec, 16 | } 17 | 18 | impl ExclusionManager { 19 | pub fn new() -> Self { 20 | // TODO: how to get correct order with clap? 
21 | let args = std::env::args().collect::>(); 22 | debug!("args: {:?}", args); 23 | let mut regexes = Vec::new(); 24 | let mut iter = args.iter().peekable(); 25 | while let Some(arg) = iter.next() { 26 | if let Some(stripped) = arg.strip_prefix("--exclude=") { 27 | regexes.push(RegexType::Exclude( 28 | ExpandedRegex::from_str(stripped).expect("unexpected exclude regex"), 29 | )); 30 | } else if let Some(stripped) = arg.strip_prefix("--include=") { 31 | regexes.push(RegexType::Include( 32 | ExpandedRegex::from_str(stripped).expect("unexpected include regex"), 33 | )); 34 | } else if arg == "--exclude" { 35 | if let Some(s) = iter.peek() { 36 | regexes.push(RegexType::Exclude( 37 | ExpandedRegex::from_str(s).expect("unexpected exclude regex"), 38 | )); 39 | } 40 | } else if arg == "--include" { 41 | if let Some(s) = iter.peek() { 42 | regexes.push(RegexType::Include( 43 | ExpandedRegex::from_str(s).expect("unexpected include regex"), 44 | )); 45 | } 46 | } 47 | } 48 | debug!("regexes: {:?}", regexes); 49 | Self { regexes } 50 | } 51 | } 52 | 53 | impl ExclusionManagerTrait for ExclusionManager { 54 | fn match_str(&self, text: &str) -> Comparison { 55 | for regex in &self.regexes { 56 | match regex { 57 | RegexType::Exclude(regex) if regex.is_match(text) => return Comparison::Stop, 58 | RegexType::Include(regex) if regex.is_match(text) => return Comparison::Ok, 59 | _ => {} 60 | } 61 | } 62 | Comparison::Ok 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/timezone.rs: -------------------------------------------------------------------------------- 1 | use crate::listing::FileType; 2 | use crate::parser::{ListResult, ParserMux}; 3 | use crate::regex_manager::{Comparison, ExclusionManagerTrait}; 4 | use crate::utils::{self, again}; 5 | use crate::utils::{head, relative_to_str}; 6 | use crate::AsyncContext; 7 | use crate::{parser, SyncArgs}; 8 | 9 | use anyhow::{bail, Result}; 10 | use chrono::{DateTime, 
FixedOffset, Utc};
use tracing::{debug, info};
use url::Url;

/// Decide which fixed UTC offset the remote listing's mtimes are expressed in.
///
/// Resolution order:
/// 1. `args.timezone` (hours east of UTC) wins when given.
/// 2. Otherwise pick a probe file URL: `args.timezone_file` when set (the
///    literal value "no" disables guessing entirely), else walk the listing
///    from `args.upstream` to find the first `FileType::File`.
/// 3. Hand the probe to `guess_remote_timezone`, which compares the listed
///    mtime against the file's HTTP response mtime.
///
/// Returns `None` when guessing is disabled or no probe file could be found.
/// Panics (via `expect`) when a given URL is invalid or guessing fails.
pub fn determinate_timezone(
    args: &SyncArgs,
    parser: &ParserMux,
    exclusion_manager: &dyn ExclusionManagerTrait,
    async_context: &AsyncContext,
) -> Option<FixedOffset> {
    match args.timezone {
        None => {
            // Check if to guess timezone
            // Some parsers like directory-lister, requires special handling for URL --
            // we cannot deduce "base" from file URL. Most normal websites work like this
            // File http://example.com/d1/f1 => Listing http://example.com/d1/
            // However directory-lister:
            // File https://example.com/d1/f1 => Listing https://example.com/?dir=d1/
            // So we have to remember the listing URL here, too
            let timezone_base_and_url = match &args.timezone_file {
                Some(f) => {
                    if f == "no" {
                        None
                    } else {
                        // Currently timezone_file could be given from CLI
                        // In this case, we still use the old logic to "guess" the listing
                        // by setting the base (listing) url to None
                        Some((None, Url::parse(f).expect("Invalid timezone file URL")))
                    }
                }
                None => {
                    // eek, try getting first file in root index
                    // Depth-first search for the first regular file reachable
                    // from `url`, honoring the exclusion manager. Returns
                    // (listing URL, file URL) on success.
                    fn find_first_file(
                        args: &SyncArgs,
                        parser: &ParserMux,
                        async_context: &AsyncContext,
                        url: &Url,
                        relative: Vec<String>,
                        exclusion_manager: &dyn ExclusionManagerTrait,
                    ) -> Option<(Option<Url>, Url)> {
                        let relative_str = relative_to_str(&relative, None);
                        // Respect user exclusions even while only probing for a timezone.
                        if exclusion_manager.match_str(&relative_str) == Comparison::Stop {
                            info!("Excluded by exclusion manager: {}", relative_str);
                            return None;
                        }
                        info!("Try finding first File in {}", url);
                        let list = again(|| Ok(parser.get_list_with_filter(async_context, url, &relative_str)?), args.retry)
                            .unwrap_or_else(|_| panic!("Failed to get list for {}. Maybe you shall disable timezone guessing?", url));
                        match list {
                            ListResult::List(list) => {
                                // Prefer a file at the current level...
                                if let Some(item) = list.iter().find(|x| x.type_ == FileType::File)
                                {
                                    info!("Find a file! URL: {}", item.url);
                                    return Some((Some(url.clone()), item.url.clone()));
                                }
                                // ...otherwise recurse into each subdirectory in turn.
                                for item in list.iter().filter(|x| x.type_ == FileType::Directory) {
                                    let mut relative = relative.clone();
                                    relative.push(item.name.clone());
                                    if let Some(res) = find_first_file(
                                        args,
                                        parser,
                                        async_context,
                                        &item.url,
                                        relative,
                                        exclusion_manager,
                                    ) {
                                        return Some(res);
                                    }
                                }
                                None
                            }
                            ListResult::Redirect(_) => {
                                info!("Get a manual redirect instead of a file");
                                None
                            }
                        }
                    }
                    find_first_file(
                        args,
                        parser,
                        async_context,
                        &args.upstream,
                        [].to_vec(),
                        exclusion_manager,
                    )
                }
            };
            match timezone_base_and_url {
                Some((timezone_base_url, timezone_url)) => {
                    let timezone = guess_remote_timezone(
                        parser,
                        async_context,
                        &args.upstream,
                        timezone_base_url,
                        timezone_url,
                    )
                    .expect("Failed to guess timezone");
                    info!("Guessed timezone: {:?}", timezone);
                    Some(timezone)
                }
                None => None,
            }
        }
        Some(tz) => {
            info!("Using timezone from argument: {:?} hrs", tz);
            Some(FixedOffset::east_opt(tz * 3600).unwrap())
        }
    }
}

/// Derive the remote server's UTC offset by comparing the mtime shown in the
/// HTML listing (offset unknown) with the same file's response mtime taken
/// from a HEAD request (offset known), rounding the difference to whole hours.
///
/// `base_url` is the listing page containing `file_url`; when `None` it is
/// derived by trimming `file_url` after its last '/'.
///
/// # Errors
/// Fails when the listing redirects, `file_url` is absent from the listing,
/// the two mtimes differ by more than 20 minutes from a whole-hour offset, or
/// the resulting offset is out of chrono's representable range.
fn guess_remote_timezone(
    parser: &ParserMux,
    async_context: &AsyncContext,
    upstream: &Url,
    base_url: Option<Url>,
    file_url: Url,
) -> Result<FixedOffset> {
    // A trailing '/' would mean a directory, which has no usable mtime here.
    assert!(!file_url.as_str().ends_with('/'));
    // trim after the latest '/'
    // TODO: improve this

    let file_url_str = file_url.as_str();
    let base_url = match base_url {
        Some(b) => b,
        None => Url::parse(&file_url_str[..=file_url_str.rfind('/').unwrap()]).unwrap(),
    };
    // Listing path relative to the upstream root, as expected by the parser.
    let relative = base_url.path().strip_prefix(upstream.path()).unwrap();
    debug!("get {relative} as relative for parser in guess remote timezone");

    info!("base: {:?}", base_url);
    info!("file: {:?}", file_url);

    let list = parser.get_list_with_filter(async_context, &base_url, relative)?;
    let list = match list {
        parser::ListResult::Redirect(_) => {
            anyhow::bail!("Redirection not supported");
        }
        parser::ListResult::List(list) => list,
    };
    debug!("{:?}", list);
    for item in list {
        if item.url == file_url {
            // access file_url with HEAD
            let resp = head(
                &async_context.runtime,
                &async_context.download_client,
                file_url,
            )?;
            let mtime = utils::get_response_mtime(&resp)?;

            // compare how many hours are there between mtime (FixedOffset) and item.mtime (Naive)
            // assuming that Naive one is UTC
            let unknown_mtime = DateTime::<Utc>::from_naive_utc_and_offset(item.mtime, Utc);
            let offset = unknown_mtime - mtime;
            let offset_minutes = offset.num_minutes();
            // Round to the nearest whole hour -- mirrors are assumed to sit in
            // whole-hour timezones; sub-hour offsets are treated as noise.
            let hrs = (offset_minutes as f64 / 60.0).round() as i32;

            // Reject the guess when the remainder after rounding exceeds 20
            // minutes: the listed and actual mtimes then disagree too much.
            let minute_delta = (hrs as i64 * 60 - offset_minutes).abs();
            if minute_delta > 20 {
                bail!("File mtime got from parser and response does not match.");
            }

            // Construct timezone by hrs
            let timezone = FixedOffset::east_opt(hrs * 3600).ok_or(anyhow::anyhow!(
                "Cannot convert to timezone (offset hour = {hrs})."
            ))?;
            info!(
                "html time: {:?}, head time: {:?}, timezone: {:?}",
                item.mtime, mtime, timezone
            );
            return Ok(timezone);
        }
    }
    anyhow::bail!("File not found")
}
--------------------------------------------------------------------------------
    in order, instead of using class name, to improve compatibility for strange pages 43 | let td_selector = Selector::parse("td").unwrap(); 44 | let mut td_iterator = element.select(&td_selector); 45 | 46 | let td_a = match td_iterator.next() { 47 | Some(tda) => tda, 48 | None => { 49 | warn!("Cannot find in this