├── fixtures ├── buildroot │ ├── mimalloc │ │ ├── build.html │ │ ├── test │ │ │ └── test │ │ └── using.html │ └── acl │ │ └── index.html ├── start_fileserver.sh ├── postgresql │ └── empty │ │ └── index.html ├── docker │ ├── armv7l │ │ └── index.html │ └── index.html ├── vscode │ └── index.html ├── misc │ └── 1 │ │ └── index.html ├── clickhouse │ ├── stable │ │ ├── index2.html │ │ └── index.html │ └── index.html ├── artifactrepo │ ├── 10 │ │ └── index.html │ └── index.html ├── mozilla │ └── OJI │ │ └── index.html ├── ghettoforge │ └── index.html ├── mysql │ └── index.html ├── zabbix │ └── index.html ├── grml │ └── index.html ├── monitoring-plugins │ └── index.html ├── loongnix │ └── index.html ├── wine-builds │ └── index.html ├── influxdata │ └── index.html └── nodejs │ ├── latest-jod │ └── index.html │ └── v4.9.1 │ └── index.html ├── tsumugu-net ├── src │ ├── lib.rs │ ├── client │ │ ├── mod.rs │ │ └── impls.rs │ └── utils.rs ├── README.md └── Cargo.toml ├── tsumugu-cli ├── src │ ├── cli │ │ ├── mod.rs │ │ └── list.rs │ ├── bar.rs │ ├── compare.rs │ └── utils.rs ├── build.rs ├── examples │ ├── vyos.yaml │ ├── python.yaml │ ├── proxmox.yaml │ ├── postgresql.yaml │ ├── wine-builds.yaml │ ├── node.yaml │ ├── mysql-repo.yaml │ ├── zerotier.yaml │ ├── openresty.yaml │ └── docker-ce.yaml ├── Cargo.toml └── README.md ├── .gitignore ├── tsumugu-parser ├── src │ ├── lib.rs │ ├── extensions │ │ ├── mod.rs │ │ ├── yum.rs │ │ └── apt.rs │ ├── utils.rs │ ├── regex_manager │ │ ├── v2.rs │ │ ├── mod.rs │ │ └── v1.rs │ ├── parser │ │ ├── s3indexbuilder.rs │ │ ├── gradle.rs │ │ ├── lighttpd.rs │ │ ├── caddy.rs │ │ ├── directory_lister.rs │ │ ├── fallback.rs │ │ ├── docker.rs │ │ └── fancyindex.rs │ ├── listing.rs │ └── timezone.rs ├── README.md └── Cargo.toml ├── shell.nix ├── Cargo.toml ├── release.sh ├── LICENSE ├── .github └── workflows │ └── rust.yml ├── README.md └── docs ├── exclusion.md └── parser.md /fixtures/buildroot/mimalloc/build.html: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /fixtures/buildroot/mimalloc/test/test: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /fixtures/buildroot/mimalloc/using.html: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tsumugu-net/src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod client; 2 | pub mod utils; 3 | -------------------------------------------------------------------------------- /fixtures/start_fileserver.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | python3 -m http.server 1921 -d $(dirname "$0") -------------------------------------------------------------------------------- /tsumugu-cli/src/cli/mod.rs: -------------------------------------------------------------------------------- 1 | mod list; 2 | mod sync; 3 | pub(crate) use list::list; 4 | pub(crate) use sync::sync; 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | Cargo.toml.cache 3 | .direnv 4 | 5 | # Add use nix if using direnv with nix 6 | .envrc 7 | 8 | .idea/ 9 | -------------------------------------------------------------------------------- /tsumugu-cli/build.rs: -------------------------------------------------------------------------------- 1 | use shadow_rs::ShadowBuilder; 2 | 3 | fn main() { 4 | ShadowBuilder::builder().build().unwrap(); 5 | } 6 | -------------------------------------------------------------------------------- /tsumugu-parser/src/lib.rs: 
-------------------------------------------------------------------------------- 1 | pub mod extensions; 2 | pub mod listing; 3 | pub mod parser; 4 | pub mod regex_manager; 5 | pub mod timezone; 6 | pub mod utils; 7 | -------------------------------------------------------------------------------- /tsumugu-net/README.md: -------------------------------------------------------------------------------- 1 | # tsumugu-net 2 | 3 | Abstraction of HTTP client (required by `tsumugu-parser`) and an implementation with `reqwest` + `tokio` (used by `tsumugu-cli`). 4 | 5 | Please refer to [project README](../README.md) for more details. 6 | -------------------------------------------------------------------------------- /tsumugu-parser/README.md: -------------------------------------------------------------------------------- 1 | # tsumugu-parser 2 | 3 | This is a parser crate for various directory listing formats, used by tsumugu's CLI. 4 | 5 | Please refer to [project README](../README.md) for more details. 6 | 7 | ## Example 8 | 9 | TBD 10 | 11 | -------------------------------------------------------------------------------- /fixtures/postgresql/empty/index.html: -------------------------------------------------------------------------------- 1 | 2 | Index of /pub/old/binary/v7.4.27/linux/rpms/redhat/rhel-4-i386/ 3 | 4 |

Index of /pub/old/binary/v7.4.27/linux/rpms/redhat/rhel-4-i386/


../
5 | 

6 | 7 | -------------------------------------------------------------------------------- /tsumugu-cli/examples/vyos.yaml: -------------------------------------------------------------------------------- 1 | envs: 2 | UPSTREAM: https://packages.vyos.net/ 3 | TSUMUGU_EXCLUDE: > 4 | --exclude [=/]tmp 5 | --exclude /legacy/ 6 | --exclude /sagitta/ 7 | --exclude /equuleus/ 8 | --exclude /circinus/ 9 | TSUMUGU_PARSER: nginx 10 | image: ustcmirror/tsumugu:latest 11 | #cron: 25 5 * * * 12 | logRotCycle: 10 13 | name: vyos 14 | storageDir: /srv/repo/vyos 15 | -------------------------------------------------------------------------------- /tsumugu-cli/examples/python.yaml: -------------------------------------------------------------------------------- 1 | envs: 2 | UPSTREAM: https://www.python.org/ftp/python/ 3 | TSUMUGU_EXCLUDE: > 4 | --exclude \.rekor$ 5 | --exclude ^/2.7.16/windows-fixed/ 6 | --exclude ^/3.12.0/tmp/ 7 | --exclude ^/mail/ 8 | --exclude ^/nt/ 9 | --exclude ^/pc/ 10 | --exclude ^/win32/ 11 | image: ustcmirror/tsumugu:latest 12 | #cron: 11 5 * * * 13 | logRotCycle: 10 14 | name: python 15 | storageDir: /srv/repo/python/ 16 | -------------------------------------------------------------------------------- /tsumugu-cli/src/bar.rs: -------------------------------------------------------------------------------- 1 | pub(crate) const TEMPLATE_DEFAULT: &str = 2 | "{msg}\n[{elapsed_precise}] {bytes}/{total_bytes} ({bytes_per_sec}, {eta})"; 3 | pub(crate) fn set_progress_bar(bar: &kyuri::Bar, len: u64, url: &url::Url) { 4 | bar.set_len(len); 5 | bar.set_message(&format!("Downloading {}", url)); 6 | bar.set_template(TEMPLATE_DEFAULT); 7 | bar.set_pos(0); 8 | bar.set_visible(true); 9 | bar.reset_created_at(); 10 | } 11 | -------------------------------------------------------------------------------- /tsumugu-cli/examples/proxmox.yaml: -------------------------------------------------------------------------------- 1 | envs: 2 | UPSTREAM: http://download.proxmox.com/ 
3 | TSUMUGU_EXCLUDE: > 4 | --exclude ^/temp 5 | --exclude /pmg/dists/.+changelog$ 6 | --exclude /devel/dists/.+changelog$ 7 | TSUMUGU_TIMEZONEFILE: http://download.proxmox.com/images/aplinfo.dat 8 | TSUMUGU_THREADS: 1 9 | image: ustcmirror/tsumugu:latest 10 | #cron: 17 5 * * * 11 | logRotCycle: 10 12 | name: proxmox 13 | storageDir: /srv/repo/proxmox/ 14 | -------------------------------------------------------------------------------- /fixtures/docker/armv7l/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Index of linux/centos/7/armv7l/ 6 | 7 | 8 |

Index of linux/centos/7/armv7l/

9 |
10 |
../
11 | nightly/                                    2020-01-21 07:38  -
12 | test/                                       2020-01-21 07:38  -
13 | 

14 | -------------------------------------------------------------------------------- /fixtures/vscode/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Index of /repos/vscode/ 4 | 5 |

Index of /repos/vscode/

6 |
../
 7 | dists/                                                                                              06-Mar-2025 07:19  481 Bytes
 8 | pool/                                                                                               05-Sep-2024 18:01  104.1 MB
 9 | 

10 | -------------------------------------------------------------------------------- /shell.nix: -------------------------------------------------------------------------------- 1 | { pkgs ? import {} }: 2 | pkgs.mkShell { 3 | nativeBuildInputs = with pkgs; [ rustc cargo gcc rustfmt clippy cargo-edit ]; 4 | 5 | # Certain Rust tools won't work without this 6 | # This can also be fixed by using oxalica/rust-overlay and specifying the rust-src extension 7 | # See https://discourse.nixos.org/t/rust-src-not-found-and-other-misadventures-of-developing-rust-on-nixos/11570/3?u=samuela. for more details. 8 | RUST_SRC_PATH = "${pkgs.rust.packages.stable.rustPlatform.rustLibSrc}"; 9 | } 10 | -------------------------------------------------------------------------------- /tsumugu-cli/examples/postgresql.yaml: -------------------------------------------------------------------------------- 1 | envs: 2 | UPSTREAM: https://apt.postgresql.org/pub/ 3 | TSUMUGU_EXCLUDE: > 4 | --include ^/repos/ 5 | --exclude ^/.+ 6 | TSUMUGU_EXTRA: > 7 | --exclusion-v2 8 | --parser-match s3-indexbuilder:/repos/yum/srpms/testing/ 9 | --parser-match s3-indexbuilder:/repos/zypp/srpms/.+ 10 | --ignore-nonexist 11 | TSUMUGU_PARSER: nginx 12 | image: ustcmirror/tsumugu:latest 13 | #cron: 13 8 * * * 14 | logRotCycle: 10 15 | name: postgresql 16 | storageDir: /srv/repo/postgresql/ 17 | -------------------------------------------------------------------------------- /fixtures/misc/1/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Misc test index 1 4 | 5 | 6 |

/etc/

7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 |
FilenameSizeLast ModifiedSHA256
passwd3.3 KB2024-08-24 15:04:11 +0000477a3d43f692aeaf1c7f40c0c91bffde3e2e638d8e90c668422373ee82a18521
20 | 21 | 22 | -------------------------------------------------------------------------------- /tsumugu-cli/examples/wine-builds.yaml: -------------------------------------------------------------------------------- 1 | envs: 2 | UPSTREAM: https://dl.winehq.org/wine-builds/ 3 | TSUMUGU_EXCLUDE: > 4 | --exclude ^/mageia/ 5 | --exclude ^/macosx/ 6 | --exclude ^/android/ 7 | --exclude ^/debian/ 8 | --include ^/debian/dists/${DEBIAN_CURRENT}/ 9 | --exclude ^/ubuntu/ 10 | --include ^/ubuntu/dists/${UBUNTU_LTS}/ 11 | --exclude ^/fedora/ 12 | --include ^/fedora/${FEDORA_CURRENT}/ 13 | TSUMUGU_PARSER: apache-f2 14 | TSUMUGU_EXTRA: --head-before-get --ignore-nonexist 15 | image: ustcmirror/tsumugu:latest 16 | #cron: 40 6 * * * 17 | logRotCycle: 10 18 | name: wine-builds 19 | storageDir: /srv/repo/wine-builds 20 | -------------------------------------------------------------------------------- /tsumugu-cli/examples/node.yaml: -------------------------------------------------------------------------------- 1 | envs: 2 | UPSTREAM: http://nodejs.org/dist/ 3 | TSUMUGU_EXCLUDE: > 4 | --exclude /docs/ 5 | --exclude /old-docs/ 6 | --exclude SHASUMS$ 7 | # nodejs page mtime and actual mtime always does not match 8 | # so we ignore package changes (assuming that this would not happen) 9 | # we need to keep index.{json,tab} to be latest and regex crate does not support look-around 10 | # so let's just skip-if-exists for node* and npm* files 11 | TSUMUGU_EXTRA: > 12 | --skip-if-exists /node[^/]+$ 13 | --skip-if-exists /npm[^/]+$ 14 | image: ustcmirror/tsumugu:latest 15 | #cron: 7 7 * * * 16 | logRotCycle: 10 17 | name: node 18 | storageDir: /srv/repo/node/ 19 | -------------------------------------------------------------------------------- /tsumugu-net/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "tsumugu-net" 3 | version = "0.20251111.0" 4 | edition = "2024" 5 | description = "Network abstraction for tsumugu" 6 
| license = "MIT" 7 | repository = "https://github.com/taoky/tsumugu" 8 | 9 | [dependencies] 10 | anyhow = { workspace = true } 11 | tracing = { workspace = true } 12 | chrono = { workspace = true } 13 | url = { workspace = true } 14 | http = { workspace = true } 15 | 16 | reqwest = { workspace = true, optional = true } 17 | tokio = { workspace = true, optional = true } 18 | 19 | [dev-dependencies] 20 | test-log = { workspace= true } 21 | 22 | [features] 23 | with-impl = [ 24 | "dep:reqwest", 25 | "dep:tokio", 26 | ] # otherwise interface only 27 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | resolver = "3" 3 | members = [ 4 | "tsumugu-parser", 5 | "tsumugu-net", 6 | "tsumugu-cli" 7 | ] 8 | 9 | [workspace.dependencies] 10 | anyhow = { version = "1.0", features = ["backtrace"] } 11 | chrono = { version = "0.4", default-features = false, features = ["clock"] } 12 | clap = { version = "4.5", features = ["derive"] } 13 | url = "2.5" 14 | tracing = "0.1" 15 | reqwest = { version = "0.12", default-features = false, features = ["stream", "gzip", "deflate", "brotli", "socks", "rustls-tls-native-roots", "http2"] } 16 | test-log = { version = "0.2", default-features = false, features = ["trace"] } 17 | tokio = { version = "1.4", features = ["rt-multi-thread"] } 18 | http = "1.3" 19 | -------------------------------------------------------------------------------- /tsumugu-cli/examples/mysql-repo.yaml: -------------------------------------------------------------------------------- 1 | envs: 2 | TSUMUGU_EXCLUDE: > 3 | --exclude /apt/dists/ 4 | --exclude /apt/pool/ 5 | --exclude /ubuntu/dists/ 6 | --include /ubuntu/dists/${UBUNTU_LTS}/ 7 | --exclude /debian/dists/ 8 | --include /debian/dists/${DEBIAN_CURRENT}/ 9 | --exclude /fc/ 10 | --include /fc/${FEDORA_CURRENT}/ 11 | --exclude /el/ 12 | --include /el/${RHEL_CURRENT}/ 13 | 
--exclude /yum/mysql-tools-preview/ 14 | --exclude dbgsym 15 | --exclude debuginfo 16 | UPSTREAM: https://repo.mysql.com/ 17 | TSUMUGU_EXTRA: --apt-packages --yum-packages --ignore-nonexist 18 | image: ustcmirror/tsumugu:latest 19 | #cron: 15 5 * * * 20 | logRotCycle: 10 21 | name: mysql-repo 22 | storageDir: /srv/repo/mysql-repo 23 | -------------------------------------------------------------------------------- /release.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | version="$1" 5 | msg="$2" 6 | 7 | function get_full_version() { 8 | local ver="$1" 9 | if [[ "$ver" == *.* ]]; then 10 | echo "0.$ver" 11 | else 12 | echo "0.$ver.0" 13 | fi 14 | } 15 | 16 | full_version=$(get_full_version "$version") 17 | prev_tag=$(git describe --tags --abbrev=0) 18 | prev_version=$(get_full_version "$prev_tag") 19 | echo "Releasing version: $full_version (previous: $prev_version)" 20 | # cargo install cargo-jump; echo "See https://github.com/taoky/cargo-jump" 21 | cargo jump --old-tag="$prev_tag" "$full_version" 22 | git commit -a -m "Bump version to $full_version" 23 | git tag "$version" -m "$msg" 24 | echo "Release prepared. Run 'git push' and 'git push --tags' to publish the release." 
-------------------------------------------------------------------------------- /tsumugu-parser/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "tsumugu-parser" 3 | version = "0.20251111.0" 4 | edition = "2024" 5 | description = "Parser for various directory listing formats" 6 | license = "MIT" 7 | repository = "https://github.com/taoky/tsumugu" 8 | 9 | [dependencies] 10 | tsumugu-net = { version = "0", path = "../tsumugu-net" } 11 | 12 | anyhow = { workspace = true } 13 | tracing = { workspace = true } 14 | clap = { workspace = true } 15 | chrono = { workspace = true } 16 | url = { workspace = true } 17 | 18 | flate2 = "1.1" 19 | apt-parser = "1.0" 20 | percent-encoding = "2.3" 21 | regex = "1.12" 22 | scraper = "0.24" 23 | thiserror = "2.0" 24 | 25 | [dev-dependencies] 26 | test-log = { workspace= true } 27 | tsumugu-net = { path = "../tsumugu-net", features = ["with-impl"]} 28 | -------------------------------------------------------------------------------- /tsumugu-cli/examples/zerotier.yaml: -------------------------------------------------------------------------------- 1 | envs: 2 | UPSTREAM: https://download.zerotier.com/ 3 | TSUMUGU_EXCLUDE: > 4 | --exclude /1.8.[123]/dist/wd$ 5 | --exclude /1.8.[123]/dist/ubiquiti/zerotier-one_latest_mips.deb$ 6 | --exclude /dist/dist/ 7 | --exclude /fc/ 8 | --include /fc/${FEDORA_CURRENT}/ 9 | --exclude /el/ 10 | --include /el/${RHEL_CURRENT}/ 11 | --exclude /debian/ 12 | --include /debian/${UBUNTU_LTS}/ 13 | --include /debian/${UBUNTU_NONLTS}/ 14 | --include /debian/${DEBIAN_CURRENT}/ 15 | # looks like that zerotier server does not report correct mtime 16 | TSUMUGU_EXTRA: > 17 | --allow-mtime-from-parser 18 | --timezone 0 19 | image: ustcmirror/tsumugu:latest 20 | #cron: 33 2 * * * 21 | logRotCycle: 10 22 | name: zerotier 23 | storageDir: /srv/repo/zerotier/ 24 | -------------------------------------------------------------------------------- 
/tsumugu-cli/examples/openresty.yaml: -------------------------------------------------------------------------------- 1 | envs: 2 | UPSTREAM: http://openresty.org/package/ 3 | TSUMUGU_EXCLUDE: > 4 | --exclude ^/centos/7/2/ 5 | --exclude ^/oracle/7/2/ 6 | --exclude ^/rhel/7/2/ 7 | --exclude ^/rhel/7Client/2/ 8 | --exclude ^/rhel/7Server/2/ 9 | --exclude ^/centos/7Client/2/ 10 | --exclude ^/centos/7Server/2/ 11 | --exclude ^/centos/8/8/ 12 | --exclude ^/rocky/8/8/ 13 | --exclude ^/alinux/ 14 | --exclude ^/tlinux/ 15 | --exclude ^/oracle/ 16 | --exclude ^/amazon/ 17 | --exclude ^/almalinux/8.*/3$ 18 | --exclude ^/rhel/8.*/3$ 19 | --exclude ^/rocky/8.*/3$ 20 | --exclude ^/centos/8.*/3$ 21 | --exclude ^/kylin/10/3$ 22 | image: ustcmirror/tsumugu:latest 23 | #cron: 13 5 * * * 24 | logRotCycle: 10 25 | name: openresty 26 | storageDir: /srv/repo/openresty/ 27 | -------------------------------------------------------------------------------- /tsumugu-cli/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "tsumugu" 3 | version = "0.20251128.0" 4 | edition = "2024" 5 | description = "A HTTP(S) syncing tool with lower overhead, for OSS mirrors" 6 | license = "MIT" 7 | repository = "https://github.com/taoky/tsumugu" 8 | 9 | [dependencies] 10 | tsumugu-parser = { version = "0", path = "../tsumugu-parser" } 11 | tsumugu-net = { version = "0", path = "../tsumugu-net", features = ["with-impl"] } 12 | 13 | clap = { workspace = true } 14 | tokio = { workspace = true } 15 | anyhow = { workspace = true } 16 | chrono = { workspace = true } 17 | url = { workspace = true } 18 | reqwest = { workspace = true } 19 | tracing = { workspace = true } 20 | 21 | walkdir = "2.5" 22 | crossbeam-deque = "0.8" 23 | humansize = "2.1" 24 | futures-util = "0.3" 25 | filetime = "0.2" 26 | kyuri = "0.2" 27 | tracing-subscriber = { version = "0.3", features = ["env-filter"] } 28 | shadow-rs = "1.4" 29 | 30 | [dev-dependencies] 31 | test-log 
= { workspace= true } 32 | 33 | [build-dependencies] 34 | shadow-rs = "1.4" 35 | -------------------------------------------------------------------------------- /tsumugu-net/src/client/mod.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use chrono::{DateTime, Utc}; 3 | use http::HeaderMap; 4 | use url::Url; 5 | 6 | pub enum RequestType { 7 | List, 8 | Download, 9 | } 10 | 11 | pub struct HttpResponse { 12 | pub body: String, 13 | pub final_url: Url, 14 | pub status_code: u16, 15 | pub content_length: Option, 16 | pub modified_time: Result>, 17 | pub headers: HeaderMap, 18 | } 19 | 20 | /// A trait for HTTP clients used by the parser. 21 | pub trait HttpClient { 22 | fn get_text_with_type(&self, url: &Url, req_type: RequestType) -> Result; 23 | fn get_text(&self, url: &Url) -> Result { 24 | self.get_text_with_type(url, RequestType::List) 25 | } 26 | 27 | fn head_with_type(&self, url: &Url, req_type: RequestType) -> Result; 28 | fn head(&self, url: &Url) -> Result { 29 | self.head_with_type(url, RequestType::List) 30 | } 31 | } 32 | 33 | #[cfg(feature = "with-impl")] 34 | pub mod impls; 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 taoky 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /fixtures/clickhouse/stable/index2.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 18 | 19 | 20 |
21 |
roottgz ⟩ stable
22 |
 
23 | clickhouse-client-23.7.3.14-arm64.tgz.sha512
176
2023-08-05T19:31:50.283Z
24 | clickhouse-client-23.7.3.14-arm64.tgz
79,837
(77.97 kb)
2023-08-05T19:30:42.398Z
25 |
26 | 27 | -------------------------------------------------------------------------------- /fixtures/clickhouse/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 18 | 19 | 20 |
21 |
root ⟩ repo-archive
22 |
 
23 | deb/ 24 | rpm/ 25 | tgz/ 26 |
 
27 |
0
2022-09-23T13:39:52.080Z
28 | CLICKHOUSE-KEY.GPG
3,133
(3.06 kb)
2022-09-23T13:53:51.925Z
29 |
30 | 31 | -------------------------------------------------------------------------------- /tsumugu-cli/examples/docker-ce.yaml: -------------------------------------------------------------------------------- 1 | envs: 2 | UPSTREAM: https://download.docker.com/ 3 | TSUMUGU_PARSER: docker 4 | TSUMUGU_EXTRA: > 5 | --timezone 0 6 | --head-before-get 7 | --skip-if-exists /static/ 8 | --skip-if-exists 0.0.0 --skip-if-exists 2019 --skip-if-exists 2018 9 | --skip-if-exists (nightly|edge|s390x|ppc64le|ppc64el|test|debug-.+)/.+\.(rpm|deb) 10 | --compare-size-only \.(rpm|deb)$ 11 | TSUMUGU_EXCLUDE: > 12 | --exclude /debian/ 13 | --include /debian/dists/${DEBIAN_CURRENT} 14 | --include /debian/gpg 15 | --exclude /fedora/ 16 | --include /fedora/docker-ce.+ 17 | --include /fedora/${FEDORA_CURRENT}/ 18 | --include /fedora/gpg 19 | --exclude /ubuntu/ 20 | --include /ubuntu/dists/${UBUNTU_LTS}/ 21 | --include /ubuntu/dists/${UBUNTU_NONLTS}/ 22 | --include /ubuntu/gpg 23 | --exclude /centos/ 24 | --include /centos/${RHEL_CURRENT}.*/ 25 | --include /centos/docker-ce.+ 26 | --include /centos/gpg 27 | --exclude /rhel/ 28 | --include /rhel/${RHEL_CURRENT}.*/ 29 | --include /rhel/docker-ce.+ 30 | --include /rhel/gpg 31 | image: ustcmirror/tsumugu:latest 32 | #cron: 10 4 * * * 33 | logRotCycle: 10 34 | name: docker-ce 35 | storageDir: /srv/repo/docker-ce 36 | -------------------------------------------------------------------------------- /fixtures/artifactrepo/10/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Index of 4 | openjdk-local/10 5 | 6 | 7 | 8 |

9 |

Index of openjdk-local/10

10 |

11 |
Name                              Last modified     Size
12 |
13 |
../
14 | openjdk-10_linux-x64_bin.tar.gz   08-Mar-2018 10:06 195.38 MB
16 | openjdk-10_osx-x64_bin.tar.gz     08-Mar-2018 10:06 191.54 MB
18 | openjdk-10_windows-x64_bin.tar.gz 08-Mar-2018 10:25 189.68 MB
20 | 
21 |
22 |
23 | ArtifactRepo/ Server Port 443 24 |
25 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /fixtures/clickhouse/stable/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 18 | 19 | 20 |
21 |
roottgz ⟩ stable
22 |
 
23 |
0
2022-09-21T12:08:53.750Z
24 | clickhouse-client-21.1.9.41.tgz.sha512
162
2023-11-24T13:12:00.540Z
25 | clickhouse-client-21.1.9.41.tgz
161,531
(157.75 kb)
2022-09-21T23:58:17.236Z
26 |
 
27 | 28 |
29 | 30 | -------------------------------------------------------------------------------- /fixtures/mozilla/OJI/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Directory Listing: /pub/OJI/ 6 | 7 | 8 |

Index of /pub/OJI/

9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 |
TypeNameSizeLast Modified
Dir..
DirMRJPlugin/
FileMRJPlugin.sit.hqx234K13-Feb-2023 04:21
43 | 44 | -------------------------------------------------------------------------------- /fixtures/ghettoforge/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Index of /distributions/gf 5 | 6 | 7 |

Index of /distributions/gf

8 |
Icon  Name                                Last modified      Size  Description
[PARENTDIR] Parent Directory - 9 | [   ] RPM-GPG-KEY-gf.el7 2014-12-30 02:53 3.0K 10 | [   ] RPM-GPG-KEY-gf.el8 2020-01-13 09:40 3.1K 11 | [   ] RPM-GPG-KEY-gf.el9 2022-08-03 11:28 1.6K 12 | [DIR] archive/ 2020-12-21 02:34 - 13 | [DIR] el/ 2022-08-02 11:57 - 14 | [   ] gf-release-latest.gf.el7.noarch.rpm 2021-08-21 10:38 8.0K 15 | [   ] gf-release-latest.gf.el8.noarch.rpm 2021-08-21 10:39 11K 16 | [   ] gf-release-latest.gf.el9.noarch.rpm 2022-08-03 12:16 9.2K 17 |
18 | 19 | -------------------------------------------------------------------------------- /fixtures/mysql/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Index of /232905/apt/ubuntu/pool/mysql-tools/m 5 | 6 | 7 |

Index of /232905/apt/ubuntu/pool/mysql-tools/m

8 |
   Name                              Last modified        Size  
 9 | 
10 | [DIR] Parent Directory 01-Jan-1970 00:00 - 11 | [DIR] mysql-community/ 19-Apr-2023 14:57 - 12 | [DIR] mysql-connector-c++/ 24-Oct-2023 18:04 - 13 | [DIR] mysql-connector-j/ 24-Oct-2023 18:17 - 14 | [DIR] mysql-connector-java/ 08-Oct-2022 08:19 - 15 | [DIR] mysql-connector-odbc/ 24-Oct-2023 17:29 - 16 | [DIR] mysql-connector-python/ 25-Oct-2023 16:10 - 17 | [DIR] mysql-router/ 24-Apr-2019 12:18 - 18 | [DIR] mysql-shell/ 19-Apr-2023 07:30 - 19 | [DIR] mysql-utilities/ 07-Nov-2017 09:27 - 20 | [DIR] mysql-workbench-community/ 19-Apr-2023 06:44 - 21 |

22 | 23 | -------------------------------------------------------------------------------- /fixtures/docker/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Index of linux/centos/ 6 | 7 | 8 |

Index of linux/centos/

9 |
10 |
../
11 | 7.0/
12 | 7.1/
13 | 7.2/
14 | 7.3/
15 | 7.4/
16 | 7.5/
17 | 7.6/
18 | 7.7/
19 | 7.8/
20 | 7.9/
21 | 7/
22 | 7Client/
23 | 7Server/
24 | 7Workstation/
25 | 8.0/
26 | 8.1/
27 | 8.2/
28 | 8.3/
29 | 8.4/
30 | 8.5/
31 | 8.6/
32 | 8.7/
33 | 8.8/
34 | 8.9/
35 | 8/
36 | 8Client/
37 | 8Server/
38 | 8Workstation/
39 | 9.0/
40 | 9.1/
41 | 9.2/
42 | 9.3/
43 | 9.4/
44 | 9.5/
45 | 9.6/
46 | 9.7/
47 | 9.8/
48 | 9.9/
49 | 9/
50 | 9Client/
51 | 9Server/
52 | 9Workstation/
53 | docker-ce-staging.repo                                                                2023-07-07 20:20:56 2.0 KiB
54 | docker-ce.repo                                                                        2023-07-07 20:20:51 1.9 KiB
55 | gpg                                                                                   2023-07-07 20:21:31 1.6 KiB
56 | 

57 | -------------------------------------------------------------------------------- /tsumugu-net/src/utils.rs: -------------------------------------------------------------------------------- 1 | use anyhow::{Result, anyhow}; 2 | use chrono::{DateTime, FixedOffset, NaiveDateTime, Utc}; 3 | 4 | pub fn parse_last_modified(last_modified: &str) -> Result> { 5 | let last_modified = match DateTime::parse_from_rfc2822(last_modified) { 6 | Ok(res) => res, 7 | Err(_) => { 8 | // Maybe header is in format like "Monday, 02-Sep-2024 09:11:16 GMT" 9 | // Parse this type of datetime 10 | 11 | // Chrono does not support %Z, but according to rfc1945, it shall always GMT. 12 | let last_modified = last_modified.trim_end_matches(" GMT"); 13 | let naive = NaiveDateTime::parse_from_str(last_modified, "%A, %d-%b-%Y %H:%M:%S")?; 14 | naive 15 | .and_local_timezone(FixedOffset::east_opt(0).unwrap()) 16 | .single() 17 | .unwrap() 18 | } 19 | }; 20 | Ok(last_modified.with_timezone(&Utc)) 21 | } 22 | 23 | pub fn last_modified_from_header(headers: &http::HeaderMap) -> Result> { 24 | let last_modified = headers 25 | .get(http::header::LAST_MODIFIED) 26 | .ok_or(anyhow!("No Last-Modified header found in response"))?; 27 | let last_modified = last_modified.to_str()?; 28 | parse_last_modified(last_modified) 29 | } 30 | 31 | #[cfg(test)] 32 | mod tests { 33 | use super::*; 34 | use test_log::test; 35 | 36 | #[test] 37 | fn test_parse_last_modified() { 38 | assert_eq!( 39 | parse_last_modified("Monday, 02-Sep-2024 09:11:16 GMT").unwrap(), 40 | DateTime::parse_from_str("2024/09/02 09:11:16 +0000", "%Y/%m/%d %H:%M:%S %z") 41 | .unwrap() 42 | .with_timezone(&Utc) 43 | ); 44 | assert_eq!( 45 | parse_last_modified("Wed, 14 Aug 2024 07:02:10 GMT").unwrap(), 46 | DateTime::parse_from_str("2024/08/14 07:02:10 +0000", "%Y/%m/%d %H:%M:%S %z") 47 | .unwrap() 48 | .with_timezone(&Utc) 49 | ); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- 
/tsumugu-parser/src/extensions/mod.rs: -------------------------------------------------------------------------------- 1 | use std::path::Path; 2 | use tracing::{info, warn}; 3 | use url::Url; 4 | 5 | mod apt; 6 | mod yum; 7 | 8 | pub struct ExtensionPackage { 9 | pub url: Url, 10 | pub relative: Vec, 11 | pub filename: String, 12 | } 13 | 14 | pub fn extension_handler( 15 | apt_packages: bool, 16 | yum_packages: bool, 17 | path: &Path, 18 | relative: &[String], 19 | url: &Url, 20 | push_func: F, 21 | ) where 22 | F: Fn(&ExtensionPackage), 23 | { 24 | if apt_packages && crate::extensions::apt::is_apt_package(path) { 25 | let packages = apt::parse_package(path, relative, url); 26 | match packages { 27 | Err(e) => { 28 | warn!("Failed to parse APT package {:?}: {:?}", path, e); 29 | } 30 | Ok(packages) => { 31 | for package in packages { 32 | info!("APT package: {:?}", package); 33 | push_func(&package.into()); 34 | } 35 | } 36 | } 37 | } 38 | if yum_packages { 39 | let is_primary = crate::extensions::yum::is_yum_primary_xml(path); 40 | let is_repomd = crate::extensions::yum::is_yum_repomd_xml(path); 41 | match (is_primary, is_repomd) { 42 | (false, false) => (), 43 | (p, r) => { 44 | assert!(!(p && r), "File is both primary and repomd"); 45 | let xml_type = if p { 46 | crate::extensions::yum::YumXmlType::Primary 47 | } else { 48 | crate::extensions::yum::YumXmlType::Repomd 49 | }; 50 | let packages = yum::parse_package(path, relative, url, xml_type); 51 | match packages { 52 | Err(e) => { 53 | warn!("Failed to parse YUM file {:?}: {:?}", path, e); 54 | } 55 | Ok(packages) => { 56 | for package in packages { 57 | info!("YUM package: {:?}", package); 58 | push_func(&package.into()); 59 | } 60 | } 61 | } 62 | } 63 | } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /tsumugu-parser/src/utils.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use 
tracing::warn; 3 | 4 | pub fn again Result>( 5 | mut f: F, 6 | retries: usize, 7 | ) -> Result { 8 | for attempt in 0..=retries { 9 | match f() { 10 | Ok(v) => return Ok(v), 11 | Err(e) if attempt == retries => return Err(e), 12 | Err(e) => { 13 | warn!("Error: {:?}. retry {}/{}", e, attempt + 1, retries); 14 | } 15 | } 16 | } 17 | unreachable!() 18 | } 19 | 20 | pub fn relative_str_process(relative: &str) -> String { 21 | let mut r = relative.to_string(); 22 | if r.starts_with('/') { 23 | warn!("unexpected / at the beginning of relative ({r})"); 24 | } else { 25 | r.insert(0, '/'); 26 | } 27 | if r.len() != 1 { 28 | if r.ends_with('/') { 29 | warn!("unexpected / at the end of relative ({r})") 30 | } else { 31 | r.push('/') 32 | } 33 | } 34 | r 35 | } 36 | 37 | pub fn relative_to_str(relative: &[String], filename: Option<&str>) -> String { 38 | let r = relative.join("/"); 39 | let r = relative_str_process(&r); 40 | 41 | // here r already has / at the end 42 | match filename { 43 | None => r, 44 | Some(filename) => { 45 | assert!(!filename.starts_with('/') && !filename.ends_with('/')); 46 | format!("{}{}", r, filename) 47 | } 48 | } 49 | } 50 | 51 | #[cfg(test)] 52 | mod tests { 53 | use super::*; 54 | use test_log::test; 55 | 56 | #[test] 57 | fn test_relative() { 58 | let mut relative: Vec = vec![]; 59 | assert_eq!(relative_to_str(&relative, None), "/"); 60 | relative.push("debian".to_string()); 61 | assert_eq!(relative_to_str(&relative, None), "/debian/"); 62 | relative.push("dists".to_string()); 63 | assert_eq!(relative_to_str(&relative, None), "/debian/dists/"); 64 | relative.push("bookworm".to_string()); 65 | assert_eq!(relative_to_str(&relative, None), "/debian/dists/bookworm/"); 66 | assert_eq!( 67 | relative_to_str(&relative, Some("Release")), 68 | "/debian/dists/bookworm/Release" 69 | ); 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /tsumugu-parser/src/regex_manager/v2.rs: 
-------------------------------------------------------------------------------- 1 | use std::str::FromStr; 2 | 3 | use tracing::debug; 4 | 5 | use super::{Comparison, ExclusionManagerTrait, ExpandedRegex}; 6 | 7 | #[derive(Debug, Clone)] 8 | enum RegexType { 9 | Include(ExpandedRegex), 10 | Exclude(ExpandedRegex), 11 | } 12 | 13 | #[derive(Debug, Clone)] 14 | pub struct ExclusionManager { 15 | regexes: Vec, 16 | } 17 | 18 | impl ExclusionManager { 19 | pub fn new(args: &[String]) -> Self { 20 | debug!("args: {:?}", args); 21 | let mut regexes = Vec::new(); 22 | let mut iter = args.iter().peekable(); 23 | while let Some(arg) = iter.next() { 24 | if let Some(stripped) = arg.strip_prefix("--exclude=") { 25 | regexes.push(RegexType::Exclude( 26 | ExpandedRegex::from_str(stripped).expect("unexpected exclude regex"), 27 | )); 28 | } else if let Some(stripped) = arg.strip_prefix("--include=") { 29 | regexes.push(RegexType::Include( 30 | ExpandedRegex::from_str(stripped).expect("unexpected include regex"), 31 | )); 32 | } else if arg == "--exclude" { 33 | if let Some(s) = iter.peek() { 34 | regexes.push(RegexType::Exclude( 35 | ExpandedRegex::from_str(s).expect("unexpected exclude regex"), 36 | )); 37 | } 38 | } else if arg == "--include" 39 | && let Some(s) = iter.peek() 40 | { 41 | regexes.push(RegexType::Include( 42 | ExpandedRegex::from_str(s).expect("unexpected include regex"), 43 | )); 44 | } 45 | } 46 | debug!("regexes: {:?}", regexes); 47 | Self { regexes } 48 | } 49 | } 50 | 51 | impl ExclusionManagerTrait for ExclusionManager { 52 | fn match_str(&self, text: &str) -> Comparison { 53 | for regex in &self.regexes { 54 | match regex { 55 | RegexType::Exclude(regex) if regex.is_match(text) => return Comparison::Stop, 56 | RegexType::Include(regex) if regex.is_match(text) => return Comparison::Ok, 57 | _ => {} 58 | } 59 | } 60 | Comparison::Ok 61 | } 62 | } 63 | -------------------------------------------------------------------------------- 
/fixtures/zabbix/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Zabbix Cloud Images and Appliances 5 | 6 | 7 | 8 |
9 |

Zabbix Cloud Images and Appliances

10 | Zabbix is an enterprise-class open source distributed monitoring solution designed to monitor and track performance and availability of network servers, devices and other IT resources. It supports distributed and WEB monitoring, auto-discovery, and more. 11 |

12 | Zabbix appliances are available on Google Cloud Platform along with previously released Zabbix appliances, Microsoft Azure, DigitalOcean and Amazon Web Services. Zabbix is now available on all major cloud platforms. 13 |

14 | These appliances are created and officially supported by Zabbix SIA. 15 |

16 | Installation instructions are available in Zabbix Cloud Images page. 17 |

18 | If you have any problems or suggestions, please report an issue on Zabbix Bug Tracking System. 19 |

20 | If you want to get professional support, installation or upgrade service, please see our Zabbix technical support service page. 21 | 22 |
23 |
24 | 25 | Index of /zabbix/ 26 | 27 |

Index of /zabbix/


../
28 | appliances/                                        27-Jul-2020 11:06                   -
29 | binaries/                                          01-Dec-2020 20:09                   -
30 | integrations/                                      12-Nov-2021 12:30                   -
31 | nightly/                                           24-Aug-2024 12:03                   -
32 | sources/                                           14-Dec-2020 13:39                   -
33 | 

34 | 35 |
36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /tsumugu-cli/src/cli/list.rs: -------------------------------------------------------------------------------- 1 | use tsumugu_parser::{ 2 | parser::{ListResult, ParserMux}, 3 | regex_manager::Comparison, 4 | utils::relative_str_process, 5 | }; 6 | 7 | use crate::{ 8 | ListArgs, 9 | utils::{build_client, get_exclusion_manager}, 10 | }; 11 | 12 | use tsumugu_net::client::impls::TokioHttpClient; 13 | 14 | // TODO: clean code 15 | pub(crate) fn list(args: &ListArgs, bind_address: Option) -> ! { 16 | let parser = ParserMux::new( 17 | args.parser.clone(), 18 | args.parser_match.clone(), 19 | args.auto_fallback, 20 | ); 21 | let req_client = build_client(args, parser.is_auto_redirect(), bind_address.as_ref(), true); 22 | let client = TokioHttpClient { 23 | runtime: tokio::runtime::Runtime::new().unwrap(), 24 | listing_client: req_client.clone(), 25 | download_client: req_client, 26 | }; 27 | let exclusion_manager = get_exclusion_manager(args); 28 | // get relative 29 | let upstream = &args.upstream; 30 | let upstream_path = parser.get_path(upstream); 31 | let relative = upstream_path 32 | .strip_prefix(&args.upstream_base) 33 | .unwrap() 34 | .to_str() 35 | .unwrap() 36 | .to_owned(); 37 | let relative = relative_str_process(&relative); 38 | assert!(relative.starts_with('/') && relative.ends_with('/')); 39 | let list = parser 40 | .get_list_with_filter(&client, upstream, &relative) 41 | .unwrap(); 42 | let match_cmp = exclusion_manager.match_str(&relative); 43 | 44 | println!("Relative: {relative}"); 45 | println!("Exclusion: {:?}", match_cmp); 46 | if match_cmp == Comparison::Stop { 47 | tracing::warn!("This listing would NOT be accessed at all."); 48 | } 49 | match list { 50 | ListResult::Redirect(url) => { 51 | println!("Redirect to {url}"); 52 | } 53 | ListResult::List(list) => { 54 | for item in list { 55 | print!("{item}"); 56 | let new_relative = format!("{}{}", 
relative, item.name); 57 | tracing::debug!("new_relative: {new_relative}"); 58 | println!( 59 | "{}", 60 | match exclusion_manager.match_str(new_relative.as_str()) { 61 | Comparison::Stop => " (stop)", 62 | Comparison::ListOnly => " (list only)", 63 | Comparison::Ok => "", 64 | } 65 | ); 66 | } 67 | } 68 | } 69 | 70 | std::process::exit(0); 71 | } 72 | -------------------------------------------------------------------------------- /.github/workflows/rust.yml: -------------------------------------------------------------------------------- 1 | name: Rust 2 | 3 | on: 4 | - push 5 | - pull_request 6 | 7 | env: 8 | CARGO_TERM_COLOR: always 9 | 10 | jobs: 11 | build: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v5 15 | with: 16 | submodules: 'recursive' 17 | - name: Generate Cargo.toml.cache (Ignore version=) 18 | run: | 19 | sed '/^version = /d' Cargo.toml >> Cargo.toml.cache 20 | sed '/^version = /d' tsumugu-parser/Cargo.toml >> Cargo.toml.cache 21 | sed '/^version = /d' tsumugu-cli/Cargo.toml >> Cargo.toml.cache 22 | - uses: actions/cache@v4 23 | with: 24 | path: | 25 | ~/.cargo/registry/index 26 | ~/.cargo/registry/cache 27 | ~/.cargo/git 28 | target 29 | key: musl-cargo-${{ hashFiles('./Cargo.toml.cache') }} 30 | - name: Run fileserver 31 | run: | 32 | fixtures/start_fileserver.sh & 33 | 34 | - name: Remove files for a build with correct version info 35 | run: | 36 | find target/ -name 'shadow*' -exec rm -r {} + || true 37 | find target/ -name 'tsumugu' -delete || true 38 | - name: Test & Compile 39 | run: | 40 | mkdir -p ~/.cargo/{git,registry} 41 | # Fix git permission issue with Docker and shadow-rs 42 | sudo chown -R root . 
43 | docker run --rm -t \ 44 | --mount type=bind,source=${{ github.workspace }},target=/volume \ 45 | --mount type=bind,source=$HOME/.cargo/registry,target=/root/.cargo/registry \ 46 | --mount type=bind,source=$HOME/.cargo/git,target=/root/.cargo/git \ 47 | --network=host \ 48 | clux/muslrust:stable \ 49 | cargo test 50 | docker run --rm -t \ 51 | --mount type=bind,source=${{ github.workspace }},target=/volume \ 52 | --mount type=bind,source=$HOME/.cargo/registry,target=/root/.cargo/registry \ 53 | --mount type=bind,source=$HOME/.cargo/git,target=/root/.cargo/git \ 54 | --network=host \ 55 | clux/muslrust:stable \ 56 | cargo build --release 57 | sudo chown -R runner ~/.cargo/ 58 | sudo chown -R runner . 59 | # show version info 60 | RUST_LOG=debug target/x86_64-unknown-linux-musl/release/tsumugu --version 61 | 62 | - name: Deploy - Create and Upload Release 63 | if: startsWith(github.ref, 'refs/tags/') 64 | uses: ncipollo/release-action@v1 65 | with: 66 | artifacts: target/x86_64-unknown-linux-musl/release/tsumugu 67 | - name: Release to crates.io 68 | if: startsWith(github.ref, 'refs/tags/') 69 | uses: katyo/publish-crates@v2 70 | with: 71 | registry-token: ${{ secrets.CARGO_REGISTRY_TOKEN }} 72 | -------------------------------------------------------------------------------- /fixtures/grml/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Index of /pool/main/m 5 | 6 | 7 |

Index of /pool/main/m

8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
[ICO]NameLast modifiedSizeDescription

[PARENTDIR]Parent Directory  -  
[DIR]mozilla-firefox-adblock/2006-11-27 07:00 -  
[DIR]msgid-chooser/2006-11-27 07:00 -  
[DIR]magicrescue/2006-12-10 10:30 -  
[DIR]memtest86+/2007-03-27 22:41 -  
[DIR]misdn-kernel/2007-07-11 19:45 -  
[DIR]minised/2007-11-04 19:15 -  
[DIR]md5deep/2007-11-04 21:09 -  
[DIR]multiseat/2010-01-08 17:40 -  
[DIR]mdadm/2013-02-22 10:40 -  
[DIR]madwifi/2024-10-07 18:11 -  

24 | 25 | -------------------------------------------------------------------------------- /tsumugu-net/src/client/impls.rs: -------------------------------------------------------------------------------- 1 | use anyhow::{Result, anyhow}; 2 | use tracing::trace; 3 | use url::Url; 4 | 5 | use super::*; 6 | use crate::utils::parse_last_modified; 7 | 8 | pub fn get_response_mtime(resp: &reqwest::Response) -> Result> { 9 | let last_modified = resp 10 | .headers() 11 | .get("Last-Modified") 12 | .ok_or(anyhow!("Last-Modified header not found"))? 13 | .to_str()?; 14 | parse_last_modified(last_modified) 15 | } 16 | 17 | pub struct TokioHttpClient { 18 | pub runtime: tokio::runtime::Runtime, 19 | pub listing_client: reqwest::Client, 20 | pub download_client: reqwest::Client, 21 | } 22 | 23 | fn tokio_resp_to_tsumugu_resp(resp: &reqwest::Response) -> HttpResponse { 24 | let content_length = resp.content_length(); 25 | let status_code = resp.status().as_u16(); 26 | let final_url = resp.url().clone(); 27 | let modified_time = get_response_mtime(resp); 28 | let headers = resp.headers().clone(); 29 | HttpResponse { 30 | body: String::new(), 31 | final_url, 32 | status_code, 33 | content_length, 34 | modified_time, 35 | headers, 36 | } 37 | } 38 | 39 | impl TokioHttpClient { 40 | fn select_client(&self, req_type: RequestType) -> &reqwest::Client { 41 | match req_type { 42 | RequestType::List => &self.listing_client, 43 | RequestType::Download => &self.download_client, 44 | } 45 | } 46 | 47 | pub fn init_with_defaults() -> Self { 48 | let client = reqwest::Client::new(); 49 | TokioHttpClient { 50 | runtime: tokio::runtime::Runtime::new().unwrap(), 51 | listing_client: client.clone(), 52 | download_client: client, 53 | } 54 | } 55 | } 56 | 57 | impl HttpClient for TokioHttpClient { 58 | fn get_text_with_type(&self, url: &Url, req_type: RequestType) -> Result { 59 | let future = async { 60 | self.select_client(req_type) 61 | .get(url.clone()) 62 | .send() 63 | .await? 
64 | .error_for_status() 65 | }; 66 | let resp = self.runtime.block_on(future)?; 67 | let http_resp = tokio_resp_to_tsumugu_resp(&resp); 68 | let body_text = self.runtime.block_on(resp.text())?; 69 | Ok(HttpResponse { 70 | body: body_text, 71 | ..http_resp 72 | }) 73 | } 74 | fn head_with_type(&self, url: &Url, req_type: RequestType) -> Result { 75 | let future = async { 76 | self.select_client(req_type) 77 | .head(url.clone()) 78 | .send() 79 | .await? 80 | .error_for_status() 81 | }; 82 | let resp = self.runtime.block_on(future)?; 83 | trace!("HEAD {} -> {}: {:?}", url, resp.status(), resp); 84 | Ok(tokio_resp_to_tsumugu_resp(&resp)) 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /fixtures/monitoring-plugins/index.html: -------------------------------------------------------------------------------- 1 | 2 | Index of /monitoring-plugins/ 3 | 4 |

Index of /monitoring-plugins/


../
 5 | archive/                                           09-Oct-2015 16:12                   -
 6 | mib/                                               29-Nov-2013 20:20                   -
 7 | presentation/                                      26-Sep-2013 05:15                   -
 8 | snapshot/                                          10-Feb-2023 17:42                   -
 9 | monitoring-plugins-2.0.tar.gz                      11-Jul-2014 23:17             2610000
10 | monitoring-plugins-2.0.tar.gz.sha1                 11-Jul-2014 23:17                  72
11 | monitoring-plugins-2.1.1.tar.gz                    02-Dec-2014 07:46             2612331
12 | monitoring-plugins-2.1.1.tar.gz.sha1               02-Dec-2014 07:46                  74
13 | monitoring-plugins-2.1.2.tar.gz                    16-Oct-2015 17:40             2613060
14 | monitoring-plugins-2.1.2.tar.gz.sha1               16-Oct-2015 17:40                  74
15 | monitoring-plugins-2.1.tar.gz                      15-Oct-2014 20:32             2611940
16 | monitoring-plugins-2.1.tar.gz.sha1                 15-Oct-2014 20:32                  72
17 | monitoring-plugins-2.2.tar.gz                      29-Nov-2016 16:49             2461548
18 | monitoring-plugins-2.2.tar.gz.sha1                 29-Nov-2016 16:49                  72
19 | monitoring-plugins-2.3.1.tar.gz                    11-Apr-2021 17:07             2529669
20 | monitoring-plugins-2.3.1.tar.gz.sha1               11-Apr-2021 17:07                  74
21 | monitoring-plugins-2.3.2.tar.gz                    19-Oct-2022 20:58             2766966
22 | monitoring-plugins-2.3.2.tar.gz.sha1               19-Oct-2022 20:58                  74
23 | monitoring-plugins-2.3.3.tar.gz                    01-Feb-2023 21:53             2620192
24 | monitoring-plugins-2.3.3.tar.gz.sha1               01-Feb-2023 21:53                  74
25 | monitoring-plugins-2.3.tar.gz                      10-Dec-2020 05:50             2528556
26 | monitoring-plugins-2.3.tar.gz.sha1                 10-Dec-2020 05:50                  72
27 | timestamp                                          20-Jul-2023 10:46                  11
28 | 

29 | 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tsumugu 2 | 3 | An HTTP(S) syncing tool with lower overhead, for OSS mirrors. 4 | 5 | Instead of `HEAD`ing every single file, tsumugu parses directory listing HTML and downloads only files that do not seem to be up-to-date. 6 | 7 | ## Design goals 8 | 9 | To successfully sync from these domains, where lftp/rclone fail or run into difficulties: 10 | 11 | - [x] http://download.proxmox.com/ 12 | - [x] https://download.docker.com/ 13 | - [x] https://dl.winehq.org/wine-builds/ 14 | 15 | ## TODOs 16 | 17 | - [x] Add "--include": Sync even if the file is excluded by `--exclude` regex. 18 | - [x] Add support for the currently supported Debian, Ubuntu, Fedora and RHEL versions to `--include` regexes. 19 | - Something like `--include debian/${DEBIAN_VERSIONS}`? 20 | - [x] Check for APT/YUM repo integrity (avoid keeping old invalid metadata files) 21 | - (This is experimental and may not work well) 22 | 23 | ## Project structure 24 | 25 | This project uses a Cargo workspace with the following crates: 26 | 27 | - [tsumugu-net](./tsumugu-net/): Abstraction of HTTP client (required by `tsumugu-parser`) and an implementation with `reqwest` + `tokio` (used by `tsumugu-cli`). 28 | - [tsumugu-parser](./tsumugu-parser/): A parser crate for various directory listing formats. Can be reused by other projects. 29 | - [tsumugu-cli](./tsumugu-cli/): The CLI tool for syncing. For historical reasons, the crate (and the binary) is named `tsumugu`. 30 | 31 | ## Common notes 32 | 33 | ### Regex variables 34 | 35 | See [./tsumugu-parser/src/regex_manager/mod.rs](./tsumugu-parser/src/regex_manager/mod.rs) for available variables to use in inclusion and exclusion regexes. 36 | 37 | ### Exclusion and inclusion rules 38 | 39 | **There's a breaking change since 20240902. 
User regexes with `^` and `$` would be affected.** 40 | 41 | See [./docs/exclusion.md](./docs/exclusion.md). 42 | 43 | ### Deduplication 44 | 45 | Tsumugu relies on local file size and mtime to decide whether a file should be downloaded. Some file-level deduplicators like [jdupes](https://codeberg.org/jbruchon/jdupes) would ignore file mtime when deduplicating with hard links. This could be an issue for some repos: files that do not have a correct mtime locally would be redownloaded on every sync. 46 | 47 | Workarounds: 48 | 49 | - Set `--compare-size-only`. 50 | - Use filesystem-level/block-level deduplication like `zfs dedup`. 51 | - Use another file-level deduplicator which considers mtime (though I don't know which would do this). 52 | 53 | Also, if you are sure that some directory is identical to another, you could manually create a symlink for that. Tsumugu would ignore symlinks during syncing. 54 | 55 | ## Acknowledgements 56 | 57 | Special thanks to [NJU Mirror](https://mirrors.nju.edu.cn/) for extensive testing and bug reporting. 58 | 59 | ## Naming 60 | 61 | The name "tsumugu", and current branch name "pudding", are derived from the manga *A Drift Girl and a Noble Moon*. 62 | 63 |
64 | And... 65 | tsumugu, drawn as simplified version of hitori 66 | 67 | Tsumugu in the appearance of a very simplified version of Hitori (Obviously I am not very good at drawing though). 68 |
69 | 70 | Old (2020), unfinished golang version is named as "traverse", under the `main-old` branch. 71 | -------------------------------------------------------------------------------- /tsumugu-parser/src/extensions/yum.rs: -------------------------------------------------------------------------------- 1 | use std::{io::Read, path::Path}; 2 | 3 | use anyhow::Result; 4 | use flate2::read::GzDecoder; 5 | use tracing::info; 6 | use url::Url; 7 | 8 | fn get_locations_from_xml(s: &str) -> Vec { 9 | let re = regex::Regex::new(r#""#).unwrap(); 10 | let mut urls = Vec::new(); 11 | for line in s.lines() { 12 | if let Some(caps) = re.captures(line) { 13 | let url = caps.get(1).unwrap().as_str(); 14 | urls.push(url.to_string()); 15 | } 16 | } 17 | urls 18 | } 19 | 20 | pub fn is_yum_primary_xml(p: &Path) -> bool { 21 | p.file_name() 22 | .map(|f| f.to_str().unwrap()) 23 | .map(|f| f.ends_with("primary.xml.gz")) 24 | .unwrap_or(false) 25 | } 26 | 27 | // read and extract location 28 | pub fn read_primary_xml(p: &Path) -> Result> { 29 | let bytes = std::fs::read(p)?; 30 | let mut gzd = GzDecoder::new(&bytes[..]); 31 | let mut s = String::new(); 32 | gzd.read_to_string(&mut s)?; 33 | 34 | Ok(get_locations_from_xml(&s)) 35 | } 36 | 37 | pub enum YumXmlType { 38 | Primary, 39 | Repomd, 40 | } 41 | 42 | #[derive(Debug)] 43 | pub struct YumPackage { 44 | pub url: Url, 45 | pub relative: Vec, 46 | pub filename: String, 47 | } 48 | 49 | impl From for super::ExtensionPackage { 50 | fn from(val: YumPackage) -> Self { 51 | super::ExtensionPackage { 52 | url: val.url, 53 | relative: val.relative, 54 | filename: val.filename, 55 | } 56 | } 57 | } 58 | 59 | pub fn parse_package( 60 | packages_path: &Path, 61 | relative: &[String], 62 | packages_url: &Url, 63 | xml_type: YumXmlType, 64 | ) -> Result> { 65 | let packages = match xml_type { 66 | YumXmlType::Primary => read_primary_xml(packages_path)?, 67 | YumXmlType::Repomd => read_yum_repomd_xml(packages_path)?, 68 | }; 69 | let mut 
relative = relative.to_owned(); 70 | relative.pop(); // pop "repodata" 71 | 72 | let mut base_url = packages_url.clone(); 73 | base_url.path_segments_mut().unwrap().pop().pop().push(""); 74 | info!("base_url = {:?}", base_url); 75 | info!("relative = {:?}", relative); 76 | 77 | let mut res = vec![]; 78 | for package in packages { 79 | let url = base_url.join(&package)?; 80 | let split: Vec = package.split('/').map(|s| s.to_string()).collect(); 81 | let mut relative = relative.clone(); 82 | relative.append(&mut split.clone()); 83 | 84 | let basename = relative.pop().unwrap(); 85 | res.push(YumPackage { 86 | url, 87 | relative, 88 | filename: basename, 89 | }) 90 | } 91 | 92 | Ok(res) 93 | } 94 | 95 | // Well, brain-damaged mysql-repo even cannot show all primary.xml.gz... 96 | // So I have to use repomd.xml to get primary.xml.gz... 97 | // Good news is that it seems like existing functions for handling primary.xml.gz can be reused. 98 | pub fn is_yum_repomd_xml(p: &Path) -> bool { 99 | p.file_name() 100 | .map(|f| f.to_str().unwrap()) 101 | .map(|f| f == "repomd.xml") 102 | .unwrap_or(false) 103 | } 104 | 105 | pub fn read_yum_repomd_xml(p: &Path) -> Result> { 106 | let bytes = std::fs::read(p)?; 107 | let s = String::from_utf8_lossy(&bytes); 108 | 109 | Ok(get_locations_from_xml(s.as_ref())) 110 | } 111 | -------------------------------------------------------------------------------- /tsumugu-parser/src/regex_manager/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod v1; 2 | pub mod v2; 3 | 4 | use std::str::FromStr; 5 | 6 | use regex::Regex; 7 | 8 | // Submit an issue if you find this out-of-date! 
9 | // And assuming that all vars are distro_ver 10 | const REGEX_REPLACEMENTS: &[(&str, &str)] = &[ 11 | // https://endoflife.date/debian 12 | ( 13 | "${DEBIAN_CURRENT}", 14 | "(?bullseye|bookworm|trixie)", 15 | ), 16 | // https://endoflife.date/ubuntu (excluding ESM) 17 | ("${UBUNTU_LTS}", "(?jammy|noble)"), 18 | ("${UBUNTU_NONLTS}", "(?plucky|questing)"), 19 | // https://endoflife.date/fedora 20 | ("${FEDORA_CURRENT}", "(?41|42|43)"), 21 | // CentOS is no longer supported -- this regex is replaced to something that could match nothing 22 | ( 23 | "${CENTOS_CURRENT}", 24 | "(?NONEXISTFILENAMESOITCOULDNEVERMATCHANYTHING)", 25 | ), 26 | // https://endoflife.date/rhel (excluding ELCS) 27 | ("${RHEL_CURRENT}", "(?8|9|10)"), 28 | // https://endoflife.date/opensuse 29 | ("${OPENSUSE_CURRENT}", "(?15.6|16.0)"), 30 | // https://endoflife.date/sles 31 | ("${SLES_CURRENT}", "(?15)"), 32 | ]; 33 | 34 | /// ExpandedRegex contains inner and rev_inner, and would transparently add '/' before string 35 | /// (and convert regex with ^). A warning would be given if text input contains '/' at front. 
36 | #[derive(Debug, Clone)] 37 | pub struct ExpandedRegex { 38 | pub inner: Regex, 39 | /// v1 compatibility field 40 | rev_inner: Regex, 41 | } 42 | 43 | impl FromStr for ExpandedRegex { 44 | type Err = regex::Error; 45 | 46 | fn from_str(s: &str) -> Result { 47 | // If starts with ^ and not ^/, change start matching character from ^ to ^/ 48 | let s = if s.starts_with('^') && !s.starts_with("^/") { 49 | &format!("^/{}", &s[1..]) 50 | } else { 51 | s 52 | }; 53 | let mut s1 = s.to_string(); 54 | for (from, to) in REGEX_REPLACEMENTS { 55 | s1 = s1.replace(from, to); 56 | } 57 | let mut s2 = s.to_string(); 58 | for (from, _) in REGEX_REPLACEMENTS.iter().rev() { 59 | s2 = s2.replace(from, "(?.+)"); 60 | } 61 | Ok(Self { 62 | inner: Regex::new(&s1)?, 63 | rev_inner: Regex::new(&s2)?, 64 | }) 65 | } 66 | } 67 | 68 | // Delegate to inner 69 | impl ExpandedRegex { 70 | fn text_transform(text: &str) -> String { 71 | if !text.starts_with('/') { 72 | tracing::warn!( 73 | "(unexpected internal input: string given to match_str shall start with /, anything wrong?)" 74 | ); 75 | format!("/{}", text) 76 | } else { 77 | text.to_string() 78 | } 79 | } 80 | 81 | pub fn is_match(&self, text: &str) -> bool { 82 | self.inner.is_match(&Self::text_transform(text)) 83 | } 84 | 85 | /// v1 compatibility method 86 | pub fn is_others_match(&self, text: &str) -> bool { 87 | let text = &Self::text_transform(text); 88 | !self.inner.is_match(text) && self.rev_inner.is_match(text) 89 | } 90 | } 91 | 92 | #[derive(Debug, Clone, Copy, PartialEq)] 93 | pub enum Comparison { 94 | Stop, 95 | /// v1 compatibility field 96 | ListOnly, 97 | Ok, 98 | } 99 | 100 | pub trait ExclusionManagerTrait: Send + Sync { 101 | fn match_str(&self, text: &str) -> Comparison; 102 | } 103 | 104 | pub fn get_exclusion_manager_v2(args: &[String]) -> Box { 105 | Box::new(v2::ExclusionManager::new(args)) 106 | } 107 | 108 | pub fn get_exclusion_manager_v1( 109 | exclude: &[ExpandedRegex], 110 | include: &[ExpandedRegex], 
111 | ) -> Box { 112 | Box::new(v1::ExclusionManager::new(exclude, include)) 113 | } 114 | 115 | #[cfg(test)] 116 | mod tests { 117 | use super::*; 118 | 119 | #[test] 120 | fn test_expanded_regex() { 121 | let regex = ExpandedRegex::from_str("^/deb/dists/${DEBIAN_CURRENT}").unwrap(); 122 | assert!(regex.is_match("/deb/dists/bookworm/Release")); 123 | assert!(!regex.is_match("/deb/dists/wheezy/Release")); 124 | } 125 | } 126 | -------------------------------------------------------------------------------- /fixtures/artifactrepo/index.html: -------------------------------------------------------------------------------- 1 | 2 | Index of 3 | openjdk-local 4 | 5 | 6 | 7 |

8 |

Index of openjdk-local

9 |

10 |
Name         Last modified     Size
11 |
12 |
10/          22-Aug-2021 15:18 -
13 | 10.0.1/      22-Aug-2021 15:18 -
14 | 10.0.2/      22-Aug-2021 15:18 -
15 | 11.0.1/      22-Aug-2021 15:19 -
16 | 11.0.2/      22-Aug-2021 15:19 -
17 | 12/          22-Aug-2021 15:19 -
18 | 12.0.1/      22-Aug-2021 15:19 -
19 | 12.0.2/      22-Aug-2021 15:19 -
20 | 13/          22-Aug-2021 15:19 -
21 | 13.0.1/      22-Aug-2021 15:20 -
22 | 13.0.2/      22-Aug-2021 15:20 -
23 | 14/          22-Aug-2021 15:20 -
24 | 14.0.1/      22-Aug-2021 15:20 -
25 | 14.0.2/      22-Aug-2021 15:20 -
26 | 15/          22-Aug-2021 15:21 -
27 | 15.0.1/      22-Aug-2021 15:21 -
28 | 15.0.2/      22-Aug-2021 15:21 -
29 | 16/          22-Aug-2021 15:21 -
30 | 16.0.1/      22-Aug-2021 15:21 -
31 | 16.0.2/      22-Aug-2021 15:22 -
32 | 17/          15-Sep-2021 03:36 -
33 | 17.0.1/      20-Oct-2021 03:38 -
34 | 17.0.2/      19-Jan-2022 03:39 -
35 | 18/          23-Mar-2022 03:36 -
36 | 18.0.1/      21-Apr-2022 03:36 -
37 | 18.0.1.1/    03-May-2022 03:40 -
38 | 18.0.2/      20-Jul-2022 03:37 -
39 | 18.0.2.1/    20-Aug-2022 03:42 -
40 | 19/          21-Sep-2022 03:36 -
41 | 19.0.1/      19-Oct-2022 03:37 -
42 | 19.0.2/      20-Dec-2022 12:17 -
43 | 20/          14-Feb-2023 00:53 -
44 | 20.0.1/      30-Mar-2023 16:43 -
45 | 20.0.2/      26-Jun-2023 15:00 -
46 | 21/          12-Aug-2023 00:39 -
47 | 21.0.1/      06-Oct-2023 16:18 -
48 | 21.0.2/      06-Jan-2024 15:26 -
49 | 22/          17-Feb-2024 00:20 -
50 | 22.0.1/      26-Mar-2024 13:00 -
51 | 22.0.2/      20-Jun-2024 17:06 -
52 | 9/           22-Aug-2021 15:22 -
53 | 9.0.1/       21-May-2022 03:39 -
54 | 9.0.4/       22-Aug-2021 15:22 -
55 | java-jse-ri/ 22-Aug-2021 15:22 -
56 | 
57 |
58 |
59 | ArtifactRepo/ Server Port 443 60 |
61 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /docs/exclusion.md: -------------------------------------------------------------------------------- 1 | # Tsumugu exclusion/inclusion logic and rules 2 | 3 | ## v2 4 | 5 | v2 is a breaking but much simpler change. It is based on two simple rules: 6 | 7 | - Match excludes and includes by their order in argv. 8 | - If nothing is matched, include. 9 | 10 | You need `--exclusion-v2` to enable this new behavior. 11 | 12 | > [!TIP] 13 | > For `/a/b/c/d` to be included, `/`, `/a/`, `/a/b/`, `/a/b/c/` and `/a/b/c/d` itself must all be included. 14 | 15 | ## v1 16 | 17 | Currently tsumugu follows a simple algorithm to determine whether a path should be completely excluded, partially excluded, or included: 18 | 19 | 0. When parsing a regex, a `rev_inner` regex will be generated by replacing variables (`${UBUNTU_LTS}`, etc.) with `(?<distro_ver>.+)` (i.e., match anything). The `rev_inner` would be used like this: 20 | 21 | ```rust 22 | pub fn is_others_match(&self, text: &str) -> bool { 23 | !self.inner.is_match(text) && self.rev_inner.is_match(text) 24 | } 25 | ``` 26 | 27 | 1. First, users' exclusions and inclusions are preprocessed. Each **exclusion that is a prefix of any inclusion** will be put into the `list_only_regexes`; every other exclusion will be put into `instant_stop_regexes`. All inclusions are in `include_regexes`. 28 | 2. While working threads are handling listing requests: 29 | 1. Check with `instant_stop_regexes` and `include_regexes`: 30 | 31 | ```rust 32 | for regex in &self.instant_stop_regexes { 33 | if regex.is_match(text) { 34 | return Comparison::Stop; 35 | } 36 | } 37 | for regex in &self.include_regexes { 38 | if regex.is_match(text) { 39 | return Comparison::Ok; 40 | } 41 | } 42 | ``` 43 | 44 | 2. Then, the path will be checked with the `rev_inner` regex by `is_others_match()`, and is also completely excluded if it matches (a fast shortcut). 
45 | 46 | This is used for cases like Fedora -- it has many versions (currently from 1 to 40). Listing other version folders not in `${FEDORA_CURRENT}` is a waste of time and network traffic. With this trick we could skip these unmatched versions. 47 | 3. Finally, if the path matches `list_only_regexes`, files under this directory will be ignored (unless they are matched by `include_regexes`), but subdirectories will still be listed. Paths that are not matched by any regexes will be included as usual. 48 | 49 | In this process some unnecessary paths will still be listed. However, this logic suits the needs of filtering OS versions well. 50 | 51 | ## Relative path 52 | 53 | Also note that currently, this is used when generating relative path for comparison: 54 | 55 | ```rust 56 | pub fn relative_to_str(relative: &[String], filename: Option<&str>) -> String { 57 | let mut r = relative.join("/"); 58 | if r.starts_with('/') { 59 | warn!("unexpected / at the beginning of relative ({r})"); 60 | } else { 61 | r.insert(0, '/'); 62 | } 63 | if r.len() != 1 { 64 | if r.ends_with('/') { 65 | warn!("unexpected / at the end of relative ({r})") 66 | } else { 67 | r.push('/') 68 | } 69 | } 70 | 71 | // here r already has / at the end 72 | match filename { 73 | None => r, 74 | Some(filename) => { 75 | assert!(!filename.starts_with('/') && !filename.ends_with('/')); 76 | format!("{}{}", r, filename) 77 | } 78 | } 79 | } 80 | ``` 81 | 82 | As a result: 83 | 84 | 1. All relative paths for comparison have "/" at front. 85 | 2. Directory paths have "/" at back, and files don't. 86 | 87 | Examples: 88 | 89 | 1. `http://example.com/file` => `/file` 90 | 2. `http://example.com/dir` => `/dir/` 91 | 3. `http://example.com/dir/file` => `/dir/file` 92 | 93 | Note that, for compatibility reasons, the following trick is applied: a user regex which starts with `^` but not `^/` is rewritten, replacing `^` with `^/` (this might break some very rare regexes). 
94 | 95 | So you could **write `/something$` to exclude ALL files and directories with name `something`**, instead of using 2 regexes (`^something$` and `/something$`, to match `something` at root and others not in root). 96 | 97 | And also, `upstream` itself is NOT included when comparing. So if your upstream is set to `https://some.example.com/dir/`, you need to exclude `^something/` to exclude `https://some.example.com/dir/something/` instead of `^dir/something/`. 98 | 99 | Test with [tsumugu list](./parser.md#debugging), if in doubt. 100 | -------------------------------------------------------------------------------- /tsumugu-parser/src/parser/s3indexbuilder.rs: -------------------------------------------------------------------------------- 1 | // For https://github.com/mhagander/s3indexbuilder 2 | 3 | use crate::{ 4 | listing::{FileSize, FileType, ListItem}, 5 | parser::{ 6 | ListResult, Parser, ParserError, assert_if_url_has_no_trailing_slash, 7 | get_real_name_from_href, handle_net, parse_error, 8 | }, 9 | }; 10 | use anyhow::{Result, anyhow}; 11 | use chrono::{FixedOffset, NaiveDateTime}; 12 | use scraper::{Html, Selector}; 13 | 14 | use tsumugu_net::client::HttpClient; 15 | 16 | #[derive(Debug, Clone, Default)] 17 | pub struct S3Indexbuilder; 18 | 19 | impl Parser for S3Indexbuilder { 20 | fn name(&self) -> &'static str { 21 | "s3indexbuilder format" 22 | } 23 | 24 | fn get_list(&self, client: &dyn HttpClient, url: &url::Url) -> Result { 25 | let resp = handle_net!(client.get_text(url))?; 26 | let url = &resp.final_url; 27 | assert_if_url_has_no_trailing_slash(url); 28 | let document = Html::parse_document(&resp.body); 29 | let selector = Selector::parse("table").unwrap(); 30 | let table = document 31 | .select(&selector) 32 | .next() 33 | .ok_or(parse_error!("No found in document"))?; 34 | let selector = Selector::parse("tr").unwrap(); 35 | let mut items = Vec::new(); 36 | for element in table.select(&selector) { 37 | let td_selector = 
Selector::parse("td").unwrap(); 38 | let tds: Vec<_> = element.select(&td_selector).collect(); 39 | assert_eq!( 40 | tds.len(), 41 | 3, 42 | "Expected 3 "))?; 29 | let selector = Selector::parse("tr").unwrap(); 30 | let mut items = Vec::new(); 31 | for element in indexlist.select(&selector) { 32 | let a = element 33 | .select(&Selector::parse("a").unwrap()) 34 | .next() 35 | .ok_or_else(|| parse_error!("Cannot find "))?; 36 | let mtime = element 37 | .select(&Selector::parse(".m").unwrap()) 38 | .next() 39 | .ok_or_else(|| parse_error!("Cannot find .m"))?; 40 | let size = element 41 | .select(&Selector::parse(".s").unwrap()) 42 | .next() 43 | .ok_or_else(|| parse_error!("Cannot find .s"))?; 44 | 45 | let displayed_filename = a.inner_html(); 46 | if displayed_filename == ".." { 47 | continue; 48 | } 49 | let href = a 50 | .value() 51 | .attr("href") 52 | .ok_or_else(|| parse_error!("Cannot find href inside "))?; 53 | let name = get_real_name_from_href(href); 54 | let href = url.join(href)?; 55 | 56 | let type_ = if href.as_str().ends_with('/') { 57 | FileType::Directory 58 | } else { 59 | FileType::File 60 | }; 61 | 62 | let mtime = mtime.inner_html(); 63 | let mtime = mtime.trim(); 64 | let mtime = NaiveDateTime::parse_from_str(mtime, "%Y-%b-%d %H:%M:%S")?; 65 | 66 | let size = size.inner_html(); 67 | // Currently we just use simple replace to handle HTML entities 68 | // if we need a more sophisticated way to handle it, we should use a crate 69 | // like https://crates.io/crates/htmlentity 70 | let size = size.replace(" ", ""); 71 | let size = size.trim(); 72 | let size = if size == "-" { 73 | None 74 | } else { 75 | let (n_size, unit) = FileSize::get_humanized(size); 76 | Some(FileSize::HumanizedBinary(n_size, unit)) 77 | }; 78 | 79 | // debug!("{} {} {} {:?} {:?}", href, name, mtime, size, type_); 80 | items.push(ListItem::new(href, name, type_, size, mtime, None)) 81 | } 82 | 83 | Ok(ListResult::List(items)) 84 | } 85 | } 86 | 87 | #[cfg(test)] 88 | mod tests { 
89 | use crate::listing::SizeUnit; 90 | 91 | use super::*; 92 | use crate::parser::tests::*; 93 | 94 | #[test] 95 | fn test_buildroot_root() { 96 | let context = init_client(); 97 | let items = LighttpdListingParser 98 | .get_list( 99 | &context, 100 | &Url::parse("http://localhost:1921/buildroot/").unwrap(), 101 | ) 102 | .unwrap(); 103 | match items { 104 | ListResult::List(items) => { 105 | assert_eq!(items[0].name, "18xx-ti-utils"); 106 | assert_eq!(items[0].type_, FileType::Directory); 107 | assert_eq!(items[0].size, None); 108 | assert_eq!( 109 | items[0].mtime, 110 | NaiveDateTime::parse_from_str("2021-01-11 15:59:23", "%Y-%m-%d %H:%M:%S") 111 | .unwrap() 112 | ); 113 | let last_item = items.last().unwrap(); 114 | assert_eq!(last_item.name, "zyre-v2.0.0.tar.gz"); 115 | assert_eq!(last_item.type_, FileType::File); 116 | assert_eq!( 117 | last_item.size, 118 | Some(FileSize::HumanizedBinary(262.1, SizeUnit::K)) 119 | ); 120 | assert_eq!( 121 | last_item.mtime, 122 | NaiveDateTime::parse_from_str("2018-03-08 11:18:46", "%Y-%m-%d %H:%M:%S") 123 | .unwrap() 124 | ); 125 | } 126 | _ => unreachable!(), 127 | } 128 | } 129 | 130 | #[test] 131 | fn test_buildroot_subfolder() { 132 | let context = init_client(); 133 | let items = LighttpdListingParser 134 | .get_list( 135 | &context, 136 | &Url::parse("http://localhost:1921/buildroot/acl/").unwrap(), 137 | ) 138 | .unwrap(); 139 | match items { 140 | ListResult::List(items) => { 141 | assert_eq!(items.len(), 4); 142 | assert_eq!(items[0].name, "acl-2.2.52.src.tar.gz"); 143 | assert_eq!(items[0].type_, FileType::File); 144 | assert_eq!( 145 | items[0].size, 146 | Some(FileSize::HumanizedBinary(377.5, SizeUnit::K)) 147 | ); 148 | assert_eq!( 149 | items[0].mtime, 150 | NaiveDateTime::parse_from_str("2013-05-19 06:10:38", "%Y-%m-%d %H:%M:%S") 151 | .unwrap() 152 | ); 153 | assert_eq!(items[3].name, "acl-2.3.2.tar.xz"); 154 | assert_eq!(items[3].type_, FileType::File); 155 | assert_eq!( 156 | items[3].size, 157 | 
Some(FileSize::HumanizedBinary(362.9, SizeUnit::K)) 158 | ); 159 | assert_eq!( 160 | items[3].mtime, 161 | NaiveDateTime::parse_from_str("2024-02-07 03:04:10", "%Y-%m-%d %H:%M:%S") 162 | .unwrap() 163 | ); 164 | } 165 | _ => unreachable!(), 166 | } 167 | } 168 | } 169 | -------------------------------------------------------------------------------- /tsumugu-parser/src/extensions/apt.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use std::path::{Path, PathBuf}; 3 | use tracing::warn; 4 | use url::Url; 5 | /// Returns true when `p` is named `Packages` and one of its ancestor directories is named `dists`. 6 | pub fn is_apt_package(p: &Path) -> bool { 7 | // check if basename is Packages; file_name() is None for `/` or a trailing `..`, so compare Options instead of unwrapping 8 | let basename = p.file_name().and_then(|name| name.to_str()); 9 | if basename != Some("Packages") { 10 | return false; 11 | } 12 | // check if parents contain dists; ancestors() ends at the root (or an empty path), which has no file name 13 | let parents = p.ancestors(); 14 | for iter in parents { 15 | let basename = iter.file_name().and_then(|name| name.to_str()); 16 | if basename == Some("dists") { 17 | return true; 18 | } 19 | } 20 | false 21 | } 22 | 23 | // In every iter packages_path and packages_url be updated to their parents 24 | // When they reach the dists directory, return the root of debian 25 | // Otherwise when one of them reach the root, return error 26 | fn get_debian_root( 27 | packages_path: &Path, 28 | relative: &[String], 29 | packages_url: &Url, 30 | ) -> Result<(PathBuf, Vec, Url)> { 31 | fn pop(p: &mut PathBuf, r: Option<&mut Vec>, u: &mut Url) -> Result<()> { 32 | if !p.pop() { 33 | anyhow::bail!( 34 | "Cannot find debian root (path can not be popped, path = {:?})", 35 | p 36 | ); 37 | } 38 | if u.path() == "/" { 39 | anyhow::bail!( 40 | "Cannot find debian root (url can not be popped, url = {:?})", 41 | u 42 | ); 43 | } 44 | if let Some(r) = r 45 | && r.pop().is_none() 46 | { 47 | anyhow::bail!( 48 | "Cannot find debian root (relative can not be popped, relative = {:?})", 49 | r 50 | ); 51 | } 52 | u.path_segments_mut().unwrap().pop(); 53 | Ok(()) 54 | } 55 | let mut packages_path 
= packages_path.to_path_buf(); 56 | let mut relative = relative.to_owned(); 57 | let mut packages_url = packages_url.clone(); 58 | // first pop of file name to match relative 59 | pop(&mut packages_path, None, &mut packages_url)?; 60 | loop { 61 | let basename = packages_path.file_name().unwrap().to_str().unwrap(); 62 | let url_basename = packages_url.path_segments().unwrap().next_back().unwrap(); 63 | if basename == "dists" && url_basename == "dists" { 64 | // we don't wanna dists folder in return value 65 | pop(&mut packages_path, Some(&mut relative), &mut packages_url)?; 66 | // add trailing slash to packages_url 67 | packages_url.path_segments_mut().unwrap().push(""); 68 | return Ok((packages_path, relative, packages_url)); 69 | } 70 | if basename != url_basename { 71 | warn!( 72 | "basename = {}, url_basename = {}, relative = {:?}", 73 | basename, url_basename, relative 74 | ); 75 | } 76 | pop(&mut packages_path, Some(&mut relative), &mut packages_url)?; 77 | } 78 | } 79 | 80 | #[derive(Debug)] 81 | pub struct AptPackage { 82 | pub url: Url, 83 | pub relative: Vec, 84 | #[allow(dead_code)] 85 | pub size: usize, 86 | pub filename: String, 87 | } 88 | 89 | impl From for super::ExtensionPackage { 90 | fn from(val: AptPackage) -> Self { 91 | super::ExtensionPackage { 92 | url: val.url, 93 | relative: val.relative, 94 | filename: val.filename, 95 | } 96 | } 97 | } 98 | 99 | pub fn parse_package( 100 | packages_path: &Path, 101 | relative: &[String], 102 | packages_url: &Url, 103 | ) -> Result> { 104 | let data = std::fs::read_to_string(packages_path)?; 105 | let packages = apt_parser::Packages::from(&data); 106 | let (_, root_relative, debian_root_url) = 107 | get_debian_root(packages_path, relative, packages_url)?; 108 | // ignore errors 109 | let mut res = vec![]; 110 | for package in packages { 111 | let pool_url = package.filename; 112 | let size = package.size; 113 | let url = debian_root_url.join(&pool_url)?; 114 | 115 | let mut pool_split: Vec = 
pool_url.split('/').map(|s| s.to_string()).collect(); 116 | let mut relative = root_relative.clone(); 117 | relative.append(&mut pool_split); 118 | 119 | let basename = relative.pop().unwrap(); 120 | 121 | res.push(AptPackage { 122 | url, 123 | relative, 124 | size: size as usize, 125 | filename: basename, 126 | }) 127 | } 128 | 129 | Ok(res) 130 | } 131 | 132 | #[cfg(test)] 133 | mod tests { 134 | use super::*; 135 | use test_log::test; 136 | 137 | #[test] 138 | fn test_debian_root() { 139 | let packages_path = Path::new("/var/www/html/dists/buster/main/binary-amd64/Packages"); 140 | let relative = vec![ 141 | "dists".to_string(), 142 | "buster".to_string(), 143 | "main".to_string(), 144 | "binary-amd64".to_string(), 145 | ]; 146 | let packages_url = 147 | Url::parse("http://localhost/dists/buster/main/binary-amd64/Packages").unwrap(); 148 | let (debian_root_path, root_relative, debian_root_url) = 149 | get_debian_root(packages_path, &relative, &packages_url).unwrap(); 150 | assert_eq!(debian_root_path, Path::new("/var/www/html/")); 151 | assert_eq!(root_relative, Vec::::new()); 152 | assert_eq!(debian_root_url, Url::parse("http://localhost/").unwrap()); 153 | 154 | let packages_path = 155 | Path::new("/var/www/html/mysql/apt/ubuntu/dists/jammy/mysql-8.0/binary-amd64/Packages"); 156 | let relative = vec![ 157 | "apt".to_string(), 158 | "ubuntu".to_string(), 159 | "dists".to_string(), 160 | "jammy".to_string(), 161 | "mysql-8.0".to_string(), 162 | "binary-amd64".to_string(), 163 | ]; 164 | let packages_url = Url::parse( 165 | "http://repo.mysql.com/apt/ubuntu/dists/jammy/mysql-8.0/binary-amd64/Packages", 166 | ) 167 | .unwrap(); 168 | let (debian_root_path, root_relative, debian_root_url) = 169 | get_debian_root(packages_path, &relative, &packages_url).unwrap(); 170 | assert_eq!( 171 | debian_root_path, 172 | Path::new("/var/www/html/mysql/apt/ubuntu/") 173 | ); 174 | assert_eq!(root_relative, vec!["apt".to_string(), "ubuntu".to_string()]); 175 | assert_eq!( 176 | 
debian_root_url, 177 | Url::parse("http://repo.mysql.com/apt/ubuntu/").unwrap() 178 | ); 179 | } 180 | } 181 | -------------------------------------------------------------------------------- /tsumugu-cli/src/utils.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use chrono::FixedOffset; 3 | use chrono::TimeZone; 4 | use chrono::{DateTime, Utc}; 5 | use futures_util::Future; 6 | use tracing::debug; 7 | use tracing::warn; 8 | use tsumugu_parser::regex_manager::{get_exclusion_manager_v1, get_exclusion_manager_v2}; 9 | use url::Url; 10 | 11 | use crate::CommonArgs; 12 | 13 | // A simple diagnose of (frustrating) proxy settings 14 | fn proxy_precheck() { 15 | // follows the order of get_from_environment() in reqwest src/proxy.rs 16 | let mut http_proxy = std::env::var("HTTP_PROXY"); 17 | if http_proxy.is_err() { 18 | http_proxy = std::env::var("http_proxy"); 19 | } 20 | let mut https_proxy = std::env::var("HTTPS_PROXY"); 21 | if https_proxy.is_err() { 22 | https_proxy = std::env::var("https_proxy"); 23 | } 24 | let mut all_proxy = std::env::var("ALL_PROXY"); 25 | if all_proxy.is_err() { 26 | all_proxy = std::env::var("all_proxy"); 27 | } 28 | if http_proxy.is_err() && https_proxy.is_err() && all_proxy.is_err() { 29 | debug!("No proxy environment is given."); 30 | return; 31 | } 32 | fn check_format(s: &str) { 33 | if s.starts_with("socks://") { 34 | warn!("Typo in proxy env detected: use socks5:// or socks5h:// protocol please."); 35 | return; 36 | } 37 | if !s.starts_with("http://") 38 | && !s.starts_with("https://") 39 | && !s.starts_with("socks5://") 40 | && !s.starts_with("socks5h://") 41 | { 42 | warn!("Unknown protocol in proxy env, this might be silently ignored by reqwest."); 43 | return; 44 | } 45 | let url = match Url::parse(s) { 46 | Ok(s) => s, 47 | Err(e) => { 48 | warn!("Failed to parse proxy URL {}: {}", s, e); 49 | return; 50 | } 51 | }; 52 | if url.scheme() == "socks5" || url.scheme() == 
"socks5h" { 53 | // extra check for hostname resolve ability 54 | if let Err(e) = url.socket_addrs(|| Some(1080)) { 55 | warn!("Failed to get socket addr from {}: {}", url, e); 56 | warn!("This might be silently ignored later."); 57 | return; 58 | } 59 | } 60 | debug!("Seems OK with this proxy URL: {}", url); 61 | } 62 | if let Ok(s) = http_proxy { 63 | check_format(&s); 64 | } 65 | if let Ok(s) = https_proxy { 66 | check_format(&s); 67 | } 68 | if let Ok(s) = all_proxy { 69 | check_format(&s); 70 | } 71 | } 72 | 73 | // Helper structs for custom header support 74 | #[derive(Debug, Clone)] 75 | pub struct Header { 76 | pub name: reqwest::header::HeaderName, 77 | pub value: reqwest::header::HeaderValue, 78 | } 79 | 80 | pub fn headers_to_headermap(value: &[Header]) -> reqwest::header::HeaderMap { 81 | let mut headers = reqwest::header::HeaderMap::new(); 82 | for header in value.iter() { 83 | headers.insert(header.name.clone(), header.value.clone()); 84 | } 85 | headers 86 | } 87 | 88 | #[derive(Debug)] 89 | pub struct HeaderParseError; 90 | 91 | impl std::fmt::Display for HeaderParseError { 92 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 93 | write!(f, "Failed to parse header") 94 | } 95 | } 96 | 97 | impl std::error::Error for HeaderParseError {} 98 | 99 | impl std::str::FromStr for Header { 100 | type Err = HeaderParseError; 101 | 102 | fn from_str(s: &str) -> std::result::Result { 103 | let parts: Vec<&str> = s.splitn(2, ':').collect(); 104 | 105 | let name = parts[0].trim(); 106 | let value = parts[1].trim(); 107 | 108 | if parts.len() != 2 { 109 | return Err(HeaderParseError); 110 | } 111 | 112 | let header_name = 113 | reqwest::header::HeaderName::from_str(name).map_err(|_| HeaderParseError)?; 114 | let header_value = 115 | reqwest::header::HeaderValue::from_str(value).map_err(|_| HeaderParseError)?; 116 | 117 | Ok(Header { 118 | name: header_name, 119 | value: header_value, 120 | }) 121 | } 122 | } 123 | 124 | pub(crate) fn 
get_exclusion_manager( 125 | args: &CommonArgs, 126 | ) -> Box { 127 | if args.exclusion_v2 { 128 | let args = std::env::args().collect::>(); 129 | get_exclusion_manager_v2(&args) 130 | } else { 131 | get_exclusion_manager_v1(&args.exclude, &args.include) 132 | } 133 | } 134 | 135 | pub(crate) fn build_client( 136 | args: &CommonArgs, 137 | redirect: bool, 138 | bind_address: Option<&String>, 139 | auto_compress: bool, 140 | ) -> reqwest::Client { 141 | proxy_precheck(); 142 | let minute = std::time::Duration::new(60, 0); 143 | let mut builder = reqwest::Client::builder() 144 | .user_agent(args.user_agent.clone()) 145 | .local_address(bind_address.map(|x| x.parse::().unwrap())) 146 | .default_headers(args.headers()) 147 | // hard code 1min connect/read timeout currently 148 | .connect_timeout(minute) 149 | .read_timeout(minute) 150 | .gzip(auto_compress) 151 | .brotli(auto_compress) 152 | .deflate(auto_compress); 153 | if !redirect { 154 | builder = builder.redirect(reqwest::redirect::Policy::none()); 155 | } 156 | builder.build().unwrap() 157 | } 158 | 159 | pub(crate) async fn again_async Fut>(f: F, retry: usize) -> Result 160 | where 161 | Fut: Future>, 162 | { 163 | let mut count = 0; 164 | loop { 165 | match f().await { 166 | Ok(x) => return Ok(x), 167 | Err(e) => { 168 | warn!("Error: {:?}, retrying {}/{}", e, count, retry); 169 | count += 1; 170 | if count > retry { 171 | return Err(e); 172 | } 173 | } 174 | } 175 | } 176 | } 177 | 178 | pub(crate) fn is_symlink(path: &std::path::Path) -> bool { 179 | path.symlink_metadata() 180 | .map(|m| m.file_type().is_symlink()) 181 | .unwrap_or(false) 182 | } 183 | 184 | pub(crate) fn naive_to_utc( 185 | naive: &chrono::NaiveDateTime, 186 | timezone: Option, 187 | ) -> DateTime { 188 | match timezone { 189 | None => DateTime::::from_naive_utc_and_offset(*naive, Utc), 190 | Some(timezone) => timezone.from_local_datetime(naive).unwrap().into(), 191 | } 192 | } 193 | 194 | #[cfg(test)] 195 | mod tests { 196 | use 
super::*; 197 | use test_log::test; 198 | 199 | #[test] 200 | fn test_naive_to_utc() { 201 | let naive = 202 | chrono::NaiveDateTime::parse_from_str("2021-01-01 00:00:00", "%Y-%m-%d %H:%M:%S") 203 | .unwrap(); 204 | let timezone = FixedOffset::east_opt(3600 * 8); 205 | let utc = naive_to_utc(&naive, timezone); 206 | assert_eq!(utc.to_string(), "2020-12-31 16:00:00 UTC"); 207 | let utc = naive_to_utc(&naive, None); 208 | assert_eq!(utc.to_string(), "2021-01-01 00:00:00 UTC"); 209 | } 210 | } 211 | -------------------------------------------------------------------------------- /tsumugu-parser/src/parser/caddy.rs: -------------------------------------------------------------------------------- 1 | /// A parser for default caddy file_server format 2 | use crate::listing::{FileSize, FileType, ListItem}; 3 | 4 | use tsumugu_net::client::HttpClient; 5 | 6 | use super::*; 7 | use anyhow::Result; 8 | use chrono::NaiveDateTime; 9 | use scraper::{Html, Selector}; 10 | 11 | #[derive(Debug, Clone, Default)] 12 | pub struct CaddyListingParser; 13 | 14 | impl Parser for CaddyListingParser { 15 | fn name(&self) -> &'static str { 16 | "Caddy" 17 | } 18 | 19 | fn get_list(&self, client: &dyn HttpClient, url: &url::Url) -> Result { 20 | let resp = handle_net!(client.get_text(url))?; 21 | let url: &Url = &resp.final_url; 22 | assert_if_url_has_no_trailing_slash(url); 23 | let document = Html::parse_document(&resp.body); 24 | let selector = Selector::parse("tr.file").unwrap(); 25 | let mut items = Vec::new(); 26 | for element in document.select(&selector) { 27 | // name and herf 28 | let selector = Selector::parse("td a").unwrap(); 29 | let a = element 30 | .select(&selector) 31 | .next() 32 | .ok_or(parse_error!("td a not found in element"))?; 33 | let href = a 34 | .value() 35 | .attr("href") 36 | .ok_or(parse_error!("no href found in element"))?; 37 | // Caddy file_server will append "./" to href 38 | let name = get_real_name_from_href(href) 39 | .trim_start_matches("./") 40 | 
.to_string(); 41 | let href = url.join(href)?; 42 | let type_ = if href.as_str().ends_with('/') { 43 | FileType::Directory 44 | } else { 45 | FileType::File 46 | }; 47 | // size 48 | let selector = Selector::parse("td.size div.sizebar div.sizebar-text").unwrap(); 49 | let size = match element.select(&selector).next() { 50 | Some(s) => { 51 | let size_text = s.inner_html(); 52 | // ↱  would be added by caddy when it's a symlink 53 | // https://github.com/caddyserver/caddy/commit/9338741ca79a74247ced86bc26e4994138470852 54 | let size_text = size_text.trim().trim_start_matches("↱ "); 55 | let (n_size, unit) = FileSize::get_humanized(size_text); 56 | Some(FileSize::HumanizedBinary(n_size, unit)) 57 | } 58 | None => None, 59 | }; 60 | // date 61 | let selector = Selector::parse("td.timestamp time").unwrap(); 62 | let mtime = element 63 | .select(&selector) 64 | .next() 65 | .ok_or(parse_error!("td.timestamp time not found in element"))? 66 | .value() 67 | .attr("datetime") 68 | .ok_or(parse_error!("no datetime found in (header maybe?), skipping..."); 40 | continue; 41 | } 42 | }; 43 | let a = match td_a.select(&Selector::parse("a").unwrap()).next() { 44 | Some(a) => a, 45 | None => { 46 | return Err(parse_error!("Cannot find in first cell.")); 47 | } 48 | }; 49 | let href = a 50 | .value() 51 | .attr("href") 52 | .ok_or(parse_error!("No href found in element in first cell"))?; 53 | let displayed_filename = a.inner_html(); 54 | 55 | if displayed_filename == "Parent Directory/" || href == "../" { 56 | continue; 57 | } 58 | 59 | let name = get_real_name_from_href(href); 60 | let href = url.join(href)?; 61 | let type_ = if href.as_str().ends_with('/') { 62 | FileType::Directory 63 | } else { 64 | FileType::File 65 | }; 66 | let size = td_iterator 67 | .next() 68 | .ok_or(parse_error!("Cannot get size in td"))? 69 | .inner_html(); 70 | let size = size.trim(); 71 | let date = td_iterator 72 | .next() 73 | .ok_or(parse_error!("Cannot get date in td"))? 
74 | .inner_html(); 75 | let date = &date_normalization(date.trim()); 76 | 77 | // decide (guess) which time format to use 78 | let (date_fmt, _) = guess_date_fmt(date); 79 | let naive_date; 80 | let timezone; 81 | if !date_fmt_has_timezone(&date_fmt) { 82 | naive_date = NaiveDateTime::parse_from_str(date, &date_fmt)?; 83 | timezone = None; 84 | } else { 85 | let date = DateTime::parse_from_str(date, &date_fmt)?; 86 | naive_date = date.naive_utc(); 87 | timezone = Some(date.offset().to_owned()); 88 | } 89 | 90 | items.push(ListItem::new( 91 | href, 92 | name, 93 | type_, 94 | { 95 | if size == "-" { 96 | None 97 | } else { 98 | let (n_size, unit) = FileSize::get_humanized(size); 99 | Some(FileSize::HumanizedBinary(n_size, unit)) 100 | } 101 | }, 102 | naive_date, 103 | timezone, 104 | )); 105 | } 106 | 107 | Ok(ListResult::List(items)) 108 | } 109 | } 110 | 111 | #[cfg(test)] 112 | mod tests { 113 | use chrono::FixedOffset; 114 | 115 | use super::*; 116 | use crate::listing::SizeUnit; 117 | use crate::parser::tests::*; 118 | 119 | #[test] 120 | fn test_njumirrors() { 121 | let context = init_client(); 122 | let items = FancyIndexListingParser 123 | .get_list( 124 | &context, 125 | &Url::parse("http://localhost:1921/bmclapi/").unwrap(), 126 | ) 127 | .unwrap(); 128 | match items { 129 | ListResult::List(items) => { 130 | assert_eq!(items[0].name, "bouncycastle"); 131 | assert_eq!(items[0].type_, FileType::Directory); 132 | assert_eq!(items[0].size, None); 133 | assert_eq!( 134 | items[0].mtime, 135 | NaiveDateTime::parse_from_str("2024-04-23 19:01:54", "%Y-%m-%d %H:%M:%S") 136 | .unwrap() 137 | ); 138 | assert_eq!(items[items.len() - 1].name, "lwjgURL"); 139 | assert_eq!(items[items.len() - 1].type_, FileType::File); 140 | assert_eq!( 141 | items[items.len() - 1].size, 142 | Some(FileSize::HumanizedBinary(1767.0, SizeUnit::B)) 143 | ); 144 | assert_eq!( 145 | items[items.len() - 1].mtime, 146 | NaiveDateTime::parse_from_str("2021-04-30 20:55:32", "%Y-%m-%d 
%H:%M:%S") 147 | .unwrap() 148 | ); 149 | } 150 | _ => unreachable!(), 151 | } 152 | } 153 | 154 | #[test] 155 | fn test_loongnix() { 156 | let context = init_client(); 157 | let items = FancyIndexListingParser 158 | .get_list( 159 | &context, 160 | &Url::parse("http://localhost:1921/loongnix/").unwrap(), 161 | ) 162 | .unwrap(); 163 | match items { 164 | ListResult::List(items) => { 165 | assert_eq!(items[0].name, "contrib"); 166 | assert_eq!(items[0].type_, FileType::Directory); 167 | assert_eq!(items[0].size, None); 168 | assert_eq!( 169 | items[0].mtime, 170 | NaiveDateTime::parse_from_str("2023-08-15 05:48", "%Y-%m-%d %H:%M").unwrap() 171 | ); 172 | assert_eq!(items[items.len() - 1].name, "Release.gpg"); 173 | assert_eq!(items[items.len() - 1].type_, FileType::File); 174 | assert_eq!( 175 | items[items.len() - 1].size, 176 | Some(FileSize::HumanizedBinary(659.0, SizeUnit::B)) 177 | ); 178 | assert_eq!( 179 | items[items.len() - 1].mtime, 180 | NaiveDateTime::parse_from_str("2023-08-15 05:48", "%Y-%m-%d %H:%M").unwrap() 181 | ); 182 | } 183 | _ => unreachable!(), 184 | } 185 | } 186 | 187 | #[test] 188 | fn test_misc_1() { 189 | // In fact this is NOT a fancyindex page, but it basically match the layout of that. 
190 | let context = init_client(); 191 | let items = FancyIndexListingParser 192 | .get_list( 193 | &context, 194 | &Url::parse("http://localhost:1921/misc/1/").unwrap(), 195 | ) 196 | .unwrap(); 197 | match items { 198 | ListResult::List(items) => { 199 | assert_eq!(items.len(), 1); 200 | assert_eq!(items[0].name, "passwd"); 201 | assert_eq!(items[0].type_, FileType::File); 202 | assert_eq!( 203 | items[0].size, 204 | Some(FileSize::HumanizedBinary(3.3, SizeUnit::K)) 205 | ); 206 | assert_eq!( 207 | items[0].mtime, 208 | NaiveDateTime::parse_from_str("2024-08-24 15:04:11", "%Y-%m-%d %H:%M:%S") 209 | .unwrap() 210 | ); 211 | assert_eq!(items[0].timezone, FixedOffset::east_opt(0),); 212 | } 213 | _ => unreachable!(), 214 | } 215 | } 216 | } 217 | --------------------------------------------------------------------------------
elements, found {}", 43 | tds.len() 44 | ); 45 | let a = tds[0] 46 | .child_elements() 47 | .next() 48 | .ok_or(parse_error!("No element found in first "))?; 49 | if a.inner_html() == "../" { 50 | continue; 51 | } 52 | let href = a 53 | .value() 54 | .attr("href") 55 | .ok_or(parse_error!("No href found in element in first "))?; 56 | let name = get_real_name_from_href(href); 57 | let href = url.join(href)?; 58 | let type_ = if href.as_str().ends_with('/') { 59 | FileType::Directory 60 | } else { 61 | FileType::File 62 | }; 63 | let date = tds[1].inner_html(); 64 | let date = date.trim(); // %d-%b-%Y %H:%M 65 | let mtime = match type_ { 66 | FileType::File => NaiveDateTime::parse_from_str(date, "%d-%b-%Y %H:%M")?, 67 | FileType::Directory => NaiveDateTime::default(), 68 | }; 69 | let size = tds[2].inner_html(); 70 | let size = size.trim(); 71 | let size = if size.is_empty() { 72 | None 73 | } else { 74 | let size = size 75 | .parse::() 76 | .map_err(|e| parse_error!("Expected size to be u64: {}", e))?; 77 | Some(FileSize::Precise(size)) 78 | }; 79 | items.push(ListItem::new( 80 | href, 81 | name, 82 | type_, 83 | size, 84 | mtime, 85 | FixedOffset::east_opt(0), 86 | )); 87 | } 88 | 89 | Ok(ListResult::List(items)) 90 | } 91 | } 92 | 93 | #[cfg(test)] 94 | mod tests { 95 | use super::*; 96 | use crate::parser::tests::*; 97 | use url::Url; 98 | 99 | #[test] 100 | fn test_postgresql_srpms_testing() { 101 | let context = init_client(); 102 | let items = S3Indexbuilder 103 | .get_list( 104 | &context, 105 | &Url::parse("http://localhost:1921/postgresql/srpms/").unwrap(), 106 | ) 107 | .unwrap(); 108 | match items { 109 | ListResult::List(items) => { 110 | assert_eq!(items.len(), 149); 111 | assert_eq!(items[0].name, "bgw_replstatus_13-1.0.6-5PGDG.f42.src.rpm"); 112 | assert_eq!(items[0].type_, FileType::File); 113 | assert_eq!(items[0].size, Some(FileSize::Precise(19340))); 114 | assert_eq!( 115 | items[0].mtime, 116 | NaiveDateTime::parse_from_str("2025-03-26 13:41", 
"%Y-%m-%d %H:%M").unwrap() 117 | ); 118 | } 119 | _ => unreachable!(), 120 | } 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /fixtures/loongnix/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Nginx Directory 7 | 8 | 9 | 10 | 11 | 12 | 18 |
19 |

Loongnix操作系统源

20 |

访问地址 http://pkg.loongnix.cn

21 |

Directory: 22 | /loongnix/dists/DaoXiangHu-cartoons/

23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 |
File Name  ↓ File Size  ↓ Date  ↓ 
--
-2023-08-15 05:48
-2023-08-15 05:48
-2023-08-15 05:48
10.0 MiB2023-08-15 05:48
659.9 KiB2023-08-15 05:48
454.6 KiB2023-08-15 05:48
7.9 KiB2023-08-15 05:48
7.2 KiB2023-08-15 05:48
659 B2023-08-15 05:48
35 | 38 | 39 | 40 | 41 | 68 | 69 | 70 | 76 | -------------------------------------------------------------------------------- /tsumugu-cli/src/compare.rs: -------------------------------------------------------------------------------- 1 | use std::path::Path; 2 | 3 | use chrono::{DateTime, FixedOffset, Utc}; 4 | use tracing::{debug, warn}; 5 | 6 | use tsumugu_parser::listing::{FileSize, FileType, ListItem, SizeUnit}; 7 | 8 | use crate::utils::naive_to_utc; 9 | 10 | pub(crate) fn compare_filetype(fstype: std::fs::FileType, tsumugu_type: FileType) -> bool { 11 | match tsumugu_type { 12 | FileType::File => fstype.is_file(), 13 | FileType::Directory => fstype.is_dir(), 14 | } 15 | } 16 | 17 | pub(crate) fn should_download_by_list( 18 | path: &Path, 19 | remote: &ListItem, 20 | remote_timezone: Option, 21 | skip_if_exists: bool, 22 | size_only: bool, 23 | ) -> bool { 24 | let local_metadata = match path.metadata() { 25 | Ok(m) => { 26 | if skip_if_exists || remote.skip_check { 27 | debug!("Skipping {:?} because it exists", path); 28 | return false; 29 | } 30 | m 31 | } 32 | Err(e) => { 33 | if e.kind() != std::io::ErrorKind::NotFound { 34 | warn!("Failed to get metadata of {:?}: {:?}", path, e); 35 | } 36 | return true; 37 | } 38 | }; 39 | if !compare_filetype(local_metadata.file_type(), remote.type_) { 40 | // TODO: delete old file which type is not correct 41 | warn!("Type mismatch: {:?} remote {:?}", path, remote.type_); 42 | return true; 43 | } 44 | let local_size = local_metadata.len(); 45 | let is_size_match = match remote.size.unwrap_or(FileSize::Precise(0)) { 46 | FileSize::Precise(size) => local_size == size, 47 | FileSize::HumanizedBinary(size, SizeUnit::B) => { 48 | // SizeUnit::B is a special case, it means the size is in bytes 49 | local_size == size as u64 50 | } 51 | // A very rough size check is used here, 52 | // as it looks like size returned by server may not be very accurate 53 | FileSize::HumanizedBinary(size, unit) => { 54 | let base = 
1024_f64.powf(unit.get_exp().into()); 55 | let lsize = local_size as f64 / base; 56 | (lsize - size).abs() < 2.0 57 | } 58 | FileSize::HumanizedDecimal(size, unit) => { 59 | let base = 1000_f64.powf(unit.get_exp().into()); 60 | let lsize = local_size as f64 / base; 61 | (lsize - size).abs() < 2.0 62 | } 63 | }; 64 | if !is_size_match { 65 | debug!( 66 | "Size mismatch: {:?} local {:?} remote {:?}", 67 | path, local_size, remote.size 68 | ); 69 | return true; 70 | } 71 | if size_only { 72 | return false; 73 | } 74 | let local_mtime: DateTime = match local_metadata.modified() { 75 | Ok(m) => m, 76 | Err(_) => { 77 | // Here we expect all fs to support mtime 78 | unreachable!() 79 | } 80 | } 81 | .into(); 82 | // Use remote timezone or not? 83 | let timezone = match remote.timezone { 84 | None => remote_timezone, 85 | Some(tz) => Some(tz), 86 | }; 87 | let remote_mtime = naive_to_utc(&remote.mtime, timezone); 88 | let offset = remote_mtime - local_mtime; 89 | debug!("DateTime offset: {:?} {:?}", path, offset); 90 | match timezone { 91 | None => { 92 | // allow an offset to up to 24hrs 93 | offset.num_hours().abs() > 24 94 | } 95 | Some(_) => { 96 | // allow an offset up to 1min 97 | offset.num_minutes().abs() > 1 98 | } 99 | } 100 | } 101 | 102 | pub(crate) fn should_download_by_header( 103 | path: &Path, 104 | resp: &reqwest::Response, 105 | size_only: bool, 106 | ) -> bool { 107 | // Construct a valid "ListItem" and pass to should_download_by_list 108 | debug!("Checking {:?} by header: {:?}", path, resp); 109 | let item = ListItem { 110 | url: resp.url().clone(), 111 | name: path.file_name().unwrap().to_str().unwrap().to_string(), 112 | type_: if resp.url().as_str().ends_with('/') { 113 | FileType::Directory 114 | } else { 115 | FileType::File 116 | }, 117 | size: Some(FileSize::Precise(match resp.content_length() { 118 | Some(l) => l, 119 | None => { 120 | warn!( 121 | "No content-length from upstream ({}), go downloading anyway", 122 | resp.url() 123 | ); 124 | 
return true; 125 | } 126 | })), 127 | mtime: match tsumugu_net::client::impls::get_response_mtime(resp) { 128 | Ok(m) => m, 129 | Err(e) => { 130 | warn!( 131 | "Cannot get mtime from {} ({}), go downloading anyway", 132 | resp.url(), 133 | e 134 | ); 135 | return true; 136 | } 137 | } 138 | .naive_utc(), 139 | timezone: None, 140 | skip_check: false, 141 | }; 142 | should_download_by_list(path, &item, FixedOffset::east_opt(0), false, size_only) 143 | } 144 | -------------------------------------------------------------------------------- /tsumugu-parser/src/regex_manager/v1.rs: -------------------------------------------------------------------------------- 1 | use super::{Comparison, ExclusionManagerTrait, ExpandedRegex}; 2 | 3 | #[derive(Debug, Clone)] 4 | pub struct ExclusionManager { 5 | /// Stop the task immediately if any of these regexes match. 6 | instant_stop_regexes: Vec, 7 | /// Continue, but don't download anything if any of these regexes match. 8 | list_only_regexes: Vec, 9 | /// Include only these regexes. 
10 | include_regexes: Vec, 11 | } 12 | 13 | impl ExclusionManager { 14 | pub fn new(exclusions: &[ExpandedRegex], inclusions: &[ExpandedRegex]) -> Self { 15 | let mut instant_stop_regexes = Vec::new(); 16 | let mut list_only_regexes = Vec::new(); 17 | 18 | for exclusion in exclusions { 19 | let regex_str = exclusion.inner.as_str(); 20 | let mut flag = false; 21 | for inclusion in inclusions { 22 | if inclusion.inner.as_str().starts_with(regex_str) { 23 | list_only_regexes.push(exclusion.clone()); 24 | flag = true; 25 | break; 26 | } 27 | } 28 | if !flag { 29 | instant_stop_regexes.push(exclusion.clone()); 30 | } 31 | } 32 | 33 | Self { 34 | instant_stop_regexes, 35 | list_only_regexes, 36 | include_regexes: inclusions.to_vec(), 37 | } 38 | } 39 | } 40 | 41 | impl ExclusionManagerTrait for ExclusionManager { 42 | fn match_str(&self, text: &str) -> Comparison { 43 | for regex in &self.instant_stop_regexes { 44 | if regex.is_match(text) { 45 | return Comparison::Stop; 46 | } 47 | } 48 | for regex in &self.include_regexes { 49 | if regex.is_match(text) { 50 | return Comparison::Ok; 51 | } 52 | } 53 | // Performance: it is possible that a regex for inclusion shown like this: 54 | // ^fedora/${FEDORA_CURRENT} 55 | // And the remote corresponding folder has a lot of subfolders. 56 | // This is a "shortcut" to avoid checking all subfolders. 
57 | for regex in &self.include_regexes { 58 | if regex.is_others_match(text) { 59 | return Comparison::Stop; 60 | } 61 | } 62 | for regex in &self.list_only_regexes { 63 | if regex.is_match(text) { 64 | return Comparison::ListOnly; 65 | } 66 | } 67 | Comparison::Ok 68 | } 69 | } 70 | 71 | #[cfg(test)] 72 | mod tests { 73 | use std::str::FromStr; 74 | 75 | use test_log::test; 76 | use tracing::debug; 77 | 78 | use super::*; 79 | 80 | #[test] 81 | fn test_exclusion() { 82 | let target = 83 | "/debian/pmg/dists/stretch/pmgtest/binary-amd64/grub-efi-amd64-bin_2.02-pve6.changelog"; 84 | let exclusions = 85 | vec![ExpandedRegex::from_str("pmg/dists/.+/pmgtest/.+changelog$").unwrap()]; 86 | let inclusions = vec![]; 87 | let exclusion_manager = ExclusionManager::new(&exclusions, &inclusions); 88 | assert_eq!(exclusion_manager.match_str(target), Comparison::Stop); 89 | } 90 | 91 | #[test] 92 | fn test_partial() { 93 | let target1 = "/yum/mysql-tools-community/fc/24/x86_64"; 94 | let target2 = "/yum/mysql-tools-community/fc/42/x86_64"; 95 | let target3 = "/yum/mysql-tools-community/fc/"; 96 | let target4 = "/yum/mysql-tools-community/fc/24/"; 97 | let target5 = "/yum/mysql-tools-community/fc/42/"; 98 | let exclusions = vec![ExpandedRegex::from_str("/fc/").unwrap()]; 99 | let inclusions = vec![ExpandedRegex::from_str("/fc/${FEDORA_CURRENT}").unwrap()]; 100 | debug!("exclusions: {:?}", exclusions); 101 | debug!("inclusions: {:?}", inclusions); 102 | let exclusion_manager = ExclusionManager::new(&exclusions, &inclusions); 103 | assert_eq!(exclusion_manager.match_str(target1), Comparison::Stop); 104 | assert_eq!(exclusion_manager.match_str(target2), Comparison::Ok); 105 | assert_eq!(exclusion_manager.match_str(target3), Comparison::ListOnly); 106 | assert_eq!(exclusion_manager.match_str(target4), Comparison::Stop); 107 | assert_eq!(exclusion_manager.match_str(target5), Comparison::Ok); 108 | } 109 | 110 | #[test] 111 | fn test_exclude_dbg() { 112 | let target1 = 
"/yum/mysql-8.0-community/docker/el/8/aarch64/mysql-community-server-minimal-8.0.33-1.el8.aarch64.rpm"; 113 | let target2 = "/yum/mysql-8.0-community/docker/el/8/debuginfo/x86_64/mysql-community-server-minimal-debuginfo-8.0.24-1.el8.x86_64.rpm"; 114 | let exclusions = vec![ 115 | ExpandedRegex::from_str("/el/").unwrap(), 116 | ExpandedRegex::from_str("debuginfo").unwrap(), 117 | ]; 118 | let inclusions = vec![ExpandedRegex::from_str("/el/${RHEL_CURRENT}").unwrap()]; 119 | let exclusion_manager = ExclusionManager::new(&exclusions, &inclusions); 120 | assert_eq!(exclusion_manager.match_str(target1), Comparison::Ok); 121 | assert_eq!(exclusion_manager.match_str(target2), Comparison::Stop); 122 | } 123 | } 124 | -------------------------------------------------------------------------------- /tsumugu-parser/src/listing.rs: -------------------------------------------------------------------------------- 1 | // Module for handling directory listing 2 | 3 | use std::fmt::Display; 4 | 5 | use chrono::{FixedOffset, NaiveDateTime}; 6 | use url::Url; 7 | 8 | #[derive(Debug, PartialEq, Clone, Copy)] 9 | pub enum FileType { 10 | File, 11 | Directory, 12 | } 13 | 14 | #[derive(Debug, PartialEq, Clone, Copy)] 15 | pub enum SizeUnit { 16 | B, 17 | K, 18 | M, 19 | G, 20 | T, 21 | P, 22 | } 23 | 24 | impl Display for SizeUnit { 25 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 26 | let unit = match self { 27 | SizeUnit::B => "B", 28 | SizeUnit::K => "K", 29 | SizeUnit::M => "M", 30 | SizeUnit::G => "G", 31 | SizeUnit::T => "T", 32 | SizeUnit::P => "P", 33 | }; 34 | write!(f, "{unit}") 35 | } 36 | } 37 | 38 | impl SizeUnit { 39 | pub fn get_exp(self) -> u32 { 40 | match self { 41 | SizeUnit::B => 0, 42 | SizeUnit::K => 1, 43 | SizeUnit::M => 2, 44 | SizeUnit::G => 3, 45 | SizeUnit::T => 4, 46 | SizeUnit::P => 5, 47 | } 48 | } 49 | } 50 | 51 | #[derive(Debug, Clone, Copy, PartialEq)] 52 | pub enum FileSize { 53 | Precise(u64), 54 | /// 1024B -> 1KiB 55 | 
HumanizedBinary(f64, SizeUnit), 56 | #[allow(dead_code)] 57 | /// 1000B -> 1KB 58 | HumanizedDecimal(f64, SizeUnit), 59 | } 60 | 61 | impl Display for FileSize { 62 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 63 | match self { 64 | FileSize::Precise(size) => write!(f, "{}", size), 65 | FileSize::HumanizedBinary(size, unit) => write!(f, "{size} {unit}"), 66 | FileSize::HumanizedDecimal(size, unit) => write!(f, "{size} {unit}"), 67 | } 68 | } 69 | } 70 | 71 | impl FileSize { 72 | pub fn get_humanized(s: &str) -> (f64, SizeUnit) { 73 | // separate numeric and unit 74 | let mut numeric = String::new(); 75 | let mut unit = String::new(); 76 | for c in s.chars() { 77 | if c.is_ascii_digit() || c == '.' { 78 | numeric.push(c); 79 | } else { 80 | unit.push(c); 81 | } 82 | } 83 | let unit = unit.to_lowercase(); 84 | let unit = unit.trim(); 85 | 86 | let numeric = numeric.parse::().unwrap(); 87 | let unit = match unit.chars().next() { 88 | None => SizeUnit::B, 89 | Some(u) => match u { 90 | 'b' => SizeUnit::B, 91 | 'k' => SizeUnit::K, 92 | 'm' => SizeUnit::M, 93 | 'g' => SizeUnit::G, 94 | 't' => SizeUnit::T, 95 | 'p' => SizeUnit::P, 96 | _ => panic!("Unknown unit: {unit}"), 97 | }, 98 | }; 99 | 100 | (numeric, unit) 101 | } 102 | 103 | pub fn get_estimated(&self) -> u64 { 104 | match self { 105 | FileSize::Precise(size) => *size, 106 | FileSize::HumanizedBinary(size, unit) => { 107 | let exp = unit.get_exp(); 108 | (size * 1024_f64.powi(exp as i32)) as u64 109 | } 110 | FileSize::HumanizedDecimal(size, unit) => { 111 | let exp = unit.get_exp(); 112 | (size * 1000_f64.powi(exp as i32)) as u64 113 | } 114 | } 115 | } 116 | } 117 | 118 | #[derive(Debug, Clone)] 119 | pub struct ListItem { 120 | pub url: Url, 121 | pub name: String, 122 | pub type_: FileType, 123 | pub size: Option, 124 | /// mtime is parsed from HTML, which is the local datetime of the "server" (not necessarily localtime or UTC) 125 | pub mtime: NaiveDateTime, 126 | /// Some HTML 
provides "timezone", parser shall set this if so (otherwise just None) 127 | pub timezone: Option, 128 | /// Don't check size and mtime: download only if the file doesn't exist. 129 | /// This is expected to be set by apt/yum parser extension (parser will not use this). 130 | pub skip_check: bool, 131 | } 132 | 133 | impl ListItem { 134 | pub fn new( 135 | url: Url, 136 | name: String, 137 | type_: FileType, 138 | size: Option, 139 | mtime: NaiveDateTime, 140 | timezone: Option, 141 | ) -> Self { 142 | Self { 143 | url, 144 | name, 145 | type_, 146 | size, 147 | mtime, 148 | timezone, 149 | skip_check: false, 150 | } 151 | } 152 | } 153 | 154 | impl Display for ListItem { 155 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 156 | let size_str = match self.size { 157 | Some(size) => size.to_string(), 158 | None => String::from("(none)"), 159 | }; 160 | let mtime_str = self.mtime.format("%Y-%m-%d %H:%M:%S").to_string(); 161 | let timezone = match self.timezone { 162 | None => "", 163 | Some(tz) => &format!("({})", tz), 164 | }; 165 | write!( 166 | f, 167 | "{} {:?} {} {}{} {}", 168 | self.url, self.type_, size_str, mtime_str, timezone, self.name 169 | ) 170 | } 171 | } 172 | -------------------------------------------------------------------------------- /tsumugu-parser/src/parser/gradle.rs: -------------------------------------------------------------------------------- 1 | use crate::listing::{FileSize, FileType, ListItem}; 2 | use chrono::{DateTime, NaiveDateTime}; 3 | use scraper::{Html, Selector}; 4 | use tracing::info; 5 | 6 | use super::*; 7 | use anyhow::Result; 8 | 9 | #[derive(Debug, Clone, Default)] 10 | pub struct GradleListingParser {} 11 | 12 | impl Parser for GradleListingParser { 13 | fn name(&self) -> &'static str { 14 | "services.gradle.org" 15 | } 16 | 17 | fn get_list(&self, client: &dyn HttpClient, url: &url::Url) -> Result { 18 | let resp = handle_net!(client.get_text(url))?; 19 | let url: &Url = &resp.final_url; 20 | 
assert_if_url_has_no_trailing_slash(url); 21 | let document = Html::parse_document(&resp.body); 22 | let selector = Selector::parse("ul li").unwrap(); 23 | let mut items = Vec::new(); 24 | for element in document.select(&selector) { 25 | // Select first, then s 26 | let a_selector = Selector::parse("a").unwrap(); 27 | let span_selector = Selector::parse("span").unwrap(); 28 | let size_selector = Selector::parse("span.size").unwrap(); 29 | let date_selector = Selector::parse("span.date").unwrap(); 30 | 31 | if element.select(&span_selector).next().is_none() { 32 | info!("No in this
  • . Maybe it's a header"); 33 | continue; 34 | } 35 | 36 | let a = match element.select(&a_selector).next() { 37 | Some(a) => a, 38 | None => { 39 | return Err(parse_error!("No in given
  • ")); 40 | } 41 | }; 42 | let href = a 43 | .value() 44 | .attr("href") 45 | .ok_or(parse_error!("No href found in element"))?; 46 | let displayed_filename = a.inner_html(); 47 | 48 | if displayed_filename == "Parent Directory/" || href == "../" { 49 | continue; 50 | } 51 | 52 | let name = get_real_name_from_href(href); 53 | let href = url.join(href)?; 54 | let type_ = if href.as_str().ends_with('/') { 55 | FileType::Directory 56 | } else { 57 | FileType::File 58 | }; 59 | let size = element 60 | .select(&size_selector) 61 | .next() 62 | .ok_or(parse_error!("Cannot get size"))? 63 | .inner_html(); 64 | let size = size.trim(); 65 | let date = element 66 | .select(&date_selector) 67 | .next() 68 | .ok_or(parse_error!("Cannot get date"))? 69 | .inner_html(); 70 | let date = &date_normalization(date.trim()); 71 | 72 | // decide (guess) which time format to use 73 | let (date_fmt, _) = guess_date_fmt(date); 74 | let naive_date; 75 | let timezone; 76 | if !date_fmt_has_timezone(&date_fmt) { 77 | naive_date = NaiveDateTime::parse_from_str(date, &date_fmt)?; 78 | timezone = None; 79 | } else { 80 | let date = DateTime::parse_from_str(date, &date_fmt)?; 81 | naive_date = date.naive_utc(); 82 | timezone = Some(date.offset().to_owned()); 83 | } 84 | 85 | items.push(ListItem::new( 86 | href, 87 | name, 88 | type_, 89 | { 90 | if size == "-" { 91 | None 92 | } else { 93 | let (n_size, unit) = FileSize::get_humanized(size); 94 | Some(FileSize::HumanizedBinary(n_size, unit)) 95 | } 96 | }, 97 | naive_date, 98 | timezone, 99 | )); 100 | } 101 | 102 | Ok(ListResult::List(items)) 103 | } 104 | } 105 | 106 | #[cfg(test)] 107 | mod tests { 108 | use chrono::FixedOffset; 109 | use test_log::test; 110 | 111 | use crate::listing::SizeUnit; 112 | 113 | use super::*; 114 | use crate::parser::tests::*; 115 | 116 | #[test] 117 | fn test_gradle() { 118 | let context = init_client(); 119 | let items = GradleListingParser::default() 120 | .get_list( 121 | &context, 122 | 
&url::Url::parse("http://localhost:1921/gradle").unwrap(), 123 | ) 124 | .unwrap(); 125 | match items { 126 | ListResult::List(items) => { 127 | assert_eq!(items.len(), 64); 128 | assert_eq!(items[0].name, "gradle-8.10-wrapper.jar.sha256"); 129 | assert_eq!(items[0].type_, FileType::File); 130 | assert_eq!( 131 | items[0].size, 132 | Some(FileSize::HumanizedBinary(64.0, SizeUnit::B)) 133 | ); 134 | assert_eq!( 135 | items[0].mtime, 136 | NaiveDateTime::parse_from_str("14-Aug-2024 11:18", "%d-%b-%Y %H:%M").unwrap() 137 | ); 138 | assert_eq!(items[0].timezone, FixedOffset::east_opt(0),); 139 | } 140 | _ => unreachable!(), 141 | } 142 | } 143 | } 144 | -------------------------------------------------------------------------------- /fixtures/wine-builds/index.html: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | Index of /wine-builds 6 | 7 | 8 | 9 | 10 |
    11 | 12 |
    13 |
    14 | WineHQ 15 |
    16 | 17 |
    Run Windows applications on Linux, BSD, Solaris and Mac OS X.
    18 | 19 | 24 | 25 |
    26 | 33 |
    34 | 35 |
    36 |
    37 | 38 | 39 |

    Wine Download Server

    40 | 41 |
    42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 |
    [ICO]NameLast modifiedSize
    [PARENTDIR]Parent Directory  -
    [DIR]android/2022-01-18 15:14 -
    [DIR]debian/2019-01-07 19:52 -
    [DIR]fedora/2023-04-20 14:52 -
    [DIR]macosx/2017-03-30 15:49 -
    [DIR]mageia/2017-09-29 23:46 -
    [DIR]ubuntu/2019-01-03 09:20 -
    [   ]Release.key2017-03-28 14:54 3.0K
    [   ]winehq.key2018-12-19 08:07 3.1K
    55 |
    56 |
    57 | 58 |
    59 | 60 |
    61 |
    62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 77 | 78 | 82 | 83 | -------------------------------------------------------------------------------- /fixtures/influxdata/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | InfluxData - Package Repository 6 | 18 | 19 | 20 | 21 |
    22 |

    23 | InfluxData - Package Repository 24 |

    25 |
    26 |
    27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 37 | 38 | 39 | 40 | 41 | 44 | 45 | 46 | 47 | 48 | 51 | 52 | 53 | 54 | 55 | 58 | 59 | 60 | 61 | 62 | 65 | 66 | 67 | 68 | 69 | 72 | 73 | 74 | 75 | 76 | 79 | 80 | 81 | 82 | 83 | 86 | 87 | 88 | 89 | 90 | 93 | 94 | 95 | 96 | 97 | 100 | 101 | 102 | 103 | 104 | 107 | 108 | 109 | 110 |
    FilenameLast ModifiedSize
    35 | centos/ 36 | --
    42 | debian/ 43 | --
    49 | packages/ 50 | --
    56 | rhel/ 57 | --
    63 | stable/ 64 | --
    70 | ubuntu/ 71 | --
    77 | influxdata-archive.key 78 | 2023-01-26 21:11:34+00:003935
    84 | influxdata-archive_compat.key 85 | 2023-01-26 21:11:34+00:001684
    91 | influxdb.key 92 | 2023-09-11 19:24:29+00:003906
    98 | influxdb2.key 99 | 2023-01-26 21:11:34+00:003902
    105 | packages.db 106 | 2025-06-27 06:37:22+00:00462848
    111 |
    112 |

    Instructions

    113 |

    114 | This package repo contains RPM and DEB builds of the InfluxData products, 115 | including InfluxDB, Telegraf, Chronograf, and Kapacitor. Below are 116 | instructions on how to add this package repo to both RPM and DEB based 117 | systems. 118 |

    119 |

    DEB Repo

    120 |

    For DEB-based platforms (e.g. Ubuntu, Debian):

    121 |
    
    122 |   wget -q https://repos.influxdata.com/influxdata-archive_compat.key
    123 |   echo '393e8779c89ac8d958f81f942f9ad7fb82a25e133faddaf92e15b16e6ac9ce4c influxdata-archive_compat.key' | sha256sum -c && cat influxdata-archive_compat.key | gpg --dearmor | sudo tee /etc/apt/trusted.gpg.d/influxdata-archive_compat.gpg > /dev/null
    124 |   echo 'deb [signed-by=/etc/apt/trusted.gpg.d/influxdata-archive_compat.gpg] https://repos.influxdata.com/debian stable main' | sudo tee /etc/apt/sources.list.d/influxdata.list
    125 |     
    126 |

    RPM Repo

    127 |

    For RPM-based platforms (e.g. RHEL, CentOS):

    128 |
    
    129 |   cat <<EOF | sudo tee /etc/yum.repos.d/influxdata.repo
    130 |   [influxdata]
    131 |   name = InfluxData Repository - Stable
    132 |   baseurl = https://repos.influxdata.com/stable/\$basearch/main
    133 |   enabled = 1
    134 |   gpgcheck = 1
    135 |   gpgkey = https://repos.influxdata.com/influxdata-archive_compat.key
    136 |   EOF
    137 |     
    138 |

    GPG Keys

    139 |

    140 | There are currently two GPG keys users can use. The preferred key is 141 | influxdata-archive.key. However, if users are running on an 142 | older distribution (e.g. CentOS/RHEL 7, Ubuntu 18.04 LTS, or Debian 143 | Buster), then the influxdata-archive_compat.key is required 144 | for use. This is due to older versions of APT and RPM that do not support 145 | subkeys for verification. 146 |

    147 |
      148 |
    • influxdata-archive.key: preferred key
    • 149 |
    • influxdata-archive_compat.key: preferred key for older distribution
    • 150 |
    • influxdb.key: deprecated, do not use
    • 151 |
    • influxdb2.key: deprecated, do not use
    • 152 |
    153 | 159 | 160 | 161 | -------------------------------------------------------------------------------- /fixtures/nodejs/latest-jod/index.html: -------------------------------------------------------------------------------- 1 | Index of /dist/latest-jod/

    Index of /dist/latest-jod/


     2 | ../
     3 | docs/                                                             -                   -
     4 | win-arm64/                                                        -                   -
     5 | win-x64/                                                          -                   -
     6 | win-x86/                                                          -                   -
     7 | SHASUMS256.txt.asc                                 24 Sept 2025, 13:12               4.7 KB
     8 | SHASUMS256.txt.sig                                 24 Sept 2025, 13:12                566 B
     9 | SHASUMS256.txt                                     24 Sept 2025, 13:10               3.8 KB
    10 | node-v22.20.0-aix-ppc64.tar.gz                     24 Sept 2025, 13:10                72 MB
    11 | node-v22.20.0-arm64.msi                            24 Sept 2025, 13:10                28 MB
    12 | node-v22.20.0-darwin-arm64.tar.gz                  24 Sept 2025, 13:10                50 MB
    13 | node-v22.20.0-darwin-arm64.tar.xz                  24 Sept 2025, 13:10                26 MB
    14 | node-v22.20.0-darwin-x64.tar.gz                    24 Sept 2025, 13:10                51 MB
    15 | node-v22.20.0-darwin-x64.tar.xz                    24 Sept 2025, 13:10                27 MB
    16 | node-v22.20.0-headers.tar.gz                       24 Sept 2025, 13:10                10 MB
    17 | node-v22.20.0-headers.tar.xz                       24 Sept 2025, 13:10               567 KB
    18 | node-v22.20.0-linux-arm64.tar.gz                   24 Sept 2025, 13:10                56 MB
    19 | node-v22.20.0-linux-arm64.tar.xz                   24 Sept 2025, 13:10                30 MB
    20 | node-v22.20.0-linux-armv7l.tar.gz                  24 Sept 2025, 13:10                51 MB
    21 | node-v22.20.0-linux-armv7l.tar.xz                  24 Sept 2025, 13:10                26 MB
    22 | node-v22.20.0-linux-ppc64le.tar.gz                 24 Sept 2025, 13:10                59 MB
    23 | node-v22.20.0-linux-ppc64le.tar.xz                 24 Sept 2025, 13:10                31 MB
    24 | node-v22.20.0-linux-s390x.tar.gz                   24 Sept 2025, 13:10                57 MB
    25 | node-v22.20.0-linux-s390x.tar.xz                   24 Sept 2025, 13:10                29 MB
    26 | node-v22.20.0-linux-x64.tar.gz                     24 Sept 2025, 13:10                57 MB
    27 | node-v22.20.0-linux-x64.tar.xz                     24 Sept 2025, 13:10                31 MB
    28 | node-v22.20.0-win-arm64.7z                         24 Sept 2025, 13:10                20 MB
    29 | node-v22.20.0-win-arm64.zip                        24 Sept 2025, 13:10                31 MB
    30 | node-v22.20.0-win-x64.7z                           24 Sept 2025, 13:10                22 MB
    31 | node-v22.20.0-win-x64.zip                          24 Sept 2025, 13:10                36 MB
    32 | node-v22.20.0-win-x86.7z                           24 Sept 2025, 13:10                20 MB
    33 | node-v22.20.0-win-x86.zip                          24 Sept 2025, 13:10                33 MB
    34 | node-v22.20.0-x64.msi                              24 Sept 2025, 13:10                31 MB
    35 | node-v22.20.0-x86.msi                              24 Sept 2025, 13:10                29 MB
    36 | node-v22.20.0.pkg                                  24 Sept 2025, 13:10                88 MB
    37 | node-v22.20.0.tar.gz                               24 Sept 2025, 13:10               105 MB
    38 | node-v22.20.0.tar.xz                               24 Sept 2025, 13:10                50 MB
    39 | 

    -------------------------------------------------------------------------------- /fixtures/nodejs/v4.9.1/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Index of /dist/v4.9.1/ 4 | 18 | 19 | 20 |

    Index of /dist/v4.9.1/


    ../
    21 | docs/                                                             -                   -
    22 | win-x64/                                                          -                   -
    23 | win-x86/                                                          -                   -
    24 | SHASUMS256.txt.asc                                 04-Nov-2024 17:40               4.1 KB
    25 | SHASUMS256.txt.sig                                 04-Nov-2024 17:40                310 B
    26 | SHASUMS256.txt                                     04-Nov-2024 17:40               3.6 KB
    27 | node-v4.9.1-darwin-x64.tar.gz                      30-Oct-2024 18:21                10 MB
    28 | node-v4.9.1-darwin-x64.tar.xz                      04-Nov-2024 17:40               7.1 MB
    29 | node-v4.9.1-headers.tar.gz                         04-Nov-2024 17:40               471 KB
    30 | node-v4.9.1-headers.tar.xz                         04-Nov-2024 17:40               342 KB
    31 | node-v4.9.1-linux-arm64.tar.gz                     30-Oct-2024 18:21                12 MB
    32 | node-v4.9.1-linux-arm64.tar.xz                     04-Nov-2024 17:40               7.7 MB
    33 | node-v4.9.1-linux-armv6l.tar.gz                    30-Oct-2024 18:21                11 MB
    34 | node-v4.9.1-linux-armv6l.tar.xz                    04-Nov-2024 17:40               7.3 MB
    35 | node-v4.9.1-linux-armv7l.tar.gz                    30-Oct-2024 18:21                11 MB
    36 | node-v4.9.1-linux-armv7l.tar.xz                    04-Nov-2024 17:40               7.3 MB
    37 | node-v4.9.1-linux-ppc64.tar.gz                     30-Oct-2024 18:21                12 MB
    38 | node-v4.9.1-linux-ppc64.tar.xz                     04-Nov-2024 17:40               7.5 MB
    39 | node-v4.9.1-linux-ppc64le.tar.gz                   30-Oct-2024 18:21                12 MB
    40 | node-v4.9.1-linux-ppc64le.tar.xz                   04-Nov-2024 17:40               7.6 MB
    41 | node-v4.9.1-linux-x64.tar.gz                       30-Oct-2024 18:21                12 MB
    42 | node-v4.9.1-linux-x64.tar.xz                       04-Nov-2024 17:40               8.2 MB
    43 | node-v4.9.1-linux-x86.tar.gz                       30-Oct-2024 18:21                12 MB
    44 | node-v4.9.1-linux-x86.tar.xz                       04-Nov-2024 17:40               7.8 MB
    45 | node-v4.9.1-sunos-x64.tar.gz                       30-Oct-2024 18:21                13 MB
    46 | node-v4.9.1-sunos-x64.tar.xz                       30-Oct-2024 18:21               8.4 MB
    47 | node-v4.9.1-sunos-x86.tar.gz                       30-Oct-2024 18:21                12 MB
    48 | node-v4.9.1-sunos-x86.tar.xz                       04-Nov-2024 17:40               7.7 MB
    49 | node-v4.9.1-win-x64.7z                             04-Nov-2024 17:40               6.1 MB
    50 | node-v4.9.1-win-x64.zip                            30-Oct-2024 18:21                11 MB
    51 | node-v4.9.1-win-x86.7z                             04-Nov-2024 17:40               5.4 MB
    52 | node-v4.9.1-win-x86.zip                            30-Oct-2024 18:21               9.6 MB
    53 | node-v4.9.1-x64.msi                                30-Oct-2024 18:21                11 MB
    54 | node-v4.9.1-x86.msi                                30-Oct-2024 18:21              10.0 MB
    55 | node-v4.9.1.pkg                                    30-Oct-2024 18:21                13 MB
    56 | node-v4.9.1.tar.gz                                 30-Oct-2024 18:21                23 MB
    57 | node-v4.9.1.tar.xz                                 30-Oct-2024 18:21                13 MB
    58 | 

    59 | 60 | -------------------------------------------------------------------------------- /tsumugu-parser/src/parser/lighttpd.rs: -------------------------------------------------------------------------------- 1 | use crate::listing::{FileSize, FileType, ListItem}; 2 | use chrono::NaiveDateTime; 3 | use scraper::{Html, Selector}; 4 | // use tracing::debug; 5 | 6 | use tsumugu_net::client::HttpClient; 7 | 8 | use super::*; 9 | use anyhow::{Result, anyhow}; 10 | 11 | #[derive(Debug, Clone, Default)] 12 | pub struct LighttpdListingParser; 13 | 14 | impl Parser for LighttpdListingParser { 15 | fn name(&self) -> &'static str { 16 | "Lighttpd" 17 | } 18 | 19 | fn get_list(&self, client: &dyn HttpClient, url: &url::Url) -> Result { 20 | let resp = handle_net!(client.get_text(url))?; 21 | let url: &Url = &resp.final_url; 22 | assert_if_url_has_no_trailing_slash(url); 23 | let document = Html::parse_document(&resp.body); 24 | let selector = Selector::parse("tbody").unwrap(); 25 | let indexlist = document 26 | .select(&selector) 27 | .next() 28 | .ok_or_else(|| parse_error!("Cannot find
  • in order, instead of using class name, to improve compatibility for strange pages 33 | let td_selector = Selector::parse("td").unwrap(); 34 | let mut td_iterator = element.select(&td_selector); 35 | 36 | let td_a = match td_iterator.next() { 37 | Some(tda) => tda, 38 | None => { 39 | warn!("Cannot find in this