├── rustfmt.toml ├── .gitignore ├── .pre-commit ├── examples ├── simple.rs └── perf_data.rs ├── Cargo.toml ├── src ├── timeout.rs ├── err.rs ├── config.rs ├── upload.rs ├── test.rs ├── lib.rs └── list_actions.rs └── README.md /rustfmt.toml: -------------------------------------------------------------------------------- 1 | edition = "2021" 2 | imports_granularity = "Module" 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | **/*.rs.bk 3 | Cargo.lock 4 | rusty-tags.vi 5 | -------------------------------------------------------------------------------- /.pre-commit: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env sh 2 | 3 | cargo fmt -- --check 4 | code="$?" 5 | 6 | if [ "$code" -ne 0 ]; then 7 | echo "Please run \`cargo fmt' before committing" 8 | exit "$code" 9 | fi 10 | -------------------------------------------------------------------------------- /examples/simple.rs: -------------------------------------------------------------------------------- 1 | use s3_algo::*; 2 | 3 | #[tokio::main] 4 | async fn main() { 5 | const N_FILES: usize = 10; 6 | let files = 7 | (0..N_FILES).map(|i| ObjectSource::data(format!("hey, {}", i), format!("hey{}", i))); 8 | let s3 = S3Algo::new(testing_sdk_client().await); 9 | s3.upload_files( 10 | "test-bucket".into(), 11 | files, 12 | |result| async move { println!("File {}/{} successfully uploaded", result.seq + 1, N_FILES)}, 13 | |client| client.put_object() 14 | ) 15 | .await 16 | .unwrap(); 17 | } 18 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "s3-algo" 3 | description = "High-performance algorithms for batch operations to Amazon S3" 4 | version = "0.7.0" 5 | authors = ["Erlend Langseth <3rlendhl@gmail.com>"] 6 | license = "MIT" 7 | edition = "2018" 8 | 9 | documentation = "https://docs.rs/s3-algo/" 10 | repository = "https://github.com/openanalytics/s3-algo" 11 | 12 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 13 | 14 | [dependencies] 15 | futures = "0.3.17" 16 | futures-stopwatch = "0.3.0" 17 | futures-retry = "0.6.0" 18 | tokio = {version = "1.10", features = ["time", "fs", "macros", "io-util", "sync", "rt-multi-thread"]} 19 | tokio-util = {version = "0.7.0", features = ["codec"]} 20 | bytes = "1.2.1" 21 | serde = {optional = true, version = "1.0.130", features = ["derive"]} 22 | snafu = {version = "0.6.1", features = ["futures"]} 23 | walkdir = "2.2.9" 24 | 25 | aws-sdk-s3 = "1.14.0" 26 | aws-config = "1.1.4" 27 | aws-smithy-http = "0.60.4" 28 | aws-smithy-async = "1.1.4" 29 | aws-smithy-types-convert = {version = "0.60.4", features = ["convert-streams"]} 30 | 31 | [target.'cfg(windows)'.dependencies] 32 | # only needed on windows for substituting \ with / in paths 33 | path-slash = "0.2.1" 34 | 35 | [dev-dependencies] 36 | tempdir = "0.3.7" 37 | multi-default-trait-impl = "0.1.2" 38 | rand = "0.8.5" 39 | clap = "3.0.0" 40 | 41 | [features] 42 | default = ["serde1"] 43 | serde1 = ["serde"] 44 | -------------------------------------------------------------------------------- /src/timeout.rs: -------------------------------------------------------------------------------- 1 | //! 
The `Timeout` trait defines the how the timeout value of a multi-file upload evolves based on 2 | //! past file upload results. A default implementation `TimeoutState` is provided. 3 | use crate::config::*; 4 | use crate::RequestReport; 5 | use std::time::Duration; 6 | pub trait Timeout: Send + 'static { 7 | /// Size is in either bytes or objects, depending on the type of requests. 8 | fn get_timeout(&self, size: usize, retries: usize) -> Duration; 9 | /// Update the internal estimate of the extra timeout per unit of size 10 | fn update(&mut self, _: &RequestReport); 11 | /// get estimated upload speed 12 | fn get_estimate(&self) -> f64; 13 | } 14 | /// State for timeouts, especially tailored toward uploading files. 15 | /// But can be useful in any case where the size of an operation in bytes is known. 16 | pub struct TimeoutState { 17 | seconds_per_unit_estimate: f64, 18 | cfg: AlgorithmConfig, 19 | specific: SpecificTimings, 20 | } 21 | impl TimeoutState { 22 | pub fn new(cfg: AlgorithmConfig, specific: SpecificTimings) -> TimeoutState { 23 | TimeoutState { 24 | seconds_per_unit_estimate: specific.seconds_per_unit, 25 | cfg, 26 | specific, 27 | } 28 | } 29 | } 30 | impl Timeout for TimeoutState { 31 | /// Not used by algorithm 32 | fn get_estimate(&self) -> f64 { 33 | self.seconds_per_unit_estimate 34 | } 35 | fn get_timeout(&self, size: usize, retries: usize) -> Duration { 36 | let backoff = self.cfg.backoff.powi(retries as i32); 37 | let time_estimate = (size as f64) * self.seconds_per_unit_estimate * backoff; 38 | Duration::from_secs_f64( 39 | self.cfg.base_timeout * backoff + self.cfg.timeout_fraction * time_estimate, 40 | ) 41 | } 42 | fn update(&mut self, result: &RequestReport) { 43 | if result.size > self.specific.minimum_units_for_estimation { 44 | let target = result.success_time.as_secs_f64() / (result.size as f64); 45 | self.seconds_per_unit_estimate = self.cfg.avg_power * self.seconds_per_unit_estimate 46 | + (1.0 - self.cfg.avg_power) * target; 47 | } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /examples/perf_data.rs: -------------------------------------------------------------------------------- 1 | use clap::*; 2 | use s3_algo::*; 3 | use std::io::Write; 4 | use std::path::{Path, PathBuf}; 5 | 6 | #[tokio::main] 7 | async fn main() { 8 | let mut app = App::new("Example 'perf_data'") 9 | .before_help("Upload a directory to S3 on localhost.") 10 | .arg( 11 | Arg::with_name("source") 12 | .help("Path to a folder to upload to S3") 13 | .required(true), 14 | ) 15 | .arg( 16 | Arg::with_name("dest_bucket") 17 | .help("Destination bucket") 18 | .required(true), 19 | ) 20 | .arg( 21 | Arg::with_name("dest_prefix") 22 | .help("Destination prefix") 23 | .required(true), 24 | ) 25 | .arg( 26 | Arg::with_name("parallelization") 27 | .short('n') 28 | .takes_value(true) 29 | .help("Maximum number of simultaneous upload requests"), 30 | ); 31 | let matches = app.clone().get_matches(); 32 | 33 | if let (Some(path), Some(bucket), Some(prefix)) = ( 34 | matches.value_of("source"), 35 | matches.value_of("dest_bucket"), 36 | matches.value_of("dest_prefix"), 37 | ) { 38 | let parallelization = value_t_or_exit!(matches.value_of("parallelization"), usize); 39 | benchmark_s3_upload( 40 | Path::new(path).to_path_buf(), 41 | bucket.to_owned(), 42 | prefix.to_owned(), 43 | parallelization, 44 | ) 45 | .await; 46 | println!("Done"); 47 | } else { 48 | app.print_help().unwrap() 49 | } 50 | } 51 | 52 | async fn benchmark_s3_upload( 53 | 
dir_path: PathBuf, 54 | bucket: String, 55 | prefix: String, 56 | copy_parallelization: usize, 57 | ) { 58 | let cfg = Config { 59 | copy_parallelization, 60 | ..Default::default() 61 | }; 62 | let s3 = testing_sdk_client().await; 63 | let algo = S3Algo::with_config(s3, cfg); 64 | 65 | upload_perf_log_init(&mut std::io::stdout()); 66 | let progress = |res| async move { upload_perf_log_update(&mut std::io::stdout(), res) }; 67 | 68 | algo.upload_files( 69 | bucket, 70 | files_recursive(dir_path, PathBuf::from(&prefix)), 71 | progress, 72 | |client| client.put_object(), 73 | ) 74 | .await 75 | .unwrap(); 76 | } 77 | 78 | // Helpers for writing data 79 | macro_rules! write_cell { 80 | ($out:expr, $x:expr) => { 81 | let _ = write!($out, "{0: >18}", format!("{:.5}", $x)); 82 | }; 83 | } 84 | pub fn upload_perf_log_init(out: &mut W) { 85 | let _ = writeln!( 86 | out, 87 | "{0: >w$}{1: >w$}{2: >w$}{3: >w$}{4: >w$}{5: >w$}", 88 | "attempts", 89 | "bytes", 90 | "success_ms", 91 | "total_ms", 92 | "MBps", 93 | "MBps est", 94 | w = 18 95 | ); 96 | } 97 | pub fn upload_perf_log_update(out: &mut W, res: RequestReport) { 98 | // TODO: Write performance data to file with tokio 99 | let megabytes = res.size as f64 / 1_000_000.0; 100 | let speed = megabytes / res.success_time.as_secs_f64(); 101 | write_cell!(out, res.attempts); 102 | write_cell!(out, res.size); 103 | write_cell!(out, res.success_time.as_millis()); 104 | write_cell!(out, res.total_time.as_millis()); 105 | write_cell!(out, speed); 106 | write_cell!(out, res.est); 107 | let _ = writeln!(out); 108 | } 109 | -------------------------------------------------------------------------------- /src/err.rs: -------------------------------------------------------------------------------- 1 | use aws_sdk_s3::error::SdkError; 2 | use aws_sdk_s3::operation::copy_object::CopyObjectError; 3 | use aws_sdk_s3::operation::delete_object::DeleteObjectError; 4 | use aws_sdk_s3::operation::delete_objects::DeleteObjectsError; 5 | use aws_sdk_s3::operation::get_object::GetObjectError; 6 | use aws_sdk_s3::operation::list_objects_v2::ListObjectsV2Error; 7 | use aws_sdk_s3::operation::put_object::PutObjectError; 8 | use aws_sdk_s3::primitives::ByteStreamError; 9 | use snafu::{Backtrace, Snafu}; 10 | use std::io; 11 | 12 | #[derive(Snafu, Debug)] 13 | #[snafu(visibility = "pub")] 14 | pub enum Error { 15 | #[snafu(display("Io error: {}: {}", description, source))] 16 | Io { 17 | source: io::Error, 18 | description: String, 19 | backtrace: Backtrace, 20 | }, 21 | /// Error originating from tokio::Delay 22 | #[snafu(display("Tokio timer error: {}", source))] 23 | Delay { 24 | source: tokio::time::error::Error, 25 | backtrace: Backtrace, 26 | }, 27 | #[snafu(display("S3 operation timed out"))] 28 | Timeout { 29 | source: tokio::time::error::Elapsed, 30 | }, 31 | #[snafu(display("Error listing objects in S3: {:?}", source))] 32 | ListObjectsV2 { 33 | source: SdkError, 34 | }, 35 | #[snafu(display("Error deleting objects in S3: {:?}", source))] 36 | DeleteObjects { 37 | source: SdkError, 38 | }, 39 | DeleteObject { 40 | source: SdkError, 41 | }, 42 | CopyObject { 43 | source: SdkError, 44 | }, 45 | #[snafu(display("GetObject s3://{}/{}: {:#?}", bucket, key, source))] 46 | GetObject { 47 | key: String, 48 | bucket: String, 49 | source: SdkError, 50 | }, 51 | #[snafu(display("IO error: {}", source))] 52 | TokioIo { 53 | source: tokio::io::Error, 54 | }, 55 | AnyError { 56 | source: Box, 57 | }, 58 | 59 | #[snafu(display("Downloading objects: missing key or size property"))] 60 
| MissingKeyOrSize, 61 | #[snafu(display("Downloading objects: missing content_length property"))] 62 | MissingContentLength, 63 | 64 | // AWS SDK Errors 65 | #[snafu(display("S3 'put object' error on key '{}': {}", key, source))] 66 | PutObject { 67 | source: SdkError, 68 | key: String, 69 | backtrace: Backtrace, 70 | }, 71 | 72 | #[snafu(display("Error listing objects in S3: {:?}", source))] 73 | NewListObjectsV2 { 74 | source: SdkError, 75 | }, 76 | 77 | #[snafu(display("Error deleting objects in S3: {:?}", source))] 78 | NewDeleteObjects { 79 | source: SdkError, 80 | }, 81 | NewDeleteObject { 82 | source: SdkError, 83 | }, 84 | NewCopyObject { 85 | source: SdkError, 86 | }, 87 | #[snafu(display("GetObject s3://{}/{}: {:#?}", bucket, key, source))] 88 | NewGetObject { 89 | key: String, 90 | bucket: String, 91 | source: SdkError, 92 | }, 93 | } 94 | 95 | impl From> for Error 96 | where 97 | T: std::error::Error + Send + Sync + 'static, 98 | { 99 | fn from(err: SdkError) -> Self { 100 | Self::AnyError { 101 | source: Box::new(err), 102 | } 103 | } 104 | } 105 | 106 | impl From for Error { 107 | fn from(err: ByteStreamError) -> Self { 108 | Self::AnyError { 109 | source: Box::new(err), 110 | } 111 | } 112 | } 113 | 114 | #[cfg(test)] 115 | mod test { 116 | use super::*; 117 | use snafu::GenerateBacktrace; 118 | #[test] 119 | fn error_traits() { 120 | fn foo(_: T) {} 121 | foo(Error::Io { 122 | source: io::Error::from_raw_os_error(1), 123 | description: "hello".into(), 124 | backtrace: Backtrace::generate(), 125 | }); 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /src/config.rs: -------------------------------------------------------------------------------- 1 | use serde::{Deserialize, Serialize}; 2 | #[derive(Clone, Debug, Serialize, Deserialize)] 3 | #[serde(default)] 4 | #[serde(deny_unknown_fields)] 5 | pub struct Config { 6 | /// Maximum number of simultaneous upload requests 7 | pub copy_parallelization: usize, 8 | 9 | pub algorithm: AlgorithmConfig, 10 | 11 | /// The "unit" of a delete request is number of objects 12 | pub delete_requests: SpecificTimings, 13 | 14 | /// NOTE: For now, `put_request` is used both in S3 `get`, `put` and `copy` operations. 15 | /// Reason: We don't know if it's worth it with different configurations for these operations 16 | /// that all have a duration that depends on the number of bytes of the objects in question. 17 | /// The "unit" for such requests are number of bytes. 
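    ///
    /// A sketch of overriding these timings when building a `Config` (the numbers below are
    /// illustrative assumptions, not recommended values):
    ///
    /// ```no_run
    /// use s3_algo::{Config, SpecificTimings};
    ///
    /// let cfg = Config {
    ///     put_requests: SpecificTimings {
    ///         seconds_per_unit: 1.0 / 10_000_000.0, // assume roughly 10 MB/s
    ///         minimum_units_for_estimation: 100_000, // ignore requests smaller than 100 KB
    ///     },
    ///     ..Default::default()
    /// };
    /// ```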
18 |     pub put_requests: SpecificTimings,
19 | }
20 | 
21 | impl Default for Config {
22 |     fn default() -> Self {
23 |         Self {
24 |             copy_parallelization: 20,
25 |             algorithm: Default::default(),
26 |             delete_requests: SpecificTimings {
27 |                 seconds_per_unit: 0.2,
28 |                 minimum_units_for_estimation: 10,
29 |             },
30 |             put_requests: SpecificTimings {
31 |                 seconds_per_unit: 1.0 / 1_000_000.0, // 1 MB/s = 1e-06 seconds per byte
32 |                 minimum_units_for_estimation: 10,
33 |             },
34 |         }
35 |     }
36 | }
37 | 
38 | #[derive(Clone, Debug, Serialize, Deserialize)]
39 | #[serde(deny_unknown_fields)]
40 | pub struct AlgorithmConfig {
41 |     /// The base timeout which will always be there (an estimate of the RTT)
42 |     pub base_timeout: f64,
43 | 
44 |     /// The timeout is set to this multiple of the expected upload time (> 1.0)
45 |     pub timeout_fraction: f64,
46 | 
47 |     /// On every retry, the timeout is multiplied by backoff (> 1.0)
48 |     pub backoff: f64,
49 | 
50 |     /// Number of times to retry a single request before giving up
51 |     pub n_retries: usize,
52 | 
53 |     /// To estimate the upload speed incrementally, we use an exponential average:
54 |     /// `new_avg_speed = avg_power * new_speed + (1 - avg_power) * avg_speed`.
55 |     ///
56 |     /// Thus, between 0.0 and 1.0, closer to 1.0 means that newer data points have
57 |     /// more significance.
58 |     pub avg_power: f64,
59 | }
60 | impl Default for AlgorithmConfig {
61 |     fn default() -> Self {
62 |         Self {
63 |             base_timeout: 0.5,
64 |             timeout_fraction: 1.5,
65 |             backoff: 1.5,
66 |             n_retries: 8,
67 |             avg_power: 0.7,
68 |         }
69 |     }
70 | }
71 | 
72 | /// These settings are specific to the kind of operation we do - for example delete or put in S3.
73 | #[derive(Clone, Debug, Serialize, Deserialize)]
74 | pub struct SpecificTimings {
75 |     /// The initial estimate of extra timeout per unit (byte or object)
76 |     pub seconds_per_unit: f64,
77 |     /// The number of units in a request below which the request does not affect the estimate
78 |     pub minimum_units_for_estimation: usize,
79 | }
80 | 
81 | impl SpecificTimings {
82 |     /// Sane default setting for when the size is number of bytes
83 |     pub fn default_for_bytes() -> Self {
84 |         Self {
85 |             seconds_per_unit: 1.0 / 1_000_000.0, // 1 MB/s
86 |             minimum_units_for_estimation: 500_000, // 500 KB
87 |         }
88 |     }
89 |     /// Sane default setting for when the size is number of objects
90 |     pub fn default_for_objects() -> Self {
91 |         Self {
92 |             seconds_per_unit: 0.2,
93 |             minimum_units_for_estimation: 2,
94 |         }
95 |     }
96 | }
97 | 
98 | // DRAFT
99 | //
100 | // Now, we don't have "avg_min_bytes". Because... we will just subtract the assumed constant
101 | // anyway.
102 | // What if the assumption is wrong?
103 | // Well, it should be rather small anyway. It is exclusively thought to be the RTT...
104 | // If the subtraction is negative after all...? Then... idk
105 | 
106 | // put_timeout_per_byte..?
107 | // should configure it as an assumed MB/s just like before. expected_upload_speed.
108 | // delete_timeout_per_object..?
109 | // quite straight-forward: seconds per object
110 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # `s3-algo`
2 | 
3 | High-performance algorithms for batch operations in Amazon S3, on top of the [AWS SDK for Rust](https://github.com/awslabs/aws-sdk-rust) (`aws-sdk-s3`).
4 | Reliability and performance are achieved through a configurable timeout/retry/backoff algorithm, for high volumes of requests.
5 | Monitor progress closely with closures that get called for every finished request, for accurate user feedback.
6 | 
7 | 
8 | https://docs.aws.amazon.com/AmazonS3/latest/dev/optimizing-performance-guidelines.html
9 | 
10 | - Upload multiple files with `S3Algo::upload_files`.
11 | - List files with `S3Algo::list_prefix`, and then execute deletion (or download) on all the listed files.
12 | 
13 | This crate is only in its infancy, and we happily welcome PRs, feature requests, and suggestions for improving the API.
14 | 
15 | # Running tests and examples
16 | Both tests and examples require that an S3 service such as `minio` is running locally at port 9000.
17 | Tests assume that a credentials profile named `testing` exists - for example in `~/.aws/credentials`:
18 | 
19 | ```
20 | [testing]
21 | aws_access_key_id = 123456789
22 | aws_secret_access_key = 123456789
23 | ```
24 | 
25 | # Listing, deleting and copying objects
26 | This is all done through the entry point `S3Algo::list_prefix()`, which returns a `ListObjects`
27 | stream that can then delete or download the listed objects.
28 | Example:
29 | 
30 | ```rust
31 | algo.list_prefix("test-bucket".to_string(), Some("some/prefix".to_string()))
32 |     .delete_all(|_| async {}, |_| async {})
33 |     .await
34 |     .unwrap();
35 | ```
36 | 
37 | # Upload
38 | ## Features of the `S3Algo::upload_files` function
39 | * As generic as possible, to support many use cases.
40 | * It is possible to collect detailed data from the upload through a closure - one can choose to use this data to analyze performance, or for example to implement a live progress percentage report.
41 | * Backoff mechanism on failed requests.
42 | * Fast. Several mechanisms are in place, such as [aggressive timeouts](https://docs.aws.amazon.com/AmazonS3/latest/dev/optimizing-performance-guidelines.html), parallelization, and streaming files from the file system while uploading.
43 | 
44 | ## Algorithm details
45 | The documentation for `Config` and `AlgorithmConfig` may help illuminate the components of the algorithm.
46 | The currently most important aspect of the algorithm revolves around deciding timeout values. That is, how long to wait for a request before trying again.
47 | It is important for performance that the timeout is tight enough.
48 | The main mechanism to this end is the estimation of the upload bandwidth through a running exponential average of the upload speed (on success) of individual files.
49 | Additionally, on each successive retry, the timeout increases by some factor (back-off).
50 | 
51 | ## Yet to consider
52 | * Is the algorithm considerate with respect to other processes that want to use the same network? For example in the case of congestion. It does implement increasing back-off intervals after failed requests, but the real effect on a shared network should be tested.
53 | 
54 | 
55 | ## Examples
56 | ### `perf_data`
57 | Command-line interface for uploading any directory to any bucket and prefix in a locally running S3 service (such as `minio`).
58 | Example:
59 | ```
60 | cargo run --example perf_data -- -n 3 ./src test-bucket lala
61 | ```
62 | 
63 | Prints:
64 | ```
65 |           attempts             bytes        success_ms          total_ms              MBps          MBps est
66 |                  1              1990                32                32           0.06042           1.00000
67 |                  1             24943                33                33           0.74043           1.00000
68 |                  1              2383                29                29           0.08211           1.00000
69 |                  1               417                13                13           0.03080           1.00000
70 |                  1              8562                16                16           0.51480           1.00000
71 | ```
72 | `total_ms` is the total time including all retries, and `success_ms` is the time of only the last attempt.
73 | The distinction between these two is useful in real cases where `attempts` is not always `1`.
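In code, the `-n` flag corresponds to `Config::copy_parallelization`. A rough sketch of the same upload done programmatically (the bucket, prefix and path below are just placeholders mirroring the command above):

```rust
use s3_algo::*;

#[tokio::main]
async fn main() {
    // At most 3 uploads in flight at once, like `-n 3` above.
    let cfg = Config {
        copy_parallelization: 3,
        ..Default::default()
    };
    let algo = S3Algo::with_config(testing_sdk_client().await, cfg);
    algo.upload_files(
        "test-bucket".into(),
        files_recursive("./src".into(), "lala".into()),
        |_| async {},
        |client| client.put_object(),
    )
    .await
    .unwrap();
}
```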
74 | 
75 | You can then verify that the upload happened by entering the container. Something like:
76 | 
77 | ```
78 | $ docker exec -it $(docker ps --filter "ancestor=minio" --format "{{.Names}}") bash
79 | [user@144aff4dae5b ~]$ ls s3/
80 | test-bucket/
81 | [user@144aff4dae5b ~]$ ls s3/test-bucket/
82 | lala
83 | ```
84 | 
85 | 
86 | 
--------------------------------------------------------------------------------
/src/upload.rs:
--------------------------------------------------------------------------------
1 | use super::*;
2 | use aws_sdk_s3::operation::put_object::builders::PutObjectFluentBuilder;
3 | use aws_sdk_s3::primitives::ByteStream;
4 | 
5 | impl S3Algo {
6 |     /// Upload multiple files to S3.
7 |     ///
8 |     /// `upload_files` provides counting of uploaded files and bytes through the `progress` closure.
9 |     ///
10 |     /// For common use cases it is advised to use [`files_recursive`](files_recursive) for the `files` parameter.
11 |     ///
12 |     /// `progress` will be called after the upload of each file, with a [`RequestReport`](struct.RequestReport.html)
13 |     /// describing that upload: its `seq` field is the sequence number of the file within the whole
14 |     /// upload, and it also holds data such as the size in bytes and the duration of the upload. It is thus
15 |     /// possible to report progress either in number of files or in number of bytes, depending on what
16 |     /// granularity is desired.
17 |     /// `progress` returns a generic `F: Future` to support async operations such as, for example, logging the
18 |     /// results to a file; this future will be run as part of the upload algorithm.
19 |     ///
20 |     /// `default_request` constructs the default request builder - only the fields `bucket`, `key`,
21 |     /// `body` and `content_length` are overwritten by the upload algorithm.
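    ///
    /// A minimal usage sketch, mirroring `examples/simple.rs` (the bucket name and keys are
    /// placeholders, and `testing_sdk_client` assumes the local test setup described in the README):
    ///
    /// ```no_run
    /// # async fn doc() -> Result<(), s3_algo::Error> {
    /// use s3_algo::{testing_sdk_client, ObjectSource, S3Algo};
    /// let algo = S3Algo::new(testing_sdk_client().await);
    /// let files =
    ///     (0..10).map(|i| ObjectSource::data(format!("contents {}", i), format!("prefix/{}", i)));
    /// algo.upload_files(
    ///     "test-bucket".into(),
    ///     files,
    ///     |report| async move { println!("uploaded {} bytes", report.size) },
    ///     |client| client.put_object(),
    /// )
    /// .await
    /// # }
    /// ```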
22 | pub async fn upload_files( 23 | &self, 24 | bucket: String, 25 | files: I, 26 | progress: P, 27 | default_request: R, 28 | ) -> Result<(), Error> 29 | where 30 | P: Fn(RequestReport) -> F + Clone + Send + Sync + 'static, 31 | F: Future + Send + 'static, 32 | I: Iterator + Send + 'static, 33 | R: Fn(&Client) -> PutObjectFluentBuilder + Clone + Unpin + Sync + Send + 'static, 34 | { 35 | let copy_parallelization = self.config.copy_parallelization; 36 | let n_retries = self.config.algorithm.n_retries; 37 | 38 | let timeout_state = Arc::new(Mutex::new(TimeoutState::new( 39 | self.config.algorithm.clone(), 40 | self.config.put_requests.clone(), 41 | ))); 42 | let timeout_state2 = timeout_state.clone(); 43 | 44 | let jobs = files.map(move |src| { 45 | let (default, bucket, s3) = (default_request.clone(), bucket.clone(), self.s3.clone()); 46 | s3_request( 47 | move || { 48 | src.clone() 49 | .create_upload_future(s3.clone(), bucket.clone(), default.clone()) 50 | }, 51 | |_, size| size, 52 | n_retries, 53 | timeout_state.clone(), 54 | ) 55 | .boxed() 56 | }); 57 | 58 | // Run jobs in parallel, 59 | // adding eventual delays after each file upload and also at the end, 60 | // and counting the progress 61 | stream::iter(jobs) 62 | .buffer_unordered(copy_parallelization) 63 | .zip(stream::iter(0..)) 64 | .map(|(result, i)| result.map(|result| (i, result))) 65 | .try_for_each(move |(i, (mut result, _))| { 66 | let progress = progress.clone(); 67 | let timeout_state = timeout_state2.clone(); 68 | async move { 69 | result.seq = i; 70 | timeout_state.lock().await.update(&result); 71 | progress(result).map(Ok).await 72 | } 73 | }) 74 | .await 75 | } 76 | } 77 | 78 | #[derive(Clone, Debug)] 79 | pub enum ObjectSource { 80 | File { path: PathBuf, key: String }, 81 | Data { data: Vec, key: String }, 82 | } 83 | impl ObjectSource { 84 | pub fn file(path: PathBuf, key: String) -> Self { 85 | Self::File { path, key } 86 | } 87 | pub fn data>>(data: D, key: String) -> Self { 88 | Self::Data { 89 | data: data.into(), 90 | key, 91 | } 92 | } 93 | pub async fn create_stream(&self) -> Result<(ByteStream, usize), Error> { 94 | match self { 95 | Self::File { path, .. } => { 96 | let file = tokio::fs::File::open(path.clone()).await.with_context({ 97 | let path = path.clone(); 98 | move || err::Io { 99 | description: path.display().to_string(), 100 | } 101 | })?; 102 | let metadata = file.metadata().await.with_context({ 103 | let path = path.clone(); 104 | move || err::Io { 105 | description: path.display().to_string(), 106 | } 107 | })?; 108 | 109 | let len = metadata.len() as usize; 110 | // let boxbody = BoxBody::new( 111 | // FramedRead::new(file, BytesCodec::new()).map_ok(bytes::BytesMut::freeze), 112 | // ); 113 | // let sdk_body = SdkBody::from_dyn(boxbody); 114 | 115 | Ok((ByteStream::read_from().file(file).build().await?, len)) 116 | } 117 | Self::Data { data, .. 
} => Ok((data.clone().into(), data.len())), 118 | } 119 | } 120 | pub async fn create_upload_future( 121 | self, 122 | s3: aws_sdk_s3::Client, 123 | bucket: String, 124 | default: R, 125 | ) -> Result<(impl Future>, usize), Error> 126 | where 127 | R: Fn(&Client) -> PutObjectFluentBuilder + Clone + Unpin + Sync + Send + 'static, 128 | { 129 | let (stream, len) = self.create_stream().await?; 130 | let key = self.get_key().to_owned(); 131 | let (s3, bucket, default) = (s3.clone(), bucket.clone(), default.clone()); 132 | let future = async move { 133 | default(&s3) 134 | .set_bucket(Some(bucket.clone())) 135 | .set_key(Some(key.clone())) 136 | .set_body(Some(stream)) 137 | .set_content_length(Some(len as i64)) 138 | .send() 139 | .await 140 | .map_err(|e| e.into()) 141 | // .await 142 | .map(drop) 143 | }; 144 | Ok((future, len)) 145 | } 146 | pub fn get_key(&self) -> &str { 147 | match self { 148 | Self::File { key, .. } => key, 149 | Self::Data { key, .. } => key, 150 | } 151 | } 152 | } 153 | 154 | /// Convenience function (using `walkdir`) to traverse all files in directory `src_dir`. Returns an 155 | /// iterator that can be used as input to `S3Algo::upload_files`, which uploads files 156 | /// with a key equal to the file's path with `src_dir` stripped away, and with `key_prefix` 157 | /// prepended. 158 | pub fn files_recursive( 159 | src_dir: PathBuf, 160 | key_prefix: PathBuf, 161 | ) -> impl Iterator { 162 | #[cfg(windows)] 163 | use path_slash::PathExt; 164 | walkdir::WalkDir::new(&src_dir) 165 | .into_iter() 166 | .filter_map(move |entry| { 167 | let src_dir = src_dir.clone(); 168 | let key_prefix = key_prefix.clone(); 169 | entry.ok().and_then(move |entry| { 170 | if entry.file_type().is_file() { 171 | let path = entry.path().to_owned(); 172 | let key_suffix = path.strip_prefix(&src_dir).unwrap().to_path_buf(); 173 | let key = key_prefix.join(&key_suffix); 174 | Some(ObjectSource::File { 175 | path, 176 | #[cfg(unix)] 177 | key: key.to_string_lossy().to_string(), 178 | #[cfg(windows)] 179 | key: key.to_slash_lossy().to_string(), 180 | }) 181 | } else { 182 | None 183 | } 184 | }) 185 | }) 186 | } 187 | 188 | #[cfg(test)] 189 | mod test { 190 | use super::*; 191 | use tempdir::TempDir; 192 | #[test] 193 | fn test_files_recursive() { 194 | let tmp_dir = TempDir::new("s3-testing").unwrap(); 195 | let dir = tmp_dir.path(); 196 | for i in 0..10 { 197 | std::fs::write(dir.join(format!("img_{}.tif", i)), "file contents").unwrap(); 198 | } 199 | let files = files_recursive(dir.to_owned(), PathBuf::new()); 200 | assert_eq!(files.count(), 10); 201 | } 202 | } 203 | -------------------------------------------------------------------------------- /src/test.rs: -------------------------------------------------------------------------------- 1 | use crate::*; 2 | use rand::Rng; 3 | use std::path::Path; 4 | use std::sync::Arc; 5 | use tempdir::TempDir; 6 | use timeout::TimeoutState; 7 | use tokio::io::AsyncReadExt; 8 | use tokio::sync::Mutex; 9 | 10 | /* 11 | /// Timeout implementation used for testing 12 | struct TimeoutState; 13 | impl Timeout for TimeoutState { 14 | fn get_timeout(&self, _bytes: usize, _attempts: usize) -> Duration { 15 | Duration::from_secs(4) 16 | } 17 | fn update(&mut self, _: &RequestReport) {} 18 | fn get_estimate(&self) -> f64 { 19 | 0.0 20 | } 21 | } 22 | */ 23 | 24 | pub(crate) fn rand_string(n: usize) -> String { 25 | rand::thread_rng() 26 | .sample_iter(&rand::distributions::Alphanumeric) 27 | .take(n) 28 | .map(|x| x as char) 29 | .collect::() 30 | } 31 | 32 | 
#[test] 33 | fn everything_is_sync_and_static() { 34 | // This is only to test that it compiles 35 | fn verify(_: F) 36 | where 37 | F: Future + Send + 'static, 38 | { 39 | } 40 | 41 | verify(s3_request( 42 | || async move { Ok((async move { Ok(()) }, 0)) }, 43 | |_, size| size, 44 | 5, 45 | Arc::new(Mutex::new(TimeoutState::new( 46 | AlgorithmConfig::default(), 47 | SpecificTimings::default_for_bytes(), 48 | ))), 49 | )) 50 | } 51 | 52 | #[tokio::test] 53 | async fn test_s3_upload_files() { 54 | const N_FILES: usize = 100; 55 | let tmp_dir = TempDir::new("s3-testing").unwrap(); 56 | 57 | let s3 = testing_sdk_client().await; 58 | let algo = S3Algo::new(s3.clone()); 59 | let dir_key = upload_test_files(algo.clone(), tmp_dir.path(), N_FILES) 60 | .await 61 | .unwrap(); 62 | 63 | // Check that all files are there 64 | for i in 0..N_FILES { 65 | // let key = format!("{}/img_{}.tif", dir_key, i); 66 | let key = dir_key.join(format!("img_{}.tif", i)); 67 | 68 | let response = s3 69 | .get_object() 70 | .bucket("test-bucket".to_string()) 71 | .key(key.to_str().unwrap().to_string()) 72 | .send() 73 | .await 74 | .unwrap(); 75 | 76 | let mut body = response.body.into_async_read(); 77 | let mut content = Vec::new(); 78 | body.read_to_end(&mut content).await.unwrap(); 79 | let content = std::str::from_utf8(&content).unwrap(); 80 | assert_eq!(content, "file contents"); 81 | } 82 | } 83 | 84 | #[tokio::test] 85 | async fn test_s3_timeouts() { 86 | // TODO finish test 87 | // Currently just prints things to inspect how timeout behaves 88 | 89 | let bytes: Vec = vec![500_000, 999_999, 1_000_001, 2_000_000]; 90 | // Test that timeout on successive errors follows a desired curve 91 | 92 | // These are all parameters related to timeout, shown explicitly 93 | let cfg = Config { 94 | algorithm: AlgorithmConfig { 95 | backoff: 1.5, 96 | base_timeout: 0.5, 97 | timeout_fraction: 1.5, 98 | avg_power: 0.7, 99 | ..Default::default() 100 | }, 101 | ..Default::default() 102 | }; 103 | 104 | for bytes in bytes { 105 | println!("# Bytes = {}", bytes); 106 | let timeout = TimeoutState::new(cfg.algorithm.clone(), cfg.put_requests.clone()); 107 | 108 | let timeouts = (1..=10) 109 | .map(|retries| timeout.get_timeout(bytes, retries)) 110 | .collect::>(); 111 | println!("{:?}", timeouts); 112 | } 113 | } 114 | 115 | /// Returns the common prefix of all files in S3 116 | async fn upload_test_files(s3: S3Algo, parent: &Path, n_files: usize) -> Result { 117 | let dir_key = Path::new(&rand_string(4)) 118 | .join(rand_string(4)) 119 | .join(rand_string(4)); 120 | let dir = parent.join(&dir_key); 121 | std::fs::create_dir_all(&dir).unwrap(); 122 | for i in 0..n_files { 123 | std::fs::write(dir.join(format!("img_{}.tif", i)), "file contents").unwrap(); 124 | } 125 | 126 | println!("Upload {} to {:?} ", dir.display(), dir_key); 127 | s3.upload_files( 128 | "test-bucket".into(), 129 | files_recursive(dir.clone(), dir.strip_prefix(parent).unwrap().to_owned()), 130 | |_| async move {}, 131 | |client| client.put_object(), 132 | ) 133 | .await?; 134 | Ok(dir_key) 135 | } 136 | 137 | // TODO uncomment after rewriting move_all function ETC 138 | /* 139 | #[tokio::test] 140 | async fn test_move_files() { 141 | const N_FILES: usize = 100; 142 | let s3 = testing_sdk_client().await; 143 | let algo = S3Algo::new(s3.clone()); 144 | let tmp_dir = TempDir::new("s3-testing").unwrap(); 145 | let prefix = upload_test_files(algo.clone(), tmp_dir.path(), N_FILES) 146 | .await 147 | .unwrap(); 148 | let new_prefix = PathBuf::from("haha/lala"); 149 | 
println!( 150 | "Move prefix {} to {}", 151 | prefix.display(), 152 | new_prefix.display() 153 | ); 154 | 155 | // TODO try also the following more manual way of doing the same 156 | /* 157 | algo.list_prefix("test-bucket".into(), format!("{}", prefix.display())) 158 | .boxed() // hope we can remove boxed() soon (it's for reducing type size) 159 | .move_all( 160 | move |key| { 161 | let key = PathBuf::from(key); 162 | let name = key.file_name().unwrap(); 163 | format!("{}/{}", new_prefix2.display(), name.to_str().unwrap()) 164 | }, 165 | None, 166 | ) 167 | .await 168 | .unwrap(); 169 | */ 170 | algo.list_prefix("test-bucket".into(), prefix.to_str().map(|x| x.to_owned())) 171 | .boxed() // hope we can remove boxed() soon (it's for reducing type size) 172 | .move_to_prefix( 173 | None, 174 | new_prefix.to_str().unwrap().to_owned(), 175 | Default::default, 176 | ) 177 | .boxed() 178 | .await 179 | .unwrap(); 180 | 181 | // Check that all files are under `new_prefix` and not under `prefix` 182 | for i in 0..N_FILES { 183 | let key = new_prefix.join(format!("img_{}.tif", i)); 184 | let response = s3.get_object(GetObjectRequest { 185 | bucket: "test-bucket".to_string(), 186 | key: key.to_str().unwrap().to_string(), 187 | ..Default::default() 188 | }); 189 | let _ = response.await.unwrap(); 190 | 191 | let key = prefix.join(format!("img_{}.tif", i)); 192 | let response = s3.get_object(GetObjectRequest { 193 | bucket: "test-bucket".to_string(), 194 | key: key.to_str().unwrap().to_string(), 195 | ..Default::default() 196 | }); 197 | let _ = response.await.unwrap_err(); 198 | } 199 | } 200 | */ 201 | 202 | // TODO: uncomment after rewriting copy_all function 203 | /* 204 | #[tokio::test] 205 | async fn test_copy_files() { 206 | const N_FILES: usize = 100; 207 | let s3 = testing_s3_client(); 208 | let algo = S3Algo::new(s3.clone()); 209 | let tmp_dir = TempDir::new("s3-testing").unwrap(); 210 | let prefix = upload_test_files(algo.clone(), tmp_dir.path(), N_FILES) 211 | .await 212 | .unwrap(); 213 | 214 | let n = Arc::new(std::sync::Mutex::new(0_usize)); 215 | let m = n.clone(); 216 | algo.list_prefix("test-bucket".into(), prefix.to_str().unwrap().to_owned()) 217 | .boxed() // hope we can remove boxed() soon (it's for reducing type size) 218 | .copy_all( 219 | Some("test-bucket2".into()), 220 | move |key| { 221 | *m.lock().unwrap() += 1; 222 | format!("test_copy_files/{}", key) 223 | }, 224 | Default::default, 225 | ) 226 | .boxed() 227 | .await 228 | .unwrap(); 229 | assert_eq!(*n.lock().unwrap(), N_FILES); 230 | 231 | // Check that all objects are present in both buckets 232 | for i in 0..N_FILES { 233 | let key = format!("test_copy_files/{}/img_{}.tif", prefix.display(), i); 234 | let response = s3.get_object(GetObjectRequest { 235 | bucket: "test-bucket2".to_string(), 236 | key, 237 | ..Default::default() 238 | }); 239 | let _ = response.await.unwrap(); 240 | 241 | let key = prefix.join(format!("img_{}.tif", i)); 242 | let response = s3.get_object(GetObjectRequest { 243 | bucket: "test-bucket".to_string(), 244 | key: key.to_str().unwrap().to_string(), 245 | ..Default::default() 246 | }); 247 | let _ = response.await.unwrap(); 248 | } 249 | } 250 | */ 251 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | //! # S3 high-performance algorithms 2 | //! High-performance algorithms for batch operations in Amazon S3. 3 | //! 4 | //! 
https://docs.aws.amazon.com/AmazonS3/latest/dev/optimizing-performance-guidelines.html 5 | //! 6 | //! - Upload multiple files with `S3Algo::upload_files`. 7 | //! - List files with `S3Algo::s3_list_objects` or `S3Algo::s3_list_prefix`, 8 | //! and then execute deletion or copy on all the files. 9 | 10 | use crate::timeout::*; 11 | use aws_config::default_provider::credentials::DefaultCredentialsChain; 12 | use aws_config::meta::region::RegionProviderChain; 13 | use aws_config::BehaviorVersion; 14 | use aws_sdk_s3::config::retry::RetryConfig; 15 | use aws_sdk_s3::Client; 16 | use futures::future::{Future, TryFutureExt}; 17 | use futures::prelude::*; 18 | use futures::stream; 19 | use futures_retry::{FutureRetry, RetryPolicy}; 20 | use futures_stopwatch::try_stopwatch; 21 | use snafu::futures::TryFutureExt as S; 22 | use snafu::ResultExt; 23 | use std::marker::Unpin; 24 | use std::path::PathBuf; 25 | use std::sync::Arc; 26 | use std::time::Duration; 27 | use tokio::sync::Mutex; 28 | 29 | mod config; 30 | pub mod err; 31 | mod list_actions; 32 | mod upload; 33 | 34 | pub use list_actions::*; 35 | pub use upload::*; 36 | pub mod timeout; 37 | pub use config::*; 38 | pub use err::Error; 39 | 40 | #[cfg(test)] 41 | mod test; 42 | 43 | #[derive(Clone)] 44 | pub struct S3Algo { 45 | s3: Client, 46 | config: Config, 47 | } 48 | impl S3Algo { 49 | pub fn new(s3: Client) -> Self { 50 | Self { 51 | s3, 52 | config: Config::default(), 53 | } 54 | } 55 | pub fn with_config(s3: Client, config: Config) -> Self { 56 | Self { s3, config } 57 | } 58 | } 59 | 60 | /// Result of a single S3 request. 61 | #[derive(Debug, Clone, Copy)] 62 | pub struct RequestReport { 63 | /// The number of this request in a series of multiple requests (0 if not applicable) 64 | pub seq: usize, 65 | /// Size of request - in bytes or in number of objects, depending on the type of request. 66 | pub size: usize, 67 | /// The total time including all retries 68 | pub total_time: Duration, 69 | /// The time of the successful request 70 | pub success_time: Duration, 71 | /// Number of attempts. A value of `1` means no retries - success on first attempt. 72 | pub attempts: usize, 73 | /// Estimated sec/unit that was used in this request. Useful for 74 | /// debugging the upload algorithm and not much more. 75 | pub est: f64, 76 | } 77 | 78 | /// Issue a single S3 request, with retries and appropriate timeouts using sane defaults. 79 | /// Basically an easier, less general version of `s3_request`. 80 | /// 81 | /// `extra_initial_timeout`: initial timeout of request (will increase with backoff) added to 82 | /// `cfg.base_timeout`. It can be set to 0 if the S3 operation is a small one, but if the operation 83 | /// size depends on for example a byte count or object count, set it to something that depends on 84 | /// that. 85 | pub async fn s3_single_request( 86 | future_factory: F, 87 | extra_initial_timeout_s: f64, 88 | ) -> Result<(RequestReport, R), Error> 89 | where 90 | F: Fn() -> G + Unpin + Clone + Send + Sync + 'static, 91 | G: Future> + Send, 92 | { 93 | // Configure a one-time Timeout that gives the desired initial_timeout_s on first try. 
94 |     // We tell `s3_request` that the request is of size `1`.
95 | 
96 |     let timeout = TimeoutState::new(
97 |         AlgorithmConfig::default(),
98 |         SpecificTimings {
99 |             seconds_per_unit: extra_initial_timeout_s,
100 |             minimum_units_for_estimation: 0, // doesn't matter
101 |         },
102 |     );
103 | 
104 |     s3_request(
105 |         move || {
106 |             let factory = future_factory.clone();
107 |             async move { Ok((factory(), 1)) }
108 |         },
109 |         |_, size| size,
110 |         10,
111 |         Arc::new(Mutex::new(timeout)),
112 |     )
113 |     .await
114 | }
115 | 
116 | /// Every request to S3 should be issued with `s3_request`, which puts the appropriate timeouts on
117 | /// the request, retries it, and times it.
118 | ///
119 | /// `future_factory` is a bit funky, being a closure that returns a future that resolves to another
120 | /// future. We need the closure F to run the request multiple times. Its return type G is a future
121 | /// because it might, for example, need to open a file asynchronously, which is then used in H to
122 | /// stream from the file.
123 | /// This is needed so that we can get e.g. the length of the file before streaming to S3.
124 | ///
125 | /// `get_size(response, expected)`: get the real size of the request. For some types of requests
126 | /// (e.g. DeleteObjects/PutObject), we know the size upfront, so real size = expected.
127 | /// For others (ListObjectsV2), we need the result of the action to know the size.
128 | /// The size returned from this function is only used to construct the `RequestReport`, which in
129 | /// turn is only useful for eventual progress closures. So the `get_size` parameter exists only to
130 | /// support progress monitoring.
131 | ///
132 | /// The "expected" size returned by `future_factory`, on the other hand, is needed to calculate the
133 | /// timeout.
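///
/// A sketch of the closure shapes, adapted from the `everything_is_sync_and_static` test in
/// `src/test.rs` (marked `ignore` because this function is crate-private and cannot be called from
/// doctests):
///
/// ```ignore
/// let (report, response) = s3_request(
///     // future_factory: open any resources, then yield (request future, expected size)
///     || async move { Ok((async move { Ok(()) }, 0)) },
///     // get_size: here the expected size is already the real size
///     |_response, expected| expected,
///     5, // n_retries
///     Arc::new(Mutex::new(TimeoutState::new(
///         AlgorithmConfig::default(),
///         SpecificTimings::default_for_bytes(),
///     ))),
/// )
/// .await?;
/// ```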
134 | pub(crate) async fn s3_request( 135 | future_factory: F, 136 | get_size: S, 137 | n_retries: usize, 138 | timeout: Arc>, 139 | ) -> Result<(RequestReport, R), Error> 140 | where 141 | F: Fn() -> G + Unpin + Clone + Send + Sync + 'static, 142 | G: Future> + Send, 143 | H: Future> + Send, 144 | S: Fn(&R, usize) -> usize + Unpin + Clone + Send + Sync + 'static, 145 | T: timeout::Timeout, 146 | { 147 | let mut attempts1 = 0; 148 | let mut attempts2 = 0; 149 | try_stopwatch( 150 | // Time the entire file upload (across all retries) 151 | FutureRetry::new( 152 | // Future factory - creates a future that reads file while uploading it 153 | move || { 154 | let (future_factory, timeout, get_size) = 155 | (future_factory.clone(), timeout.clone(), get_size.clone()); 156 | 157 | async move { 158 | attempts1 += 1; 159 | let (request, expected_size) = future_factory().await?; 160 | let (est, timeout_value) = { 161 | let t = timeout.lock().await; 162 | (t.get_estimate(), t.get_timeout(expected_size, attempts1)) 163 | }; 164 | try_stopwatch( 165 | tokio::time::timeout(timeout_value, request) 166 | .with_context(|| err::Timeout {}) 167 | .map(|result| result.and_then(|x| x)), // flatten the Result, timeout err> 168 | ) 169 | .map_ok(move |(response, success_time)| { 170 | let real_size = get_size(&response, expected_size); 171 | (response, success_time, real_size, est) 172 | }) 173 | .await 174 | } 175 | }, 176 | // retry function 177 | { 178 | move |e| { 179 | attempts2 += 1; 180 | if attempts2 > n_retries { 181 | RetryPolicy::ForwardError(e) 182 | } else { 183 | RetryPolicy::WaitRetry(Duration::from_millis(200)) // TODO adjust the time, maybe depending on retries 184 | } 185 | } 186 | }, 187 | ), 188 | ) 189 | .await 190 | .map( 191 | move |(((response, success_time, size, est), attempts), total_time)| { 192 | ( 193 | RequestReport { 194 | seq: 0, 195 | size, 196 | total_time, 197 | success_time, 198 | attempts, 199 | est, 200 | }, 201 | response, 202 | ) 203 | }, 204 | ) 205 | .map_err(|(err, _attempts)| err) 206 | } 207 | 208 | pub async fn retriable_s3_client() -> Client { 209 | let retry_config = RetryConfig::standard() 210 | .with_max_attempts(3) 211 | .with_initial_backoff(Duration::from_secs(10)); 212 | 213 | let region_provider = RegionProviderChain::default_provider(); 214 | let sdk_config = aws_config::defaults(BehaviorVersion::v2024_03_28()) 215 | .region(region_provider) 216 | .load() 217 | .await; 218 | 219 | let mut s3_config_builder = aws_sdk_s3::config::Builder::from(&sdk_config); 220 | s3_config_builder.set_retry_config(Some(retry_config)); 221 | 222 | aws_sdk_s3::Client::from_conf(s3_config_builder.build()) 223 | } 224 | 225 | pub async fn testing_sdk_client() -> Client { 226 | let retry_config = RetryConfig::standard() 227 | .with_max_attempts(3) 228 | .with_initial_backoff(Duration::from_secs(10)); 229 | 230 | let credentials_provider = DefaultCredentialsChain::builder() 231 | .profile_name("testing") 232 | .build() 233 | .await; 234 | let region_provider = RegionProviderChain::first_try("EuWest1"); 235 | let sdk_config = aws_config::defaults(BehaviorVersion::v2024_03_28()) 236 | .region(region_provider) 237 | .endpoint_url("http://localhost:9000") 238 | .credentials_provider(credentials_provider) 239 | .load() 240 | .await; 241 | 242 | let mut s3_config_builder = aws_sdk_s3::config::Builder::from(&sdk_config); 243 | s3_config_builder.set_retry_config(Some(retry_config)); 244 | s3_config_builder.set_force_path_style(Some(true)); 245 | 246 | 
Client::from_conf(s3_config_builder.build()) 247 | } 248 | -------------------------------------------------------------------------------- /src/list_actions.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | use aws_sdk_s3::operation::list_objects_v2::ListObjectsV2Output; 3 | use aws_sdk_s3::primitives::ByteStream; 4 | use aws_sdk_s3::types::{Delete, Object, ObjectIdentifier}; 5 | use aws_smithy_types_convert::stream::PaginationStreamExt; 6 | use futures::future::ok; 7 | use futures::stream::Stream; 8 | use std::future::Future; 9 | use std::pin::Pin; 10 | use std::task::{Context, Poll}; 11 | use tokio::io; 12 | 13 | /// A stream that can list objects, and (using member functions) delete or copy listed files. 14 | pub struct ListObjects { 15 | s3: Client, 16 | config: Config, 17 | bucket: String, 18 | /// Common prefix (as requested) of the listed objects. Empty string if all objects were 19 | /// requestd. 20 | prefix: String, 21 | stream: S, 22 | } 23 | impl ListObjects 24 | where 25 | S: Stream> + Sized + Send + 'static, 26 | { 27 | pub fn boxed( 28 | self, 29 | ) -> ListObjects> + Send>>> { 30 | ListObjects { 31 | s3: self.s3, 32 | config: self.config, 33 | bucket: self.bucket, 34 | stream: self.stream.boxed(), 35 | prefix: self.prefix, 36 | } 37 | } 38 | 39 | /// Calls an async closure on all the individual objects of the list operation 40 | pub async fn process(self, f: P) -> Result<(), Error> 41 | where 42 | P: Fn(Object) -> F + Clone, 43 | F: Future, 44 | { 45 | let ListObjects { 46 | stream, prefix: _, .. 47 | } = self; 48 | stream 49 | .try_filter_map(|response| ok(response.contents)) 50 | .map_ok(|x| stream::iter(x).map(Ok)) 51 | .try_flatten() 52 | .try_for_each_concurrent(None, move |object| { 53 | let f = f.clone(); 54 | async move { 55 | f(object).await; 56 | Ok(()) 57 | } 58 | }) 59 | .await 60 | } 61 | /// Download all listed objects - returns a stream of the contents. 62 | /// Used as a basis for other `download_all_*` functions. 63 | pub fn download_all_stream( 64 | self, 65 | ) -> impl Stream), Error>> { 66 | let ListObjects { 67 | s3, 68 | config: _, 69 | bucket, 70 | stream, 71 | prefix: _, 72 | } = self; 73 | stream 74 | .try_filter_map(|response| ok(response.contents)) 75 | .map_ok(|x| stream::iter(x).map(Ok)) 76 | .try_flatten() 77 | .map(|result| { 78 | result.and_then(|obj| { 79 | let Object { key, size, .. } = obj; 80 | if let Some(key) = key { 81 | Ok((key, size)) 82 | } else { 83 | Err(Error::MissingKeyOrSize) 84 | } 85 | }) 86 | }) 87 | .and_then(move |(key, _)| { 88 | let (s3, bucket) = (s3.clone(), bucket.clone()); 89 | 90 | async move { 91 | let output = s3 92 | .get_object() 93 | .bucket(bucket.clone()) 94 | .key(key.clone()) 95 | .send() 96 | .await 97 | .context(err::GetObject { 98 | key: key.clone(), 99 | bucket, 100 | })?; 101 | Ok((key, output.body, output.content_length)) 102 | } 103 | }) 104 | } 105 | 106 | pub fn download_all_to_vec(self) -> impl Stream), Error>> { 107 | self.download_all_stream() 108 | .and_then(|(key, body, _)| async move { 109 | let mut contents = vec![]; 110 | io::copy(&mut body.into_async_read(), &mut contents) 111 | .await 112 | .context(err::TokioIo)?; 113 | Ok((key, contents)) 114 | }) 115 | } 116 | 117 | /* 118 | /// Download all listed objects to file system. 119 | /// UNIMPLEMENTED. 120 | pub fn download_all(self) -> impl Future> { 121 | // TODO use download_all_stream 122 | ok(unimplemented!()) 123 | } 124 | */ 125 | 126 | /// Delete all listed objects. 
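    ///
    /// A minimal sketch (the bucket and prefix are placeholders; the closures may simply do nothing):
    ///
    /// ```no_run
    /// # async fn doc(algo: s3_algo::S3Algo) -> Result<(), s3_algo::Error> {
    /// algo.list_prefix("test-bucket".into(), Some("some/prefix".into()))
    ///     .delete_all(
    ///         |n_listed| async move { println!("listed {} objects", n_listed) },
    ///         |report| async move { println!("deleted {} objects", report.size) },
    ///     )
    ///     .await
    /// # }
    /// ```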
127 | /// 128 | /// With the two arguments, you can implement a detailed real-time progress report of both how 129 | /// many files have been listed, and how many files have been deleted. 130 | /// 131 | /// `list_progress`: Closure that is given number of files listed as argument. Is called 132 | /// several times, one for each batch of files listed. 133 | /// `delete_progress`: Closure that is given RequestReport of a delete request. The `size` 134 | /// field refers to the number of fields deleted. 135 | /// 136 | pub fn delete_all( 137 | self, 138 | list_progress: P1, 139 | delete_progress: P2, 140 | ) -> impl Future> 141 | where 142 | P1: Fn(usize) -> F1 + Clone + Send + Sync + 'static, 143 | P2: Fn(RequestReport) -> F2 + Clone + Send + Sync + 'static, 144 | F1: Future + Send + 'static, 145 | F2: Future + Send + 'static, 146 | { 147 | // For each ListObjectsV2Output, send a request to delete all the listed objects 148 | let ListObjects { 149 | s3, 150 | config, 151 | bucket, 152 | stream, 153 | prefix: _, 154 | } = self; 155 | let timeout = Arc::new(Mutex::new(TimeoutState::new( 156 | config.algorithm.clone(), 157 | config.delete_requests.clone(), 158 | ))); 159 | let n_retries = config.algorithm.n_retries; 160 | stream.try_for_each_concurrent(None, move |object| { 161 | let (s3, bucket, timeout, delete_progress2, list_progress2) = ( 162 | s3.clone(), 163 | bucket.clone(), 164 | timeout.clone(), 165 | delete_progress.clone(), 166 | list_progress.clone(), 167 | ); 168 | let objects = object 169 | .contents 170 | .unwrap_or_default() // unwrap or empty Vec 171 | .iter() 172 | .filter_map(|obj| { 173 | obj.key.as_ref().map(|key| { 174 | ObjectIdentifier::builder() 175 | .set_key(Some(key.clone())) 176 | .set_version_id(None) 177 | .build() 178 | .unwrap() // unwrap: shouldn't fail building as the key comes directly 179 | // from S3 180 | }) 181 | }) 182 | .collect::>(); 183 | let n_objects = objects.len(); 184 | 185 | async move { 186 | list_progress2(n_objects).await; 187 | let (report, _) = s3_request( 188 | move || { 189 | let (s3, bucket, objects) = (s3.clone(), bucket.clone(), objects.clone()); 190 | async move { 191 | let (s3, bucket, objects) = 192 | (s3.clone(), bucket.clone(), objects.clone()); 193 | Ok(( 194 | async move { 195 | s3.delete_objects() 196 | .set_bucket(Some(bucket)) 197 | .set_delete(Some( 198 | Delete::builder() 199 | .set_objects(Some(objects)) 200 | .build() 201 | .unwrap(), // unwrap: shouldn't fail building 202 | // because all the input comes directly from S3 203 | )) 204 | .send() 205 | .await 206 | .map_err(|e| e.into()) 207 | }, 208 | n_objects, 209 | )) 210 | } 211 | }, 212 | |_, size| size, 213 | n_retries, 214 | timeout.clone(), 215 | ) 216 | .await?; 217 | timeout.lock().await.update(&report); 218 | delete_progress2(report).await; 219 | Ok(()) 220 | } 221 | }) 222 | } 223 | 224 | /// Flatten into a stream of Objects. 225 | pub fn flatten(self) -> impl Stream> { 226 | self.stream 227 | .try_filter_map(|response| ok(response.contents)) 228 | .map_ok(|x| stream::iter(x).map(Ok)) 229 | .try_flatten() 230 | } 231 | 232 | /* 233 | /// This function exists to provide a stream to copy all objects, for both `copy_all` and 234 | /// `move_all`. The `String` that is the stream's `Item` is the _source key_. An `Ok` value 235 | /// thus signals (relevant when used in `move_all`) that a certain key is ready for deletion. 
236 | fn copy_all_stream( 237 | self, 238 | dest_bucket: Option, 239 | mapping: F, 240 | default_request: R, 241 | ) -> impl Stream> 242 | where 243 | F: Fn(&str) -> String + Clone + Send + Sync + Unpin + 'static, 244 | R: Fn() -> CopyObjectRequest + Clone + Unpin + Sync + Send + 'static, 245 | { 246 | let ListObjects { 247 | s3, 248 | config, 249 | bucket, 250 | stream, 251 | prefix: _, 252 | } = self; 253 | let timeout = Arc::new(Mutex::new(TimeoutState::new( 254 | config.algorithm.clone(), 255 | config.put_requests.clone(), 256 | ))); 257 | let n_retries = config.algorithm.n_retries; 258 | let dest_bucket = dest_bucket.unwrap_or_else(|| bucket.clone()); 259 | stream 260 | .try_filter_map(|response| ok(response.1.contents)) 261 | .map_ok(|x| stream::iter(x).map(Ok)) 262 | .try_flatten() 263 | .try_filter_map(|obj| { 264 | // Just filter out any object that does not have both of `key` and `size` 265 | let Object { key, size, .. } = obj; 266 | ok(key.and_then(|key| size.map(|size| (key, size)))) 267 | }) 268 | .and_then(move |(key, size)| { 269 | let (s3, timeout) = (s3.clone(), timeout.clone()); 270 | let request = CopyObjectRequest { 271 | copy_source: format!("{}/{}", bucket, key), 272 | bucket: dest_bucket.clone(), 273 | key: mapping(&key), 274 | ..default_request() 275 | }; 276 | // println!("COPY REQUEST\n{:#?}", request); 277 | s3_request( 278 | move || { 279 | let (s3, request) = (s3.clone(), request.clone()); 280 | async move { 281 | let (s3, request) = (s3.clone(), request.clone()); 282 | Ok((async move{s3.copy_object(request).context(err::CopyObject).await}, size as usize)) 283 | } 284 | }, 285 | |_, size| size, 286 | n_retries, 287 | timeout, 288 | ) 289 | .map_ok(|_| key) 290 | }) 291 | } 292 | 293 | /// Copy all listed objects, to a different S3 location as defined in `mapping` and 294 | /// `dest_bucket`. 295 | /// If `other_bucket` is not provided, copy to same bucket 296 | pub fn copy_all( 297 | self, 298 | dest_bucket: Option, 299 | mapping: F, 300 | default_request: R, 301 | ) -> impl Future> 302 | where 303 | F: Fn(&str) -> String + Clone + Send + Sync + Unpin + 'static, 304 | R: Fn() -> CopyObjectRequest + Clone + Unpin + Sync + Send + 'static, 305 | { 306 | self.copy_all_stream(dest_bucket, mapping, default_request) 307 | .try_for_each(|_| async { Ok(()) }) 308 | } 309 | // TODO: Is it possible to change copy_all so that we can move_all by just chaining copy_all 310 | // and delete_all? Then copy_all would need to return a stream of old keys, but does that make 311 | // sense in general? 312 | // For now, this is code duplication. 
313 | pub fn move_all( 314 | self, 315 | dest_bucket: Option, 316 | mapping: F, 317 | default_request: R, 318 | ) -> impl Future> 319 | where 320 | F: Fn(&str) -> String + Clone + Send + Sync + Unpin + 'static, 321 | R: Fn() -> CopyObjectRequest + Clone + Unpin + Sync + Send + 'static, 322 | { 323 | let src_bucket = self.bucket.clone(); 324 | let timeout = Arc::new(Mutex::new(TimeoutState::new( 325 | self.config.algorithm.clone(), 326 | self.config.delete_requests.clone(), 327 | ))); 328 | let n_retries = self.config.algorithm.n_retries; 329 | let s3 = self.s3.clone(); 330 | self.copy_all_stream(dest_bucket, mapping, default_request) 331 | .and_then(move |src_key| { 332 | let delete_request = DeleteObjectRequest { 333 | bucket: src_bucket.clone(), 334 | key: src_key, 335 | ..Default::default() 336 | }; 337 | let (s3, timeout) = (s3.clone(), timeout.clone()); 338 | s3_request( 339 | move || { 340 | let (s3, delete_request) = (s3.clone(), delete_request.clone()); 341 | async move { 342 | let (s3, delete_request) = (s3.clone(), delete_request.clone()); 343 | Ok(( 344 | async move { 345 | s3.delete_object(delete_request) 346 | .context(err::DeleteObject) 347 | .await 348 | }, 349 | 1, 350 | )) 351 | } 352 | }, 353 | |_, _| 1, 354 | n_retries, 355 | timeout, 356 | ) 357 | .map_ok(drop) 358 | .boxed() 359 | }) 360 | .try_for_each(|_| async { Ok(()) }) 361 | .boxed() 362 | } 363 | /// Move all listed objects by substituting their common prefix with `new_prefix`. 364 | pub fn move_to_prefix( 365 | self, 366 | dest_bucket: Option, 367 | new_prefix: String, 368 | default_request: R, 369 | ) -> impl Future> 370 | where 371 | R: Fn() -> CopyObjectRequest + Clone + Unpin + Sync + Send + 'static, 372 | { 373 | let old_prefix = self.prefix.clone(); 374 | let substitute_prefix = 375 | move |source: &str| format!("{}{}", new_prefix, source.trim_start_matches(&old_prefix)); 376 | self.move_all(dest_bucket, substitute_prefix, default_request) 377 | .boxed() 378 | } 379 | */ 380 | } 381 | 382 | impl Stream for ListObjects 383 | where 384 | S: Stream> + Sized + Send + Unpin, 385 | { 386 | type Item = Result; 387 | fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context) -> Poll> { 388 | Pin::new(&mut self.stream).poll_next(cx) 389 | } 390 | } 391 | 392 | impl S3Algo { 393 | /// List objects of a bucket. 394 | pub fn list_prefix( 395 | &self, 396 | bucket: String, 397 | prefix: Option, 398 | ) -> ListObjects> + Sized + Send> { 399 | // TODO: Reintroduce retry and timeout 400 | 401 | let stream = self 402 | .s3 403 | .list_objects_v2() 404 | .bucket(bucket.clone()) 405 | .set_prefix(prefix) 406 | .into_paginator() 407 | .send(); 408 | let stream = PaginationStreamExt::into_stream_03x(stream) 409 | // Turn into a stream of Objects 410 | .map_err(|source| Error::ListObjectsV2 { source }); 411 | 412 | ListObjects { 413 | s3: self.s3.clone(), 414 | config: self.config.clone(), 415 | stream, 416 | bucket, 417 | prefix: String::new(), 418 | } 419 | } 420 | } 421 | 422 | #[cfg(test)] 423 | mod test { 424 | use super::*; 425 | use crate::test::rand_string; 426 | use std::sync::atomic::{AtomicUsize, Ordering}; 427 | #[tokio::test] 428 | async fn test_s3_delete_files_progress() { 429 | // Minio does paging at 10'000 fles, so we need more than that. 430 | // It means this test will take a minutes or two. 
431 | let algo = S3Algo::new(testing_sdk_client().await); 432 | let dir = rand_string(14); 433 | let dir2 = dir.clone(); 434 | const N_FILES: usize = 11_000; 435 | let files = (0..N_FILES).map(move |i| ObjectSource::Data { 436 | data: vec![1, 2, 3], 437 | key: format!("{}/{}.file", dir2, i), 438 | }); 439 | algo.upload_files( 440 | "test-bucket".into(), 441 | files, 442 | |result| async move { 443 | if result.seq % 100 == 0 { 444 | println!("{} files uploaded", result.seq); 445 | } 446 | }, 447 | |client| client.put_object(), 448 | ) 449 | .await 450 | .unwrap(); 451 | 452 | let listed_files = Arc::new(AtomicUsize::new(0)); 453 | let deleted_files = Arc::new(AtomicUsize::new(0)); 454 | let listed_files2 = listed_files.clone(); 455 | let deleted_files2 = deleted_files.clone(); 456 | 457 | // Do one listing only to check the exact file names 458 | let present = Arc::new(Mutex::new(std::collections::HashSet::new())); 459 | algo.list_prefix("test-bucket".into(), Some(dir.clone())) 460 | .process(|object| async { 461 | let name = object.key.unwrap_or_else(|| "NONE".to_string()); 462 | println!("OBJ {}", name); 463 | present.lock().await.insert(name); 464 | }) 465 | .await 466 | .unwrap(); 467 | let mut present = present.lock().await; 468 | 469 | // All files are present 470 | for i in 0..N_FILES { 471 | let file_name = &format!("{}/{}.file", dir, i); 472 | assert!(present.remove(file_name)); 473 | } 474 | 475 | // No unexpected filesnames. 476 | // Because once, it listed 11_200 files instead of 11_000 477 | if !present.is_empty() { 478 | println!("Left-over object names: {:?}", present); 479 | panic!("Not empty ({} files)", present.len()); 480 | } 481 | 482 | // Assert that number of files is N_FILES 483 | let count = algo 484 | .list_prefix("test-bucket".into(), Some(dir.clone())) 485 | .flatten() 486 | .try_fold(0usize, |acc, _| ok(acc + 1)) 487 | .await 488 | .unwrap(); 489 | assert_eq!(count, N_FILES); 490 | 491 | // Delete all 492 | algo.list_prefix("test-bucket".into(), Some(dir.clone())) 493 | .delete_all( 494 | move |n| { 495 | println!("Listed {} items", n); 496 | let listed_files = listed_files2.clone(); 497 | async move { 498 | listed_files.fetch_add(n, Ordering::Relaxed); 499 | } 500 | }, 501 | move |del_rep| { 502 | let n = del_rep.size as usize; 503 | println!("Deleted {} items", n); 504 | let deleted_files = deleted_files2.clone(); 505 | async move { 506 | deleted_files.fetch_add(n, Ordering::Relaxed); 507 | } 508 | }, 509 | ) 510 | .await 511 | .unwrap(); 512 | 513 | // Assert number of objects listed and deleted 514 | assert_eq!(listed_files.load(Ordering::Relaxed), N_FILES); 515 | assert_eq!(deleted_files.load(Ordering::Relaxed), N_FILES); 516 | 517 | // Assert that number of files is 0 518 | let count = algo 519 | .list_prefix("test-bucket".into(), Some(dir)) 520 | .flatten() 521 | .try_fold(0usize, |acc, _| ok(acc + 1)) 522 | .await 523 | .unwrap(); 524 | 525 | assert_eq!(count, 0); 526 | } 527 | } 528 | --------------------------------------------------------------------------------