├── rustfmt.toml ├── .gitignore ├── .pre-commit ├── examples ├── simple.rs └── perf_data.rs ├── Cargo.toml ├── src ├── timeout.rs ├── err.rs ├── config.rs ├── upload.rs ├── test.rs ├── lib.rs └── list_actions.rs └── README.md /rustfmt.toml: -------------------------------------------------------------------------------- 1 | edition = "2021" 2 | imports_granularity = "Module" 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | **/*.rs.bk 3 | Cargo.lock 4 | rusty-tags.vi 5 | -------------------------------------------------------------------------------- /.pre-commit: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env sh 2 | 3 | cargo fmt -- --check 4 | code="$?" 5 | 6 | if [ "$code" -ne 0 ]; then 7 | echo "Please run \`cargo fmt' before committing" 8 | exit "$code" 9 | fi 10 | -------------------------------------------------------------------------------- /examples/simple.rs: -------------------------------------------------------------------------------- 1 | use s3_algo::*; 2 | 3 | #[tokio::main] 4 | async fn main() { 5 | const N_FILES: usize = 10; 6 | let files = 7 | (0..N_FILES).map(|i| ObjectSource::data(format!("hey, {}", i), format!("hey{}", i))); 8 | let s3 = S3Algo::new(testing_sdk_client().await); 9 | s3.upload_files( 10 | "test-bucket".into(), 11 | files, 12 | |result| async move { println!("File {}/{} successfully uploaded", result.seq + 1, N_FILES)}, 13 | |client| client.put_object() 14 | ) 15 | .await 16 | .unwrap(); 17 | } 18 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "s3-algo" 3 | description = "High-performance algorithms for batch operations to Amazon S3" 4 | version = "0.7.0" 5 | authors = ["Erlend Langseth <3rlendhl@gmail.com>"] 6 | license = "MIT" 7 | edition = "2018" 8 | 9 | documentation = "https://docs.rs/s3-algo/" 10 | repository = "https://github.com/openanalytics/s3-algo" 11 | 12 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 13 | 14 | [dependencies] 15 | futures = "0.3.17" 16 | futures-stopwatch = "0.3.0" 17 | futures-retry = "0.6.0" 18 | tokio = {version = "1.10", features = ["time", "fs", "macros", "io-util", "sync", "rt-multi-thread"]} 19 | tokio-util = {version = "0.7.0", features = ["codec"]} 20 | bytes = "1.2.1" 21 | serde = {optional = true, version = "1.0.130", features = ["derive"]} 22 | snafu = {version = "0.6.1", features = ["futures"]} 23 | walkdir = "2.2.9" 24 | 25 | aws-sdk-s3 = "1.14.0" 26 | aws-config = "1.1.4" 27 | aws-smithy-http = "0.60.4" 28 | aws-smithy-async = "1.1.4" 29 | aws-smithy-types-convert = {version = "0.60.4", features = ["convert-streams"]} 30 | 31 | [target.'cfg(windows)'.dependencies] 32 | # only needed on windows for substituting \ with / in paths 33 | path-slash = "0.2.1" 34 | 35 | [dev-dependencies] 36 | tempdir = "0.3.7" 37 | multi-default-trait-impl = "0.1.2" 38 | rand = "0.8.5" 39 | clap = "3.0.0" 40 | 41 | [features] 42 | default = ["serde1"] 43 | serde1 = ["serde"] 44 | -------------------------------------------------------------------------------- /src/timeout.rs: -------------------------------------------------------------------------------- 1 | //! 
The `Timeout` trait defines the how the timeout value of a multi-file upload evolves based on 2 | //! past file upload results. A default implementation `TimeoutState` is provided. 3 | use crate::config::*; 4 | use crate::RequestReport; 5 | use std::time::Duration; 6 | pub trait Timeout: Send + 'static { 7 | /// Size is in either bytes or objects, depending on the type of requests. 8 | fn get_timeout(&self, size: usize, retries: usize) -> Duration; 9 | /// Update the internal estimate of the extra timeout per unit of size 10 | fn update(&mut self, _: &RequestReport); 11 | /// get estimated upload speed 12 | fn get_estimate(&self) -> f64; 13 | } 14 | /// State for timeouts, especially tailored toward uploading files. 15 | /// But can be useful in any case where the size of an operation in bytes is known. 16 | pub struct TimeoutState { 17 | seconds_per_unit_estimate: f64, 18 | cfg: AlgorithmConfig, 19 | specific: SpecificTimings, 20 | } 21 | impl TimeoutState { 22 | pub fn new(cfg: AlgorithmConfig, specific: SpecificTimings) -> TimeoutState { 23 | TimeoutState { 24 | seconds_per_unit_estimate: specific.seconds_per_unit, 25 | cfg, 26 | specific, 27 | } 28 | } 29 | } 30 | impl Timeout for TimeoutState { 31 | /// Not used by algorithm 32 | fn get_estimate(&self) -> f64 { 33 | self.seconds_per_unit_estimate 34 | } 35 | fn get_timeout(&self, size: usize, retries: usize) -> Duration { 36 | let backoff = self.cfg.backoff.powi(retries as i32); 37 | let time_estimate = (size as f64) * self.seconds_per_unit_estimate * backoff; 38 | Duration::from_secs_f64( 39 | self.cfg.base_timeout * backoff + self.cfg.timeout_fraction * time_estimate, 40 | ) 41 | } 42 | fn update(&mut self, result: &RequestReport) { 43 | if result.size > self.specific.minimum_units_for_estimation { 44 | let target = result.success_time.as_secs_f64() / (result.size as f64); 45 | self.seconds_per_unit_estimate = self.cfg.avg_power * self.seconds_per_unit_estimate 46 | + (1.0 - self.cfg.avg_power) * target; 47 | } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /examples/perf_data.rs: -------------------------------------------------------------------------------- 1 | use clap::*; 2 | use s3_algo::*; 3 | use std::io::Write; 4 | use std::path::{Path, PathBuf}; 5 | 6 | #[tokio::main] 7 | async fn main() { 8 | let mut app = App::new("Example 'perf_data'") 9 | .before_help("Upload a directory to S3 on localhost.") 10 | .arg( 11 | Arg::with_name("source") 12 | .help("Path to a folder to upload to S3") 13 | .required(true), 14 | ) 15 | .arg( 16 | Arg::with_name("dest_bucket") 17 | .help("Destination bucket") 18 | .required(true), 19 | ) 20 | .arg( 21 | Arg::with_name("dest_prefix") 22 | .help("Destination prefix") 23 | .required(true), 24 | ) 25 | .arg( 26 | Arg::with_name("parallelization") 27 | .short('n') 28 | .takes_value(true) 29 | .help("Maximum number of simultaneous upload requests"), 30 | ); 31 | let matches = app.clone().get_matches(); 32 | 33 | if let (Some(path), Some(bucket), Some(prefix)) = ( 34 | matches.value_of("source"), 35 | matches.value_of("dest_bucket"), 36 | matches.value_of("dest_prefix"), 37 | ) { 38 | let parallelization = value_t_or_exit!(matches.value_of("parallelization"), usize); 39 | benchmark_s3_upload( 40 | Path::new(path).to_path_buf(), 41 | bucket.to_owned(), 42 | prefix.to_owned(), 43 | parallelization, 44 | ) 45 | .await; 46 | println!("Done"); 47 | } else { 48 | app.print_help().unwrap() 49 | } 50 | } 51 | 52 | async fn benchmark_s3_upload( 53 | 
dir_path: PathBuf, 54 | bucket: String, 55 | prefix: String, 56 | copy_parallelization: usize, 57 | ) { 58 | let cfg = Config { 59 | copy_parallelization, 60 | ..Default::default() 61 | }; 62 | let s3 = testing_sdk_client().await; 63 | let algo = S3Algo::with_config(s3, cfg); 64 | 65 | upload_perf_log_init(&mut std::io::stdout()); 66 | let progress = |res| async move { upload_perf_log_update(&mut std::io::stdout(), res) }; 67 | 68 | algo.upload_files( 69 | bucket, 70 | files_recursive(dir_path, PathBuf::from(&prefix)), 71 | progress, 72 | |client| client.put_object(), 73 | ) 74 | .await 75 | .unwrap(); 76 | } 77 | 78 | // Helpers for writing data 79 | macro_rules! write_cell { 80 | ($out:expr, $x:expr) => { 81 | let _ = write!($out, "{0: >18}", format!("{:.5}", $x)); 82 | }; 83 | } 84 | pub fn upload_perf_log_init(out: &mut W) { 85 | let _ = writeln!( 86 | out, 87 | "{0: >w$}{1: >w$}{2: >w$}{3: >w$}{4: >w$}{5: >w$}", 88 | "attempts", 89 | "bytes", 90 | "success_ms", 91 | "total_ms", 92 | "MBps", 93 | "MBps est", 94 | w = 18 95 | ); 96 | } 97 | pub fn upload_perf_log_update(out: &mut W, res: RequestReport) { 98 | // TODO: Write performance data to file with tokio 99 | let megabytes = res.size as f64 / 1_000_000.0; 100 | let speed = megabytes / res.success_time.as_secs_f64(); 101 | write_cell!(out, res.attempts); 102 | write_cell!(out, res.size); 103 | write_cell!(out, res.success_time.as_millis()); 104 | write_cell!(out, res.total_time.as_millis()); 105 | write_cell!(out, speed); 106 | write_cell!(out, res.est); 107 | let _ = writeln!(out); 108 | } 109 | -------------------------------------------------------------------------------- /src/err.rs: -------------------------------------------------------------------------------- 1 | use aws_sdk_s3::error::SdkError; 2 | use aws_sdk_s3::operation::copy_object::CopyObjectError; 3 | use aws_sdk_s3::operation::delete_object::DeleteObjectError; 4 | use aws_sdk_s3::operation::delete_objects::DeleteObjectsError; 5 | use aws_sdk_s3::operation::get_object::GetObjectError; 6 | use aws_sdk_s3::operation::list_objects_v2::ListObjectsV2Error; 7 | use aws_sdk_s3::operation::put_object::PutObjectError; 8 | use aws_sdk_s3::primitives::ByteStreamError; 9 | use snafu::{Backtrace, Snafu}; 10 | use std::io; 11 | 12 | #[derive(Snafu, Debug)] 13 | #[snafu(visibility = "pub")] 14 | pub enum Error { 15 | #[snafu(display("Io error: {}: {}", description, source))] 16 | Io { 17 | source: io::Error, 18 | description: String, 19 | backtrace: Backtrace, 20 | }, 21 | /// Error originating from tokio::Delay 22 | #[snafu(display("Tokio timer error: {}", source))] 23 | Delay { 24 | source: tokio::time::error::Error, 25 | backtrace: Backtrace, 26 | }, 27 | #[snafu(display("S3 operation timed out"))] 28 | Timeout { 29 | source: tokio::time::error::Elapsed, 30 | }, 31 | #[snafu(display("Error listing objects in S3: {:?}", source))] 32 | ListObjectsV2 { 33 | source: SdkError, 34 | }, 35 | #[snafu(display("Error deleting objects in S3: {:?}", source))] 36 | DeleteObjects { 37 | source: SdkError, 38 | }, 39 | DeleteObject { 40 | source: SdkError, 41 | }, 42 | CopyObject { 43 | source: SdkError, 44 | }, 45 | #[snafu(display("GetObject s3://{}/{}: {:#?}", bucket, key, source))] 46 | GetObject { 47 | key: String, 48 | bucket: String, 49 | source: SdkError, 50 | }, 51 | #[snafu(display("IO error: {}", source))] 52 | TokioIo { 53 | source: tokio::io::Error, 54 | }, 55 | AnyError { 56 | source: Box, 57 | }, 58 | 59 | #[snafu(display("Downloading objects: missing key or size property"))] 60 
| MissingKeyOrSize, 61 | #[snafu(display("Downloading objects: missing content_length property"))] 62 | MissingContentLength, 63 | 64 | // AWS SDK Errors 65 | #[snafu(display("S3 'put object' error on key '{}': {}", key, source))] 66 | PutObject { 67 | source: SdkError, 68 | key: String, 69 | backtrace: Backtrace, 70 | }, 71 | 72 | #[snafu(display("Error listing objects in S3: {:?}", source))] 73 | NewListObjectsV2 { 74 | source: SdkError, 75 | }, 76 | 77 | #[snafu(display("Error deleting objects in S3: {:?}", source))] 78 | NewDeleteObjects { 79 | source: SdkError, 80 | }, 81 | NewDeleteObject { 82 | source: SdkError, 83 | }, 84 | NewCopyObject { 85 | source: SdkError, 86 | }, 87 | #[snafu(display("GetObject s3://{}/{}: {:#?}", bucket, key, source))] 88 | NewGetObject { 89 | key: String, 90 | bucket: String, 91 | source: SdkError, 92 | }, 93 | } 94 | 95 | impl From> for Error 96 | where 97 | T: std::error::Error + Send + Sync + 'static, 98 | { 99 | fn from(err: SdkError) -> Self { 100 | Self::AnyError { 101 | source: Box::new(err), 102 | } 103 | } 104 | } 105 | 106 | impl From for Error { 107 | fn from(err: ByteStreamError) -> Self { 108 | Self::AnyError { 109 | source: Box::new(err), 110 | } 111 | } 112 | } 113 | 114 | #[cfg(test)] 115 | mod test { 116 | use super::*; 117 | use snafu::GenerateBacktrace; 118 | #[test] 119 | fn error_traits() { 120 | fn foo(_: T) {} 121 | foo(Error::Io { 122 | source: io::Error::from_raw_os_error(1), 123 | description: "hello".into(), 124 | backtrace: Backtrace::generate(), 125 | }); 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /src/config.rs: -------------------------------------------------------------------------------- 1 | use serde::{Deserialize, Serialize}; 2 | #[derive(Clone, Debug, Serialize, Deserialize)] 3 | #[serde(default)] 4 | #[serde(deny_unknown_fields)] 5 | pub struct Config { 6 | /// Maximum number of simultaneous upload requests 7 | pub copy_parallelization: usize, 8 | 9 | pub algorithm: AlgorithmConfig, 10 | 11 | /// The "unit" of a delete request is number of objects 12 | pub delete_requests: SpecificTimings, 13 | 14 | /// NOTE: For now, `put_request` is used both in S3 `get`, `put` and `copy` operations. 15 | /// Reason: We don't know if it's worth it with different configurations for these operations 16 | /// that all have a duration that depends on the number of bytes of the objects in question. 17 | /// The "unit" for such requests are number of bytes. 
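    ///
    /// A sketch of overriding these timings when building a `Config` (the numbers below are
    /// illustrative assumptions, not recommended values):
    ///
    /// ```no_run
    /// use s3_algo::{Config, SpecificTimings};
    ///
    /// let cfg = Config {
    ///     put_requests: SpecificTimings {
    ///         seconds_per_unit: 1.0 / 10_000_000.0, // assume roughly 10 MB/s
    ///         minimum_units_for_estimation: 100_000, // ignore requests smaller than 100 KB
    ///     },
    ///     ..Default::default()
    /// };
    /// ```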
18 |     pub put_requests: SpecificTimings,
19 | }
20 | 
21 | impl Default for Config {
22 |     fn default() -> Self {
23 |         Self {
24 |             copy_parallelization: 20,
25 |             algorithm: Default::default(),
26 |             delete_requests: SpecificTimings {
27 |                 seconds_per_unit: 0.2,
28 |                 minimum_units_for_estimation: 10,
29 |             },
30 |             put_requests: SpecificTimings {
31 |                 seconds_per_unit: 1.0 / 1_000_000.0, // 1 MB/s = 1e-06 seconds per byte
32 |                 minimum_units_for_estimation: 10,
33 |             },
34 |         }
35 |     }
36 | }
37 | 
38 | #[derive(Clone, Debug, Serialize, Deserialize)]
39 | #[serde(deny_unknown_fields)]
40 | pub struct AlgorithmConfig {
41 |     /// The base timeout which will always be there (an estimate of the RTT)
42 |     pub base_timeout: f64,
43 | 
44 |     /// The timeout is set to this multiple of the expected upload time (> 1.0)
45 |     pub timeout_fraction: f64,
46 | 
47 |     /// On every retry, the timeout is multiplied by backoff (> 1.0)
48 |     pub backoff: f64,
49 | 
50 |     /// Number of times to retry a single request before giving up
51 |     pub n_retries: usize,
52 | 
53 |     /// To estimate the upload speed incrementally, we use an exponential average:
54 |     /// `new_avg_speed = avg_power * new_speed + (1 - avg_power) * avg_speed`.
55 |     ///
56 |     /// Thus, between 0.0 and 1.0, closer to 1.0 means that newer data points have
57 |     /// more significance.
58 |     pub avg_power: f64,
59 | }
60 | impl Default for AlgorithmConfig {
61 |     fn default() -> Self {
62 |         Self {
63 |             base_timeout: 0.5,
64 |             timeout_fraction: 1.5,
65 |             backoff: 1.5,
66 |             n_retries: 8,
67 |             avg_power: 0.7,
68 |         }
69 |     }
70 | }
71 | 
72 | /// These settings are specific to the kind of operation we do - for example delete or put in S3.
73 | #[derive(Clone, Debug, Serialize, Deserialize)]
74 | pub struct SpecificTimings {
75 |     /// The initial estimate of extra timeout per unit (byte or object)
76 |     pub seconds_per_unit: f64,
77 |     /// The number of units in a request below which the request does not affect the estimate
78 |     pub minimum_units_for_estimation: usize,
79 | }
80 | 
81 | impl SpecificTimings {
82 |     /// Sane default setting for when the size is number of bytes
83 |     pub fn default_for_bytes() -> Self {
84 |         Self {
85 |             seconds_per_unit: 1.0 / 1_000_000.0, // 1 MB/s
86 |             minimum_units_for_estimation: 500_000, // 500 KB
87 |         }
88 |     }
89 |     /// Sane default setting for when the size is number of objects
90 |     pub fn default_for_objects() -> Self {
91 |         Self {
92 |             seconds_per_unit: 0.2,
93 |             minimum_units_for_estimation: 2,
94 |         }
95 |     }
96 | }
97 | 
98 | // DRAFT
99 | //
100 | // Now, we don't have "avg_min_bytes". Because... we will just subtract the assumed constant
101 | // anyway.
102 | // What if the assumption is wrong?
103 | // Well, it should be rather small anyway. It is exclusively thought to be the RTT...
104 | // If the subtraction is negative after all...? Then... idk
105 | 
106 | // put_timeout_per_byte..?
107 | // should configure it as an assumed MB/s just like before. expected_upload_speed.
108 | // delete_timeout_per_object..?
109 | // quite straight-forward: seconds per object
110 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # `s3-algo`
2 | 
3 | High-performance algorithms for batch operations in Amazon S3, on top of the [AWS SDK for Rust](https://github.com/awslabs/aws-sdk-rust) (`aws-sdk-s3`).
4 | Reliability and performance are achieved through a configurable timeout/retry/backoff algorithm, for high volumes of requests.
5 | Monitor progress closely with closures that get called for every finished request, for accurate user feedback.
6 | 
7 | 
8 | https://docs.aws.amazon.com/AmazonS3/latest/dev/optimizing-performance-guidelines.html
9 | 
10 | - Upload multiple files with `S3Algo::upload_files`.
11 | - List files with `S3Algo::list_prefix`, and then execute deletion (or download) on all the listed files.
12 | 
13 | This crate is only in its infancy, and we happily welcome PRs, feature requests, and suggestions for improving the API.
14 | 
15 | # Running tests and examples
16 | Both tests and examples require that an S3 service such as `minio` is running locally at port 9000.
17 | Tests assume that a credentials profile named `testing` exists - for example in `~/.aws/credentials`:
18 | 
19 | ```
20 | [testing]
21 | aws_access_key_id = 123456789
22 | aws_secret_access_key = 123456789
23 | ```
24 | 
25 | # Listing, deleting and copying objects
26 | This is all done through the entry point `S3Algo::list_prefix()`, which returns a `ListObjects`
27 | stream that can then delete or download the listed objects.
28 | Example:
29 | 
30 | ```rust
31 | algo.list_prefix("test-bucket".to_string(), Some("some/prefix".to_string()))
32 |     .delete_all(|_| async {}, |_| async {})
33 |     .await
34 |     .unwrap();
35 | ```
36 | 
37 | # Upload
38 | ## Features of the `S3Algo::upload_files` function
39 | * As generic as possible, to support many use cases.
40 | * It is possible to collect detailed data from the upload through a closure - one can choose to use this data to analyze performance, or for example to implement a live progress percentage report.
41 | * Backoff mechanism on failed requests.
42 | * Fast. Several mechanisms are in place, such as [aggressive timeouts](https://docs.aws.amazon.com/AmazonS3/latest/dev/optimizing-performance-guidelines.html), parallelization, and streaming files from the file system while uploading.
43 | 
44 | ## Algorithm details
45 | The documentation for `Config` and `AlgorithmConfig` may help illuminate the components of the algorithm.
46 | The currently most important aspect of the algorithm revolves around deciding timeout values. That is, how long to wait for a request before trying again.
47 | It is important for performance that the timeout is tight enough.
48 | The main mechanism to this end is the estimation of the upload bandwidth through a running exponential average of the upload speed (on success) of individual files.
49 | Additionally, on each successive retry, the timeout increases by some factor (back-off).
50 | 
51 | ## Yet to consider
52 | * Is the algorithm considerate with respect to other processes that want to use the same network? For example in the case of congestion. It does implement increasing back-off intervals after failed requests, but the real effect on a shared network should be tested.
53 | 
54 | 
55 | ## Examples
56 | ### `perf_data`
57 | Command-line interface for uploading any directory to any bucket and prefix in a locally running S3 service (such as `minio`).
58 | Example:
59 | ```
60 | cargo run --example perf_data -- -n 3 ./src test-bucket lala
61 | ```
62 | 
63 | Prints:
64 | ```
65 |           attempts             bytes        success_ms          total_ms              MBps          MBps est
66 |                  1              1990                32                32           0.06042           1.00000
67 |                  1             24943                33                33           0.74043           1.00000
68 |                  1              2383                29                29           0.08211           1.00000
69 |                  1               417                13                13           0.03080           1.00000
70 |                  1              8562                16                16           0.51480           1.00000
71 | ```
72 | `total_ms` is the total time including all retries, and `success_ms` is the time of only the last attempt.
73 | The distinction between these two is useful in real cases where `attempts` is not always `1`.
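In code, the `-n` flag corresponds to `Config::copy_parallelization`. A rough sketch of the same upload done programmatically (the bucket, prefix and path below are just placeholders mirroring the command above):

```rust
use s3_algo::*;

#[tokio::main]
async fn main() {
    // At most 3 uploads in flight at once, like `-n 3` above.
    let cfg = Config {
        copy_parallelization: 3,
        ..Default::default()
    };
    let algo = S3Algo::with_config(testing_sdk_client().await, cfg);
    algo.upload_files(
        "test-bucket".into(),
        files_recursive("./src".into(), "lala".into()),
        |_| async {},
        |client| client.put_object(),
    )
    .await
    .unwrap();
}
```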
74 | 
75 | You can then verify that the upload happened by entering the container. Something like:
76 | 
77 | ```
78 | $ docker exec -it $(docker ps --filter "ancestor=minio" --format "{{.Names}}") bash
79 | [user@144aff4dae5b ~]$ ls s3/
80 | test-bucket/
81 | [user@144aff4dae5b ~]$ ls s3/test-bucket/
82 | lala
83 | ```
84 | 
85 | 
86 | 
--------------------------------------------------------------------------------
/src/upload.rs:
--------------------------------------------------------------------------------
1 | use super::*;
2 | use aws_sdk_s3::operation::put_object::builders::PutObjectFluentBuilder;
3 | use aws_sdk_s3::primitives::ByteStream;
4 | 
5 | impl S3Algo {
6 |     /// Upload multiple files to S3.
7 |     ///
8 |     /// `upload_files` provides counting of uploaded files and bytes through the `progress` closure.
9 |     ///
10 |     /// For common use cases it is advised to use [`files_recursive`](files_recursive) for the `files` parameter.
11 |     ///
12 |     /// `progress` will be called after the upload of each file, with a [`RequestReport`](struct.RequestReport.html)
13 |     /// describing that upload: its `seq` field is the sequence number of the file within the whole
14 |     /// upload, and it also holds data such as the size in bytes and the duration of the upload. It is thus
15 |     /// possible to report progress either in number of files or in number of bytes, depending on what
16 |     /// granularity is desired.
17 |     /// `progress` returns a generic `F: Future` to support async operations such as, for example, logging the
18 |     /// results to a file; this future will be run as part of the upload algorithm.
19 |     ///
20 |     /// `default_request` constructs the default request builder - only the fields `bucket`, `key`,
21 |     /// `body` and `content_length` are overwritten by the upload algorithm.
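    ///
    /// A minimal usage sketch, mirroring `examples/simple.rs` (the bucket name and keys are
    /// placeholders, and `testing_sdk_client` assumes the local test setup described in the README):
    ///
    /// ```no_run
    /// # async fn doc() -> Result<(), s3_algo::Error> {
    /// use s3_algo::{testing_sdk_client, ObjectSource, S3Algo};
    /// let algo = S3Algo::new(testing_sdk_client().await);
    /// let files =
    ///     (0..10).map(|i| ObjectSource::data(format!("contents {}", i), format!("prefix/{}", i)));
    /// algo.upload_files(
    ///     "test-bucket".into(),
    ///     files,
    ///     |report| async move { println!("uploaded {} bytes", report.size) },
    ///     |client| client.put_object(),
    /// )
    /// .await
    /// # }
    /// ```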
22 | pub async fn upload_files( 23 | &self, 24 | bucket: String, 25 | files: I, 26 | progress: P, 27 | default_request: R, 28 | ) -> Result<(), Error> 29 | where 30 | P: Fn(RequestReport) -> F + Clone + Send + Sync + 'static, 31 | F: Future + Send + 'static, 32 | I: Iterator + Send + 'static, 33 | R: Fn(&Client) -> PutObjectFluentBuilder + Clone + Unpin + Sync + Send + 'static, 34 | { 35 | let copy_parallelization = self.config.copy_parallelization; 36 | let n_retries = self.config.algorithm.n_retries; 37 | 38 | let timeout_state = Arc::new(Mutex::new(TimeoutState::new( 39 | self.config.algorithm.clone(), 40 | self.config.put_requests.clone(), 41 | ))); 42 | let timeout_state2 = timeout_state.clone(); 43 | 44 | let jobs = files.map(move |src| { 45 | let (default, bucket, s3) = (default_request.clone(), bucket.clone(), self.s3.clone()); 46 | s3_request( 47 | move || { 48 | src.clone() 49 | .create_upload_future(s3.clone(), bucket.clone(), default.clone()) 50 | }, 51 | |_, size| size, 52 | n_retries, 53 | timeout_state.clone(), 54 | ) 55 | .boxed() 56 | }); 57 | 58 | // Run jobs in parallel, 59 | // adding eventual delays after each file upload and also at the end, 60 | // and counting the progress 61 | stream::iter(jobs) 62 | .buffer_unordered(copy_parallelization) 63 | .zip(stream::iter(0..)) 64 | .map(|(result, i)| result.map(|result| (i, result))) 65 | .try_for_each(move |(i, (mut result, _))| { 66 | let progress = progress.clone(); 67 | let timeout_state = timeout_state2.clone(); 68 | async move { 69 | result.seq = i; 70 | timeout_state.lock().await.update(&result); 71 | progress(result).map(Ok).await 72 | } 73 | }) 74 | .await 75 | } 76 | } 77 | 78 | #[derive(Clone, Debug)] 79 | pub enum ObjectSource { 80 | File { path: PathBuf, key: String }, 81 | Data { data: Vec, key: String }, 82 | } 83 | impl ObjectSource { 84 | pub fn file(path: PathBuf, key: String) -> Self { 85 | Self::File { path, key } 86 | } 87 | pub fn data>>(data: D, key: String) -> Self { 88 | Self::Data { 89 | data: data.into(), 90 | key, 91 | } 92 | } 93 | pub async fn create_stream(&self) -> Result<(ByteStream, usize), Error> { 94 | match self { 95 | Self::File { path, .. } => { 96 | let file = tokio::fs::File::open(path.clone()).await.with_context({ 97 | let path = path.clone(); 98 | move || err::Io { 99 | description: path.display().to_string(), 100 | } 101 | })?; 102 | let metadata = file.metadata().await.with_context({ 103 | let path = path.clone(); 104 | move || err::Io { 105 | description: path.display().to_string(), 106 | } 107 | })?; 108 | 109 | let len = metadata.len() as usize; 110 | // let boxbody = BoxBody::new( 111 | // FramedRead::new(file, BytesCodec::new()).map_ok(bytes::BytesMut::freeze), 112 | // ); 113 | // let sdk_body = SdkBody::from_dyn(boxbody); 114 | 115 | Ok((ByteStream::read_from().file(file).build().await?, len)) 116 | } 117 | Self::Data { data, .. 
} => Ok((data.clone().into(), data.len())), 118 | } 119 | } 120 | pub async fn create_upload_future( 121 | self, 122 | s3: aws_sdk_s3::Client, 123 | bucket: String, 124 | default: R, 125 | ) -> Result<(impl Future>, usize), Error> 126 | where 127 | R: Fn(&Client) -> PutObjectFluentBuilder + Clone + Unpin + Sync + Send + 'static, 128 | { 129 | let (stream, len) = self.create_stream().await?; 130 | let key = self.get_key().to_owned(); 131 | let (s3, bucket, default) = (s3.clone(), bucket.clone(), default.clone()); 132 | let future = async move { 133 | default(&s3) 134 | .set_bucket(Some(bucket.clone())) 135 | .set_key(Some(key.clone())) 136 | .set_body(Some(stream)) 137 | .set_content_length(Some(len as i64)) 138 | .send() 139 | .await 140 | .map_err(|e| e.into()) 141 | // .await 142 | .map(drop) 143 | }; 144 | Ok((future, len)) 145 | } 146 | pub fn get_key(&self) -> &str { 147 | match self { 148 | Self::File { key, .. } => key, 149 | Self::Data { key, .. } => key, 150 | } 151 | } 152 | } 153 | 154 | /// Convenience function (using `walkdir`) to traverse all files in directory `src_dir`. Returns an 155 | /// iterator that can be used as input to `S3Algo::upload_files`, which uploads files 156 | /// with a key equal to the file's path with `src_dir` stripped away, and with `key_prefix` 157 | /// prepended. 158 | pub fn files_recursive( 159 | src_dir: PathBuf, 160 | key_prefix: PathBuf, 161 | ) -> impl Iterator { 162 | #[cfg(windows)] 163 | use path_slash::PathExt; 164 | walkdir::WalkDir::new(&src_dir) 165 | .into_iter() 166 | .filter_map(move |entry| { 167 | let src_dir = src_dir.clone(); 168 | let key_prefix = key_prefix.clone(); 169 | entry.ok().and_then(move |entry| { 170 | if entry.file_type().is_file() { 171 | let path = entry.path().to_owned(); 172 | let key_suffix = path.strip_prefix(&src_dir).unwrap().to_path_buf(); 173 | let key = key_prefix.join(&key_suffix); 174 | Some(ObjectSource::File { 175 | path, 176 | #[cfg(unix)] 177 | key: key.to_string_lossy().to_string(), 178 | #[cfg(windows)] 179 | key: key.to_slash_lossy().to_string(), 180 | }) 181 | } else { 182 | None 183 | } 184 | }) 185 | }) 186 | } 187 | 188 | #[cfg(test)] 189 | mod test { 190 | use super::*; 191 | use tempdir::TempDir; 192 | #[test] 193 | fn test_files_recursive() { 194 | let tmp_dir = TempDir::new("s3-testing").unwrap(); 195 | let dir = tmp_dir.path(); 196 | for i in 0..10 { 197 | std::fs::write(dir.join(format!("img_{}.tif", i)), "file contents").unwrap(); 198 | } 199 | let files = files_recursive(dir.to_owned(), PathBuf::new()); 200 | assert_eq!(files.count(), 10); 201 | } 202 | } 203 | -------------------------------------------------------------------------------- /src/test.rs: -------------------------------------------------------------------------------- 1 | use crate::*; 2 | use rand::Rng; 3 | use std::path::Path; 4 | use std::sync::Arc; 5 | use tempdir::TempDir; 6 | use timeout::TimeoutState; 7 | use tokio::io::AsyncReadExt; 8 | use tokio::sync::Mutex; 9 | 10 | /* 11 | /// Timeout implementation used for testing 12 | struct TimeoutState; 13 | impl Timeout for TimeoutState { 14 | fn get_timeout(&self, _bytes: usize, _attempts: usize) -> Duration { 15 | Duration::from_secs(4) 16 | } 17 | fn update(&mut self, _: &RequestReport) {} 18 | fn get_estimate(&self) -> f64 { 19 | 0.0 20 | } 21 | } 22 | */ 23 | 24 | pub(crate) fn rand_string(n: usize) -> String { 25 | rand::thread_rng() 26 | .sample_iter(&rand::distributions::Alphanumeric) 27 | .take(n) 28 | .map(|x| x as char) 29 | .collect::() 30 | } 31 | 32 | 
#[test] 33 | fn everything_is_sync_and_static() { 34 | // This is only to test that it compiles 35 | fn verify(_: F) 36 | where 37 | F: Future + Send + 'static, 38 | { 39 | } 40 | 41 | verify(s3_request( 42 | || async move { Ok((async move { Ok(()) }, 0)) }, 43 | |_, size| size, 44 | 5, 45 | Arc::new(Mutex::new(TimeoutState::new( 46 | AlgorithmConfig::default(), 47 | SpecificTimings::default_for_bytes(), 48 | ))), 49 | )) 50 | } 51 | 52 | #[tokio::test] 53 | async fn test_s3_upload_files() { 54 | const N_FILES: usize = 100; 55 | let tmp_dir = TempDir::new("s3-testing").unwrap(); 56 | 57 | let s3 = testing_sdk_client().await; 58 | let algo = S3Algo::new(s3.clone()); 59 | let dir_key = upload_test_files(algo.clone(), tmp_dir.path(), N_FILES) 60 | .await 61 | .unwrap(); 62 | 63 | // Check that all files are there 64 | for i in 0..N_FILES { 65 | // let key = format!("{}/img_{}.tif", dir_key, i); 66 | let key = dir_key.join(format!("img_{}.tif", i)); 67 | 68 | let response = s3 69 | .get_object() 70 | .bucket("test-bucket".to_string()) 71 | .key(key.to_str().unwrap().to_string()) 72 | .send() 73 | .await 74 | .unwrap(); 75 | 76 | let mut body = response.body.into_async_read(); 77 | let mut content = Vec::new(); 78 | body.read_to_end(&mut content).await.unwrap(); 79 | let content = std::str::from_utf8(&content).unwrap(); 80 | assert_eq!(content, "file contents"); 81 | } 82 | } 83 | 84 | #[tokio::test] 85 | async fn test_s3_timeouts() { 86 | // TODO finish test 87 | // Currently just prints things to inspect how timeout behaves 88 | 89 | let bytes: Vec = vec![500_000, 999_999, 1_000_001, 2_000_000]; 90 | // Test that timeout on successive errors follows a desired curve 91 | 92 | // These are all parameters related to timeout, shown explicitly 93 | let cfg = Config { 94 | algorithm: AlgorithmConfig { 95 | backoff: 1.5, 96 | base_timeout: 0.5, 97 | timeout_fraction: 1.5, 98 | avg_power: 0.7, 99 | ..Default::default() 100 | }, 101 | ..Default::default() 102 | }; 103 | 104 | for bytes in bytes { 105 | println!("# Bytes = {}", bytes); 106 | let timeout = TimeoutState::new(cfg.algorithm.clone(), cfg.put_requests.clone()); 107 | 108 | let timeouts = (1..=10) 109 | .map(|retries| timeout.get_timeout(bytes, retries)) 110 | .collect::>(); 111 | println!("{:?}", timeouts); 112 | } 113 | } 114 | 115 | /// Returns the common prefix of all files in S3 116 | async fn upload_test_files(s3: S3Algo, parent: &Path, n_files: usize) -> Result { 117 | let dir_key = Path::new(&rand_string(4)) 118 | .join(rand_string(4)) 119 | .join(rand_string(4)); 120 | let dir = parent.join(&dir_key); 121 | std::fs::create_dir_all(&dir).unwrap(); 122 | for i in 0..n_files { 123 | std::fs::write(dir.join(format!("img_{}.tif", i)), "file contents").unwrap(); 124 | } 125 | 126 | println!("Upload {} to {:?} ", dir.display(), dir_key); 127 | s3.upload_files( 128 | "test-bucket".into(), 129 | files_recursive(dir.clone(), dir.strip_prefix(parent).unwrap().to_owned()), 130 | |_| async move {}, 131 | |client| client.put_object(), 132 | ) 133 | .await?; 134 | Ok(dir_key) 135 | } 136 | 137 | // TODO uncomment after rewriting move_all function ETC 138 | /* 139 | #[tokio::test] 140 | async fn test_move_files() { 141 | const N_FILES: usize = 100; 142 | let s3 = testing_sdk_client().await; 143 | let algo = S3Algo::new(s3.clone()); 144 | let tmp_dir = TempDir::new("s3-testing").unwrap(); 145 | let prefix = upload_test_files(algo.clone(), tmp_dir.path(), N_FILES) 146 | .await 147 | .unwrap(); 148 | let new_prefix = PathBuf::from("haha/lala"); 149 | 
println!( 150 | "Move prefix {} to {}", 151 | prefix.display(), 152 | new_prefix.display() 153 | ); 154 | 155 | // TODO try also the following more manual way of doing the same 156 | /* 157 | algo.list_prefix("test-bucket".into(), format!("{}", prefix.display())) 158 | .boxed() // hope we can remove boxed() soon (it's for reducing type size) 159 | .move_all( 160 | move |key| { 161 | let key = PathBuf::from(key); 162 | let name = key.file_name().unwrap(); 163 | format!("{}/{}", new_prefix2.display(), name.to_str().unwrap()) 164 | }, 165 | None, 166 | ) 167 | .await 168 | .unwrap(); 169 | */ 170 | algo.list_prefix("test-bucket".into(), prefix.to_str().map(|x| x.to_owned())) 171 | .boxed() // hope we can remove boxed() soon (it's for reducing type size) 172 | .move_to_prefix( 173 | None, 174 | new_prefix.to_str().unwrap().to_owned(), 175 | Default::default, 176 | ) 177 | .boxed() 178 | .await 179 | .unwrap(); 180 | 181 | // Check that all files are under `new_prefix` and not under `prefix` 182 | for i in 0..N_FILES { 183 | let key = new_prefix.join(format!("img_{}.tif", i)); 184 | let response = s3.get_object(GetObjectRequest { 185 | bucket: "test-bucket".to_string(), 186 | key: key.to_str().unwrap().to_string(), 187 | ..Default::default() 188 | }); 189 | let _ = response.await.unwrap(); 190 | 191 | let key = prefix.join(format!("img_{}.tif", i)); 192 | let response = s3.get_object(GetObjectRequest { 193 | bucket: "test-bucket".to_string(), 194 | key: key.to_str().unwrap().to_string(), 195 | ..Default::default() 196 | }); 197 | let _ = response.await.unwrap_err(); 198 | } 199 | } 200 | */ 201 | 202 | // TODO: uncomment after rewriting copy_all function 203 | /* 204 | #[tokio::test] 205 | async fn test_copy_files() { 206 | const N_FILES: usize = 100; 207 | let s3 = testing_s3_client(); 208 | let algo = S3Algo::new(s3.clone()); 209 | let tmp_dir = TempDir::new("s3-testing").unwrap(); 210 | let prefix = upload_test_files(algo.clone(), tmp_dir.path(), N_FILES) 211 | .await 212 | .unwrap(); 213 | 214 | let n = Arc::new(std::sync::Mutex::new(0_usize)); 215 | let m = n.clone(); 216 | algo.list_prefix("test-bucket".into(), prefix.to_str().unwrap().to_owned()) 217 | .boxed() // hope we can remove boxed() soon (it's for reducing type size) 218 | .copy_all( 219 | Some("test-bucket2".into()), 220 | move |key| { 221 | *m.lock().unwrap() += 1; 222 | format!("test_copy_files/{}", key) 223 | }, 224 | Default::default, 225 | ) 226 | .boxed() 227 | .await 228 | .unwrap(); 229 | assert_eq!(*n.lock().unwrap(), N_FILES); 230 | 231 | // Check that all objects are present in both buckets 232 | for i in 0..N_FILES { 233 | let key = format!("test_copy_files/{}/img_{}.tif", prefix.display(), i); 234 | let response = s3.get_object(GetObjectRequest { 235 | bucket: "test-bucket2".to_string(), 236 | key, 237 | ..Default::default() 238 | }); 239 | let _ = response.await.unwrap(); 240 | 241 | let key = prefix.join(format!("img_{}.tif", i)); 242 | let response = s3.get_object(GetObjectRequest { 243 | bucket: "test-bucket".to_string(), 244 | key: key.to_str().unwrap().to_string(), 245 | ..Default::default() 246 | }); 247 | let _ = response.await.unwrap(); 248 | } 249 | } 250 | */ 251 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | //! # S3 high-performance algorithms 2 | //! High-performance algorithms for batch operations in Amazon S3. 3 | //! 4 | //! 
https://docs.aws.amazon.com/AmazonS3/latest/dev/optimizing-performance-guidelines.html 5 | //! 6 | //! - Upload multiple files with `S3Algo::upload_files`. 7 | //! - List files with `S3Algo::s3_list_objects` or `S3Algo::s3_list_prefix`, 8 | //! and then execute deletion or copy on all the files. 9 | 10 | use crate::timeout::*; 11 | use aws_config::default_provider::credentials::DefaultCredentialsChain; 12 | use aws_config::meta::region::RegionProviderChain; 13 | use aws_config::BehaviorVersion; 14 | use aws_sdk_s3::config::retry::RetryConfig; 15 | use aws_sdk_s3::Client; 16 | use futures::future::{Future, TryFutureExt}; 17 | use futures::prelude::*; 18 | use futures::stream; 19 | use futures_retry::{FutureRetry, RetryPolicy}; 20 | use futures_stopwatch::try_stopwatch; 21 | use snafu::futures::TryFutureExt as S; 22 | use snafu::ResultExt; 23 | use std::marker::Unpin; 24 | use std::path::PathBuf; 25 | use std::sync::Arc; 26 | use std::time::Duration; 27 | use tokio::sync::Mutex; 28 | 29 | mod config; 30 | pub mod err; 31 | mod list_actions; 32 | mod upload; 33 | 34 | pub use list_actions::*; 35 | pub use upload::*; 36 | pub mod timeout; 37 | pub use config::*; 38 | pub use err::Error; 39 | 40 | #[cfg(test)] 41 | mod test; 42 | 43 | #[derive(Clone)] 44 | pub struct S3Algo { 45 | s3: Client, 46 | config: Config, 47 | } 48 | impl S3Algo { 49 | pub fn new(s3: Client) -> Self { 50 | Self { 51 | s3, 52 | config: Config::default(), 53 | } 54 | } 55 | pub fn with_config(s3: Client, config: Config) -> Self { 56 | Self { s3, config } 57 | } 58 | } 59 | 60 | /// Result of a single S3 request. 61 | #[derive(Debug, Clone, Copy)] 62 | pub struct RequestReport { 63 | /// The number of this request in a series of multiple requests (0 if not applicable) 64 | pub seq: usize, 65 | /// Size of request - in bytes or in number of objects, depending on the type of request. 66 | pub size: usize, 67 | /// The total time including all retries 68 | pub total_time: Duration, 69 | /// The time of the successful request 70 | pub success_time: Duration, 71 | /// Number of attempts. A value of `1` means no retries - success on first attempt. 72 | pub attempts: usize, 73 | /// Estimated sec/unit that was used in this request. Useful for 74 | /// debugging the upload algorithm and not much more. 75 | pub est: f64, 76 | } 77 | 78 | /// Issue a single S3 request, with retries and appropriate timeouts using sane defaults. 79 | /// Basically an easier, less general version of `s3_request`. 80 | /// 81 | /// `extra_initial_timeout`: initial timeout of request (will increase with backoff) added to 82 | /// `cfg.base_timeout`. It can be set to 0 if the S3 operation is a small one, but if the operation 83 | /// size depends on for example a byte count or object count, set it to something that depends on 84 | /// that. 85 | pub async fn s3_single_request( 86 | future_factory: F, 87 | extra_initial_timeout_s: f64, 88 | ) -> Result<(RequestReport, R), Error> 89 | where 90 | F: Fn() -> G + Unpin + Clone + Send + Sync + 'static, 91 | G: Future> + Send, 92 | { 93 | // Configure a one-time Timeout that gives the desired initial_timeout_s on first try. 
94 |     // We tell `s3_request` that the request is of size `1`.
95 | 
96 |     let timeout = TimeoutState::new(
97 |         AlgorithmConfig::default(),
98 |         SpecificTimings {
99 |             seconds_per_unit: extra_initial_timeout_s,
100 |             minimum_units_for_estimation: 0, // doesn't matter
101 |         },
102 |     );
103 | 
104 |     s3_request(
105 |         move || {
106 |             let factory = future_factory.clone();
107 |             async move { Ok((factory(), 1)) }
108 |         },
109 |         |_, size| size,
110 |         10,
111 |         Arc::new(Mutex::new(timeout)),
112 |     )
113 |     .await
114 | }
115 | 
116 | /// Every request to S3 should be issued with `s3_request`, which puts the appropriate timeouts on
117 | /// the request, retries it, and times it.
118 | ///
119 | /// `future_factory` is a bit funky, being a closure that returns a future that resolves to another
120 | /// future. We need the closure F to run the request multiple times. Its return type G is a future
121 | /// because it might, for example, need to open a file asynchronously, which is then used in H to
122 | /// stream from the file.
123 | /// This is needed so that we can get e.g. the length of the file before streaming to S3.
124 | ///
125 | /// `get_size(response, expected)`: get the real size of the request. For some types of requests
126 | /// (e.g. DeleteObjects/PutObject), we know the size upfront, so real size = expected.
127 | /// For others (ListObjectsV2), we need the result of the action to know the size.
128 | /// The size returned from this function is only used to construct the `RequestReport`, which in
129 | /// turn is only useful for eventual progress closures. So the `get_size` parameter exists only to
130 | /// support progress monitoring.
131 | ///
132 | /// The "expected" size returned by `future_factory`, on the other hand, is needed to calculate the
133 | /// timeout.
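///
/// A sketch of the closure shapes, adapted from the `everything_is_sync_and_static` test in
/// `src/test.rs` (marked `ignore` because this function is crate-private and cannot be called from
/// doctests):
///
/// ```ignore
/// let (report, response) = s3_request(
///     // future_factory: open any resources, then yield (request future, expected size)
///     || async move { Ok((async move { Ok(()) }, 0)) },
///     // get_size: here the expected size is already the real size
///     |_response, expected| expected,
///     5, // n_retries
///     Arc::new(Mutex::new(TimeoutState::new(
///         AlgorithmConfig::default(),
///         SpecificTimings::default_for_bytes(),
///     ))),
/// )
/// .await?;
/// ```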
134 | pub(crate) async fn s3_request( 135 | future_factory: F, 136 | get_size: S, 137 | n_retries: usize, 138 | timeout: Arc>, 139 | ) -> Result<(RequestReport, R), Error> 140 | where 141 | F: Fn() -> G + Unpin + Clone + Send + Sync + 'static, 142 | G: Future> + Send, 143 | H: Future> + Send, 144 | S: Fn(&R, usize) -> usize + Unpin + Clone + Send + Sync + 'static, 145 | T: timeout::Timeout, 146 | { 147 | let mut attempts1 = 0; 148 | let mut attempts2 = 0; 149 | try_stopwatch( 150 | // Time the entire file upload (across all retries) 151 | FutureRetry::new( 152 | // Future factory - creates a future that reads file while uploading it 153 | move || { 154 | let (future_factory, timeout, get_size) = 155 | (future_factory.clone(), timeout.clone(), get_size.clone()); 156 | 157 | async move { 158 | attempts1 += 1; 159 | let (request, expected_size) = future_factory().await?; 160 | let (est, timeout_value) = { 161 | let t = timeout.lock().await; 162 | (t.get_estimate(), t.get_timeout(expected_size, attempts1)) 163 | }; 164 | try_stopwatch( 165 | tokio::time::timeout(timeout_value, request) 166 | .with_context(|| err::Timeout {}) 167 | .map(|result| result.and_then(|x| x)), // flatten the Result, timeout err> 168 | ) 169 | .map_ok(move |(response, success_time)| { 170 | let real_size = get_size(&response, expected_size); 171 | (response, success_time, real_size, est) 172 | }) 173 | .await 174 | } 175 | }, 176 | // retry function 177 | { 178 | move |e| { 179 | attempts2 += 1; 180 | if attempts2 > n_retries { 181 | RetryPolicy::ForwardError(e) 182 | } else { 183 | RetryPolicy::WaitRetry(Duration::from_millis(200)) // TODO adjust the time, maybe depending on retries 184 | } 185 | } 186 | }, 187 | ), 188 | ) 189 | .await 190 | .map( 191 | move |(((response, success_time, size, est), attempts), total_time)| { 192 | ( 193 | RequestReport { 194 | seq: 0, 195 | size, 196 | total_time, 197 | success_time, 198 | attempts, 199 | est, 200 | }, 201 | response, 202 | ) 203 | }, 204 | ) 205 | .map_err(|(err, _attempts)| err) 206 | } 207 | 208 | pub async fn retriable_s3_client() -> Client { 209 | let retry_config = RetryConfig::standard() 210 | .with_max_attempts(3) 211 | .with_initial_backoff(Duration::from_secs(10)); 212 | 213 | let region_provider = RegionProviderChain::default_provider(); 214 | let sdk_config = aws_config::defaults(BehaviorVersion::v2024_03_28()) 215 | .region(region_provider) 216 | .load() 217 | .await; 218 | 219 | let mut s3_config_builder = aws_sdk_s3::config::Builder::from(&sdk_config); 220 | s3_config_builder.set_retry_config(Some(retry_config)); 221 | 222 | aws_sdk_s3::Client::from_conf(s3_config_builder.build()) 223 | } 224 | 225 | pub async fn testing_sdk_client() -> Client { 226 | let retry_config = RetryConfig::standard() 227 | .with_max_attempts(3) 228 | .with_initial_backoff(Duration::from_secs(10)); 229 | 230 | let credentials_provider = DefaultCredentialsChain::builder() 231 | .profile_name("testing") 232 | .build() 233 | .await; 234 | let region_provider = RegionProviderChain::first_try("EuWest1"); 235 | let sdk_config = aws_config::defaults(BehaviorVersion::v2024_03_28()) 236 | .region(region_provider) 237 | .endpoint_url("http://localhost:9000") 238 | .credentials_provider(credentials_provider) 239 | .load() 240 | .await; 241 | 242 | let mut s3_config_builder = aws_sdk_s3::config::Builder::from(&sdk_config); 243 | s3_config_builder.set_retry_config(Some(retry_config)); 244 | s3_config_builder.set_force_path_style(Some(true)); 245 | 246 | 
Client::from_conf(s3_config_builder.build()) 247 | } 248 | -------------------------------------------------------------------------------- /src/list_actions.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | use aws_sdk_s3::operation::list_objects_v2::ListObjectsV2Output; 3 | use aws_sdk_s3::primitives::ByteStream; 4 | use aws_sdk_s3::types::{Delete, Object, ObjectIdentifier}; 5 | use aws_smithy_types_convert::stream::PaginationStreamExt; 6 | use futures::future::ok; 7 | use futures::stream::Stream; 8 | use std::future::Future; 9 | use std::pin::Pin; 10 | use std::task::{Context, Poll}; 11 | use tokio::io; 12 | 13 | /// A stream that can list objects, and (using member functions) delete or copy listed files. 14 | pub struct ListObjects { 15 | s3: Client, 16 | config: Config, 17 | bucket: String, 18 | /// Common prefix (as requested) of the listed objects. Empty string if all objects were 19 | /// requestd. 20 | prefix: String, 21 | stream: S, 22 | } 23 | impl ListObjects 24 | where 25 | S: Stream> + Sized + Send + 'static, 26 | { 27 | pub fn boxed( 28 | self, 29 | ) -> ListObjects> + Send>>> { 30 | ListObjects { 31 | s3: self.s3, 32 | config: self.config, 33 | bucket: self.bucket, 34 | stream: self.stream.boxed(), 35 | prefix: self.prefix, 36 | } 37 | } 38 | 39 | /// Calls an async closure on all the individual objects of the list operation 40 | pub async fn process(self, f: P) -> Result<(), Error> 41 | where 42 | P: Fn(Object) -> F + Clone, 43 | F: Future, 44 | { 45 | let ListObjects { 46 | stream, prefix: _, .. 47 | } = self; 48 | stream 49 | .try_filter_map(|response| ok(response.contents)) 50 | .map_ok(|x| stream::iter(x).map(Ok)) 51 | .try_flatten() 52 | .try_for_each_concurrent(None, move |object| { 53 | let f = f.clone(); 54 | async move { 55 | f(object).await; 56 | Ok(()) 57 | } 58 | }) 59 | .await 60 | } 61 | /// Download all listed objects - returns a stream of the contents. 62 | /// Used as a basis for other `download_all_*` functions. 63 | pub fn download_all_stream( 64 | self, 65 | ) -> impl Stream), Error>> { 66 | let ListObjects { 67 | s3, 68 | config: _, 69 | bucket, 70 | stream, 71 | prefix: _, 72 | } = self; 73 | stream 74 | .try_filter_map(|response| ok(response.contents)) 75 | .map_ok(|x| stream::iter(x).map(Ok)) 76 | .try_flatten() 77 | .map(|result| { 78 | result.and_then(|obj| { 79 | let Object { key, size, .. } = obj; 80 | if let Some(key) = key { 81 | Ok((key, size)) 82 | } else { 83 | Err(Error::MissingKeyOrSize) 84 | } 85 | }) 86 | }) 87 | .and_then(move |(key, _)| { 88 | let (s3, bucket) = (s3.clone(), bucket.clone()); 89 | 90 | async move { 91 | let output = s3 92 | .get_object() 93 | .bucket(bucket.clone()) 94 | .key(key.clone()) 95 | .send() 96 | .await 97 | .context(err::GetObject { 98 | key: key.clone(), 99 | bucket, 100 | })?; 101 | Ok((key, output.body, output.content_length)) 102 | } 103 | }) 104 | } 105 | 106 | pub fn download_all_to_vec(self) -> impl Stream), Error>> { 107 | self.download_all_stream() 108 | .and_then(|(key, body, _)| async move { 109 | let mut contents = vec![]; 110 | io::copy(&mut body.into_async_read(), &mut contents) 111 | .await 112 | .context(err::TokioIo)?; 113 | Ok((key, contents)) 114 | }) 115 | } 116 | 117 | /* 118 | /// Download all listed objects to file system. 119 | /// UNIMPLEMENTED. 120 | pub fn download_all(self) -> impl Future> { 121 | // TODO use download_all_stream 122 | ok(unimplemented!()) 123 | } 124 | */ 125 | 126 | /// Delete all listed objects. 
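    ///
    /// A minimal sketch (the bucket and prefix are placeholders; the closures may simply do nothing):
    ///
    /// ```no_run
    /// # async fn doc(algo: s3_algo::S3Algo) -> Result<(), s3_algo::Error> {
    /// algo.list_prefix("test-bucket".into(), Some("some/prefix".into()))
    ///     .delete_all(
    ///         |n_listed| async move { println!("listed {} objects", n_listed) },
    ///         |report| async move { println!("deleted {} objects", report.size) },
    ///     )
    ///     .await
    /// # }
    /// ```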
127 | /// 128 | /// With the two arguments, you can implement a detailed real-time progress report of both how 129 | /// many files have been listed, and how many files have been deleted. 130 | /// 131 | /// `list_progress`: Closure that is given number of files listed as argument. Is called 132 | /// several times, one for each batch of files listed. 133 | /// `delete_progress`: Closure that is given RequestReport of a delete request. The `size` 134 | /// field refers to the number of fields deleted. 135 | /// 136 | pub fn delete_all( 137 | self, 138 | list_progress: P1, 139 | delete_progress: P2, 140 | ) -> impl Future> 141 | where 142 | P1: Fn(usize) -> F1 + Clone + Send + Sync + 'static, 143 | P2: Fn(RequestReport) -> F2 + Clone + Send + Sync + 'static, 144 | F1: Future + Send + 'static, 145 | F2: Future + Send + 'static, 146 | { 147 | // For each ListObjectsV2Output, send a request to delete all the listed objects 148 | let ListObjects { 149 | s3, 150 | config, 151 | bucket, 152 | stream, 153 | prefix: _, 154 | } = self; 155 | let timeout = Arc::new(Mutex::new(TimeoutState::new( 156 | config.algorithm.clone(), 157 | config.delete_requests.clone(), 158 | ))); 159 | let n_retries = config.algorithm.n_retries; 160 | stream.try_for_each_concurrent(None, move |object| { 161 | let (s3, bucket, timeout, delete_progress2, list_progress2) = ( 162 | s3.clone(), 163 | bucket.clone(), 164 | timeout.clone(), 165 | delete_progress.clone(), 166 | list_progress.clone(), 167 | ); 168 | let objects = object 169 | .contents 170 | .unwrap_or_default() // unwrap or empty Vec 171 | .iter() 172 | .filter_map(|obj| { 173 | obj.key.as_ref().map(|key| { 174 | ObjectIdentifier::builder() 175 | .set_key(Some(key.clone())) 176 | .set_version_id(None) 177 | .build() 178 | .unwrap() // unwrap: shouldn't fail building as the key comes directly 179 | // from S3 180 | }) 181 | }) 182 | .collect::>(); 183 | let n_objects = objects.len(); 184 | 185 | async move { 186 | list_progress2(n_objects).await; 187 | let (report, _) = s3_request( 188 | move || { 189 | let (s3, bucket, objects) = (s3.clone(), bucket.clone(), objects.clone()); 190 | async move { 191 | let (s3, bucket, objects) = 192 | (s3.clone(), bucket.clone(), objects.clone()); 193 | Ok(( 194 | async move { 195 | s3.delete_objects() 196 | .set_bucket(Some(bucket)) 197 | .set_delete(Some( 198 | Delete::builder() 199 | .set_objects(Some(objects)) 200 | .build() 201 | .unwrap(), // unwrap: shouldn't fail building 202 | // because all the input comes directly from S3 203 | )) 204 | .send() 205 | .await 206 | .map_err(|e| e.into()) 207 | }, 208 | n_objects, 209 | )) 210 | } 211 | }, 212 | |_, size| size, 213 | n_retries, 214 | timeout.clone(), 215 | ) 216 | .await?; 217 | timeout.lock().await.update(&report); 218 | delete_progress2(report).await; 219 | Ok(()) 220 | } 221 | }) 222 | } 223 | 224 | /// Flatten into a stream of Objects. 225 | pub fn flatten(self) -> impl Stream> { 226 | self.stream 227 | .try_filter_map(|response| ok(response.contents)) 228 | .map_ok(|x| stream::iter(x).map(Ok)) 229 | .try_flatten() 230 | } 231 | 232 | /* 233 | /// This function exists to provide a stream to copy all objects, for both `copy_all` and 234 | /// `move_all`. The `String` that is the stream's `Item` is the _source key_. An `Ok` value 235 | /// thus signals (relevant when used in `move_all`) that a certain key is ready for deletion. 
236 | fn copy_all_stream( 237 | self, 238 | dest_bucket: Option, 239 | mapping: F, 240 | default_request: R, 241 | ) -> impl Stream> 242 | where 243 | F: Fn(&str) -> String + Clone + Send + Sync + Unpin + 'static, 244 | R: Fn() -> CopyObjectRequest + Clone + Unpin + Sync + Send + 'static, 245 | { 246 | let ListObjects { 247 | s3, 248 | config, 249 | bucket, 250 | stream, 251 | prefix: _, 252 | } = self; 253 | let timeout = Arc::new(Mutex::new(TimeoutState::new( 254 | config.algorithm.clone(), 255 | config.put_requests.clone(), 256 | ))); 257 | let n_retries = config.algorithm.n_retries; 258 | let dest_bucket = dest_bucket.unwrap_or_else(|| bucket.clone()); 259 | stream 260 | .try_filter_map(|response| ok(response.1.contents)) 261 | .map_ok(|x| stream::iter(x).map(Ok)) 262 | .try_flatten() 263 | .try_filter_map(|obj| { 264 | // Just filter out any object that does not have both of `key` and `size` 265 | let Object { key, size, .. } = obj; 266 | ok(key.and_then(|key| size.map(|size| (key, size)))) 267 | }) 268 | .and_then(move |(key, size)| { 269 | let (s3, timeout) = (s3.clone(), timeout.clone()); 270 | let request = CopyObjectRequest { 271 | copy_source: format!("{}/{}", bucket, key), 272 | bucket: dest_bucket.clone(), 273 | key: mapping(&key), 274 | ..default_request() 275 | }; 276 | // println!("COPY REQUEST\n{:#?}", request); 277 | s3_request( 278 | move || { 279 | let (s3, request) = (s3.clone(), request.clone()); 280 | async move { 281 | let (s3, request) = (s3.clone(), request.clone()); 282 | Ok((async move{s3.copy_object(request).context(err::CopyObject).await}, size as usize)) 283 | } 284 | }, 285 | |_, size| size, 286 | n_retries, 287 | timeout, 288 | ) 289 | .map_ok(|_| key) 290 | }) 291 | } 292 | 293 | /// Copy all listed objects, to a different S3 location as defined in `mapping` and 294 | /// `dest_bucket`. 295 | /// If `other_bucket` is not provided, copy to same bucket 296 | pub fn copy_all( 297 | self, 298 | dest_bucket: Option, 299 | mapping: F, 300 | default_request: R, 301 | ) -> impl Future> 302 | where 303 | F: Fn(&str) -> String + Clone + Send + Sync + Unpin + 'static, 304 | R: Fn() -> CopyObjectRequest + Clone + Unpin + Sync + Send + 'static, 305 | { 306 | self.copy_all_stream(dest_bucket, mapping, default_request) 307 | .try_for_each(|_| async { Ok(()) }) 308 | } 309 | // TODO: Is it possible to change copy_all so that we can move_all by just chaining copy_all 310 | // and delete_all? Then copy_all would need to return a stream of old keys, but does that make 311 | // sense in general? 312 | // For now, this is code duplication. 
313 | pub fn move_all( 314 | self, 315 | dest_bucket: Option, 316 | mapping: F, 317 | default_request: R, 318 | ) -> impl Future> 319 | where 320 | F: Fn(&str) -> String + Clone + Send + Sync + Unpin + 'static, 321 | R: Fn() -> CopyObjectRequest + Clone + Unpin + Sync + Send + 'static, 322 | { 323 | let src_bucket = self.bucket.clone(); 324 | let timeout = Arc::new(Mutex::new(TimeoutState::new( 325 | self.config.algorithm.clone(), 326 | self.config.delete_requests.clone(), 327 | ))); 328 | let n_retries = self.config.algorithm.n_retries; 329 | let s3 = self.s3.clone(); 330 | self.copy_all_stream(dest_bucket, mapping, default_request) 331 | .and_then(move |src_key| { 332 | let delete_request = DeleteObjectRequest { 333 | bucket: src_bucket.clone(), 334 | key: src_key, 335 | ..Default::default() 336 | }; 337 | let (s3, timeout) = (s3.clone(), timeout.clone()); 338 | s3_request( 339 | move || { 340 | let (s3, delete_request) = (s3.clone(), delete_request.clone()); 341 | async move { 342 | let (s3, delete_request) = (s3.clone(), delete_request.clone()); 343 | Ok(( 344 | async move { 345 | s3.delete_object(delete_request) 346 | .context(err::DeleteObject) 347 | .await 348 | }, 349 | 1, 350 | )) 351 | } 352 | }, 353 | |_, _| 1, 354 | n_retries, 355 | timeout, 356 | ) 357 | .map_ok(drop) 358 | .boxed() 359 | }) 360 | .try_for_each(|_| async { Ok(()) }) 361 | .boxed() 362 | } 363 | /// Move all listed objects by substituting their common prefix with `new_prefix`. 364 | pub fn move_to_prefix( 365 | self, 366 | dest_bucket: Option, 367 | new_prefix: String, 368 | default_request: R, 369 | ) -> impl Future> 370 | where 371 | R: Fn() -> CopyObjectRequest + Clone + Unpin + Sync + Send + 'static, 372 | { 373 | let old_prefix = self.prefix.clone(); 374 | let substitute_prefix = 375 | move |source: &str| format!("{}{}", new_prefix, source.trim_start_matches(&old_prefix)); 376 | self.move_all(dest_bucket, substitute_prefix, default_request) 377 | .boxed() 378 | } 379 | */ 380 | } 381 | 382 | impl Stream for ListObjects 383 | where 384 | S: Stream> + Sized + Send + Unpin, 385 | { 386 | type Item = Result; 387 | fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context) -> Poll> { 388 | Pin::new(&mut self.stream).poll_next(cx) 389 | } 390 | } 391 | 392 | impl S3Algo { 393 | /// List objects of a bucket. 394 | pub fn list_prefix( 395 | &self, 396 | bucket: String, 397 | prefix: Option, 398 | ) -> ListObjects> + Sized + Send> { 399 | // TODO: Reintroduce retry and timeout 400 | 401 | let stream = self 402 | .s3 403 | .list_objects_v2() 404 | .bucket(bucket.clone()) 405 | .set_prefix(prefix) 406 | .into_paginator() 407 | .send(); 408 | let stream = PaginationStreamExt::into_stream_03x(stream) 409 | // Turn into a stream of Objects 410 | .map_err(|source| Error::ListObjectsV2 { source }); 411 | 412 | ListObjects { 413 | s3: self.s3.clone(), 414 | config: self.config.clone(), 415 | stream, 416 | bucket, 417 | prefix: String::new(), 418 | } 419 | } 420 | } 421 | 422 | #[cfg(test)] 423 | mod test { 424 | use super::*; 425 | use crate::test::rand_string; 426 | use std::sync::atomic::{AtomicUsize, Ordering}; 427 | #[tokio::test] 428 | async fn test_s3_delete_files_progress() { 429 | // Minio does paging at 10'000 fles, so we need more than that. 430 | // It means this test will take a minutes or two. 
431 | let algo = S3Algo::new(testing_sdk_client().await); 432 | let dir = rand_string(14); 433 | let dir2 = dir.clone(); 434 | const N_FILES: usize = 11_000; 435 | let files = (0..N_FILES).map(move |i| ObjectSource::Data { 436 | data: vec![1, 2, 3], 437 | key: format!("{}/{}.file", dir2, i), 438 | }); 439 | algo.upload_files( 440 | "test-bucket".into(), 441 | files, 442 | |result| async move { 443 | if result.seq % 100 == 0 { 444 | println!("{} files uploaded", result.seq); 445 | } 446 | }, 447 | |client| client.put_object(), 448 | ) 449 | .await 450 | .unwrap(); 451 | 452 | let listed_files = Arc::new(AtomicUsize::new(0)); 453 | let deleted_files = Arc::new(AtomicUsize::new(0)); 454 | let listed_files2 = listed_files.clone(); 455 | let deleted_files2 = deleted_files.clone(); 456 | 457 | // Do one listing only to check the exact file names 458 | let present = Arc::new(Mutex::new(std::collections::HashSet::new())); 459 | algo.list_prefix("test-bucket".into(), Some(dir.clone())) 460 | .process(|object| async { 461 | let name = object.key.unwrap_or_else(|| "NONE".to_string()); 462 | println!("OBJ {}", name); 463 | present.lock().await.insert(name); 464 | }) 465 | .await 466 | .unwrap(); 467 | let mut present = present.lock().await; 468 | 469 | // All files are present 470 | for i in 0..N_FILES { 471 | let file_name = &format!("{}/{}.file", dir, i); 472 | assert!(present.remove(file_name)); 473 | } 474 | 475 | // No unexpected filesnames. 476 | // Because once, it listed 11_200 files instead of 11_000 477 | if !present.is_empty() { 478 | println!("Left-over object names: {:?}", present); 479 | panic!("Not empty ({} files)", present.len()); 480 | } 481 | 482 | // Assert that number of files is N_FILES 483 | let count = algo 484 | .list_prefix("test-bucket".into(), Some(dir.clone())) 485 | .flatten() 486 | .try_fold(0usize, |acc, _| ok(acc + 1)) 487 | .await 488 | .unwrap(); 489 | assert_eq!(count, N_FILES); 490 | 491 | // Delete all 492 | algo.list_prefix("test-bucket".into(), Some(dir.clone())) 493 | .delete_all( 494 | move |n| { 495 | println!("Listed {} items", n); 496 | let listed_files = listed_files2.clone(); 497 | async move { 498 | listed_files.fetch_add(n, Ordering::Relaxed); 499 | } 500 | }, 501 | move |del_rep| { 502 | let n = del_rep.size as usize; 503 | println!("Deleted {} items", n); 504 | let deleted_files = deleted_files2.clone(); 505 | async move { 506 | deleted_files.fetch_add(n, Ordering::Relaxed); 507 | } 508 | }, 509 | ) 510 | .await 511 | .unwrap(); 512 | 513 | // Assert number of objects listed and deleted 514 | assert_eq!(listed_files.load(Ordering::Relaxed), N_FILES); 515 | assert_eq!(deleted_files.load(Ordering::Relaxed), N_FILES); 516 | 517 | // Assert that number of files is 0 518 | let count = algo 519 | .list_prefix("test-bucket".into(), Some(dir)) 520 | .flatten() 521 | .try_fold(0usize, |acc, _| ok(acc + 1)) 522 | .await 523 | .unwrap(); 524 | 525 | assert_eq!(count, 0); 526 | } 527 | } 528 | --------------------------------------------------------------------------------